In [1]:
# Install the hockey web scraper
#!pip install hockey_scraper

In [2]:
#import hockey_scraper

In [3]:
#full_df = hockey_scraper.scrape_seasons([2020, 2021, 2022], True, data_format='Pandas')

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, shutil              # For navigating files
from google.colab import drive # For mounting google drive

# Data Import

In [5]:
# Mount Google Drive at /content/gdrive to access the shot data .zip files
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
# Un-zip all the data files into Google Drive
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2019.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2020.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2021.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2022.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2023.zip

Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2019.zip
replace shots_2019.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2020.zip
replace shots_2020.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2021.zip
replace shots_2021.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2022.zip
replace shots_2022.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2023.zip
replace shots_2023.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [7]:
# Notebook-wide configuration
# Seasons to load, as the strings that appear in the CSV file names
SELECT_SEASONS = [str(year) for year in range(2019, 2024)]
# Directory that holds the extracted shot CSV files
DATA_LOCATION = '/content'
# File-name prefix shared by every season's CSV (e.g. shots_2019.csv)
CSV_BASE_NAME = 'shots_'

In [30]:
# Read each selected season's shot CSV (DATA_LOCATION/shots_<season>.csv) into its own dataframe
season_dfs = [
  pd.read_csv(os.path.join(DATA_LOCATION, f'{CSV_BASE_NAME}{season}.csv'))
  for season in SELECT_SEASONS
]

# Stack the per-season dataframes into one, renumbering the rows from 0
shot_df = pd.concat(season_dfs, ignore_index=True)

# Data Cleaning

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
shot_df.describe()

Unnamed: 0,shotID,arenaAdjustedShotDistance,arenaAdjustedXCord,arenaAdjustedXCordABS,arenaAdjustedYCord,arenaAdjustedYCordAbs,averageRestDifference,awayEmptyNet,awayPenalty1Length,awayPenalty1TimeLeft,...,penaltyLength,playoffGame,roadTeamCode,roadTeamScore,shotGoalProbability,shotPlayContinued,timeBetweenEvents,timeLeft,wentToOT,wentToShootout
count,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,53489.831281,34.854274,-0.447524,60.108505,-0.050768,15.98368,-2.607372,0.013615,13.737586,6.486518,...,,,,,,,,,,
std,32236.669789,19.387778,63.065518,19.089801,19.754164,11.608234,12.616024,0.115885,40.431918,22.714443,...,,,,,,,,,,
min,0.0,1.0,-99.0,0.0,-52.0,0.0,-172.6,0.0,0.0,0.0,...,,,,,,,,,,
25%,26046.0,18.0,-63.0,46.0,-15.0,5.0,-6.6,0.0,0.0,0.0,...,,,,,,,,,,
50%,52231.0,34.0,-2.0,63.0,0.0,15.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
75%,78416.0,49.091751,62.0,76.0,14.0,25.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
max,121923.0,99.0,100.0,100.0,54.0,54.0,126.4,1.0,600.0,595.0,...,,,,,,,,,,


In [31]:
# Print the name of every column, one per line
cols = shot_df.columns
for column_name in cols:
  print(column_name)

shotID
arenaAdjustedShotDistance
arenaAdjustedXCord
arenaAdjustedXCordABS
arenaAdjustedYCord
arenaAdjustedYCordAbs
averageRestDifference
awayEmptyNet
awayPenalty1Length
awayPenalty1TimeLeft
awaySkatersOnIce
awayTeamCode
awayTeamGoals
defendingTeamAverageTimeOnIce
defendingTeamAverageTimeOnIceOfDefencemen
defendingTeamAverageTimeOnIceOfDefencemenSinceFaceoff
defendingTeamAverageTimeOnIceOfForwards
defendingTeamAverageTimeOnIceOfForwardsSinceFaceoff
defendingTeamAverageTimeOnIceSinceFaceoff
defendingTeamDefencemenOnIce
defendingTeamForwardsOnIce
defendingTeamMaxTimeOnIce
defendingTeamMaxTimeOnIceOfDefencemen
defendingTeamMaxTimeOnIceOfDefencemenSinceFaceoff
defendingTeamMaxTimeOnIceOfForwards
defendingTeamMaxTimeOnIceOfForwardsSinceFaceoff
defendingTeamMaxTimeOnIceSinceFaceoff
defendingTeamMinTimeOnIce
defendingTeamMinTimeOnIceOfDefencemen
defendingTeamMinTimeOnIceOfDefencemenSinceFaceoff
defendingTeamMinTimeOnIceOfForwards
defendingTeamMinTimeOnIceOfForwardsSinceFaceoff
defendingTeamMin

In [32]:
# Columns retained for the analysis; every other column is removed from the dataframe
cols_to_keep = [
  'arenaAdjustedShotDistance',
  'arenaAdjustedXCordABS',
  'arenaAdjustedYCordAbs',
  'awaySkatersOnIce',
  'awayTeamCode',
  'event',
  'homeSkatersOnIce',
  'homeTeamCode',
  'isHomeTeam',
  'lastEventCategory',
  'offWing',
  'shootingTeamAverageTimeOnIce',
  'shotAngleAdjusted',
  'shotAngleReboundRoyalRoad',
  'shotOnEmptyNet',
  'shotRebound',
  'shotRush',
  'shotType',
  'speedFromLastEvent',
  'timeSinceLastEvent',
]

In [33]:
# Remove all other, nonessential columns from the dataframe.
# Collect the unwanted column names first and drop them in a single call, rather than
# issuing one in-place drop per column on the full multi-season dataframe.
cols_to_drop = [col for col in shot_df.columns if col not in cols_to_keep]
shot_df.drop(columns=cols_to_drop, inplace=True)

In [34]:
# Keep only shots from games in which Colorado (COL) is either the away or the home team
involves_avs = shot_df['awayTeamCode'].eq('COL') | shot_df['homeTeamCode'].eq('COL')
index_non_avs = shot_df.index[~involves_avs]
shot_df.drop(index_non_avs, inplace=True)
shot_df.head(15)

Unnamed: 0,arenaAdjustedShotDistance,arenaAdjustedXCordABS,arenaAdjustedYCordAbs,awaySkatersOnIce,awayTeamCode,event,homeSkatersOnIce,homeTeamCode,isHomeTeam,lastEventCategory,offWing,shootingTeamAverageTimeOnIce,shotAngleAdjusted,shotAngleReboundRoyalRoad,shotOnEmptyNet,shotRebound,shotRush,shotType,speedFromLastEvent,timeSinceLastEvent
884,30.0,81.0,29.0,4,CGY,SHOT,5,COL,1.0,FAC,0,30.0,74.931512,0,0,0,0,WRIST,2.072053,24
885,6.324555,83.0,2.0,4,CGY,MISS,5,COL,1.0,SHOT,0,45.0,18.434949,1,0,0,0,TIP,1.867857,15
886,36.0,59.0,19.0,4,CGY,MISS,5,COL,1.0,HIT,0,60.0,32.735226,0,0,0,0,WRIST,2.443233,14
887,40.0,56.0,23.0,5,CGY,SHOT,5,COL,1.0,BLOCK,1,67.0,35.362462,0,0,0,0,WRIST,2.483079,13
888,5.385165,87.0,5.0,5,CGY,SHOT,5,COL,1.0,SHOT,1,77.0,68.198591,1,0,0,0,WRIST,3.962323,10
889,62.0,36.0,32.0,5,CGY,MISS,5,COL,1.0,FAC,0,11.0,31.429566,0,0,0,0,WRIST,4.466892,8
890,22.0,88.0,22.0,5,CGY,SHOT,5,COL,1.0,HIT,1,31.6,87.273689,0,0,0,0,WRIST,23.493683,8
891,33.0,84.0,33.0,5,CGY,MISS,6,COL,1.0,SHOT,0,30.166667,80.537678,0,0,0,0,WRIST,2.573908,4
892,15.0,85.0,14.0,4,CGY,MISS,5,COL,1.0,HIT,1,61.8,72.897271,0,0,0,0,WRIST,10.0,2
893,28.0,62.0,7.0,4,CGY,GOAL,5,COL,1.0,MISS,0,67.0,14.036243,0,0,0,0,TIP,0.2616,81


In [35]:
# Remove empty-net shots from the dataframe, since goal results are skewed when no goalie is present
is_empty_net = shot_df['shotOnEmptyNet'] == 1
index_empty_net = shot_df.index[is_empty_net]
shot_df.drop(index_empty_net, inplace=True)

In [36]:
# Flag Avalanche home games: 1 when Colorado is the home team, 0 otherwise
shot_df['isHomeGame'] = (shot_df['homeTeamCode'] == 'COL').astype(int)

In [16]:
# List the distinct values present in the isHomeTeam column
print(shot_df['isHomeTeam'].unique())

[1. 0.]


In [37]:
# In Avalanche home games, drop the shots that were not taken by the home team (i.e. the opponent's shots)
opponent_shot_at_home = (shot_df['isHomeGame'] == 1) & (shot_df['isHomeTeam'] < 0.5)
index_other_team_shots = shot_df.index[opponent_shot_at_home]
shot_df.drop(index_other_team_shots, inplace=True)

In [38]:
# In Avalanche away games, drop the shots that were taken by the home team (i.e. the opponent's shots)
opponent_shot_on_road = (shot_df['isHomeGame'] == 0) & (shot_df['isHomeTeam'] > 0.5)
index_other_team_shots = shot_df.index[opponent_shot_on_road]
shot_df.drop(index_other_team_shots, inplace=True)

In [39]:
# Preview the first 15 rows of the filtered dataframe
shot_df.head(15)

Unnamed: 0,arenaAdjustedShotDistance,arenaAdjustedXCordABS,arenaAdjustedYCordAbs,awaySkatersOnIce,awayTeamCode,event,homeSkatersOnIce,homeTeamCode,isHomeTeam,lastEventCategory,...,shootingTeamAverageTimeOnIce,shotAngleAdjusted,shotAngleReboundRoyalRoad,shotOnEmptyNet,shotRebound,shotRush,shotType,speedFromLastEvent,timeSinceLastEvent,isHomeGame
884,30.0,81.0,29.0,4,CGY,SHOT,5,COL,1.0,FAC,...,30.0,74.931512,0,0,0,0,WRIST,2.072053,24,1
885,6.324555,83.0,2.0,4,CGY,MISS,5,COL,1.0,SHOT,...,45.0,18.434949,1,0,0,0,TIP,1.867857,15,1
886,36.0,59.0,19.0,4,CGY,MISS,5,COL,1.0,HIT,...,60.0,32.735226,0,0,0,0,WRIST,2.443233,14,1
887,40.0,56.0,23.0,5,CGY,SHOT,5,COL,1.0,BLOCK,...,67.0,35.362462,0,0,0,0,WRIST,2.483079,13,1
888,5.385165,87.0,5.0,5,CGY,SHOT,5,COL,1.0,SHOT,...,77.0,68.198591,1,0,0,0,WRIST,3.962323,10,1
889,62.0,36.0,32.0,5,CGY,MISS,5,COL,1.0,FAC,...,11.0,31.429566,0,0,0,0,WRIST,4.466892,8,1
890,22.0,88.0,22.0,5,CGY,SHOT,5,COL,1.0,HIT,...,31.6,87.273689,0,0,0,0,WRIST,23.493683,8,1
891,33.0,84.0,33.0,5,CGY,MISS,6,COL,1.0,SHOT,...,30.166667,80.537678,0,0,0,0,WRIST,2.573908,4,1
892,15.0,85.0,14.0,4,CGY,MISS,5,COL,1.0,HIT,...,61.8,72.897271,0,0,0,0,WRIST,10.0,2,1
893,28.0,62.0,7.0,4,CGY,GOAL,5,COL,1.0,MISS,...,67.0,14.036243,0,0,0,0,TIP,0.2616,81,1


In [40]:
# Number of rows remaining in the dataframe
shot_df.shape[0]

18852

In [41]:
# Locate missing values in the dataframe: np.where on the null mask returns two arrays,
# the row positions and the column positions (positional, not index labels) of each missing cell
np.where(pd.isnull(shot_df))

(array([ 1332, 11650, 12362, 12564, 12725, 12923, 12935, 13344, 13367,
        13502, 13696, 14344, 14640, 14666, 14774, 15188, 15229, 15251,
        15378, 15405, 15721, 15779, 15811, 16106, 16285, 16816, 16876,
        17094, 17322, 17399, 17587, 17725, 17801, 17976, 18099, 18109,
        18183, 18237, 18399, 18402, 18514, 18785, 18812, 18827]),
 array([17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
        17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
        17, 17, 17, 17, 17, 17, 17, 17, 17, 17]))

In [42]:
# Fill the missing shot types with wrist shots, as they are the most common.
# The filled column is assigned back to the dataframe: calling fillna(..., inplace=True)
# on the Series returned by shot_df['shotType'] is a chained in-place operation, which
# pandas 2.x deprecates and which leaves shot_df unchanged under Copy-on-Write.
shot_df['shotType'] = shot_df['shotType'].fillna('WRIST')

In [43]:
# Verify the fill by listing the distinct shot types: no missing (NaN) entry should remain
print(shot_df['shotType'].unique())

['WRIST' 'TIP' 'SLAP' 'BACK' 'SNAP' 'DEFL' 'WRAP']


In [44]:
# Preview the first 15 rows after filling in the missing shot types
shot_df.head(15)

Unnamed: 0,arenaAdjustedShotDistance,arenaAdjustedXCordABS,arenaAdjustedYCordAbs,awaySkatersOnIce,awayTeamCode,event,homeSkatersOnIce,homeTeamCode,isHomeTeam,lastEventCategory,...,shootingTeamAverageTimeOnIce,shotAngleAdjusted,shotAngleReboundRoyalRoad,shotOnEmptyNet,shotRebound,shotRush,shotType,speedFromLastEvent,timeSinceLastEvent,isHomeGame
884,30.0,81.0,29.0,4,CGY,SHOT,5,COL,1.0,FAC,...,30.0,74.931512,0,0,0,0,WRIST,2.072053,24,1
885,6.324555,83.0,2.0,4,CGY,MISS,5,COL,1.0,SHOT,...,45.0,18.434949,1,0,0,0,TIP,1.867857,15,1
886,36.0,59.0,19.0,4,CGY,MISS,5,COL,1.0,HIT,...,60.0,32.735226,0,0,0,0,WRIST,2.443233,14,1
887,40.0,56.0,23.0,5,CGY,SHOT,5,COL,1.0,BLOCK,...,67.0,35.362462,0,0,0,0,WRIST,2.483079,13,1
888,5.385165,87.0,5.0,5,CGY,SHOT,5,COL,1.0,SHOT,...,77.0,68.198591,1,0,0,0,WRIST,3.962323,10,1
889,62.0,36.0,32.0,5,CGY,MISS,5,COL,1.0,FAC,...,11.0,31.429566,0,0,0,0,WRIST,4.466892,8,1
890,22.0,88.0,22.0,5,CGY,SHOT,5,COL,1.0,HIT,...,31.6,87.273689,0,0,0,0,WRIST,23.493683,8,1
891,33.0,84.0,33.0,5,CGY,MISS,6,COL,1.0,SHOT,...,30.166667,80.537678,0,0,0,0,WRIST,2.573908,4,1
892,15.0,85.0,14.0,4,CGY,MISS,5,COL,1.0,HIT,...,61.8,72.897271,0,0,0,0,WRIST,10.0,2,1
893,28.0,62.0,7.0,4,CGY,GOAL,5,COL,1.0,MISS,...,67.0,14.036243,0,0,0,0,TIP,0.2616,81,1


In [49]:
# List the distinct values present in the lastEventCategory column
print(shot_df['lastEventCategory'].unique())

['FAC' 'SHOT' 'HIT' 'BLOCK' 'MISS' 'GIVE' 'TAKE' 'DELPEN']


In [46]:
# Build the label column: 1 when the shot's event is a GOAL, 0 for any other event
shot_df['isGoal'] = (shot_df['event'] == 'GOAL').astype(int)

In [47]:
# Remove features that are no longer necessary: 'event' has been replaced by the isGoal
# label and empty-net shots have already been filtered out.
# Both columns go in one call so the dataframe is never left with only one of them removed.
shot_df.drop(columns=['event', 'shotOnEmptyNet'], inplace=True)

In [85]:
# Encode the manpower situation of each shot as four mutually exclusive 0/1 flags,
# from the Avalanche's point of view.
#
# The Avs are the home team when isHomeGame == 1 and the away team otherwise, so their
# skater advantage (Avs skaters minus opposing skaters on the ice) is home - away in
# home games and away - home in away games. Computing it once with column arithmetic
# replaces four row-by-row apply() passes and also works on an empty dataframe.
skater_diff = shot_df['homeSkatersOnIce'] - shot_df['awaySkatersOnIce']
avs_skater_advantage = skater_diff.where(shot_df['isHomeGame'] == 1, -skater_diff)

# Short-handed: the Avs have fewer skaters on the ice than the opponent
shot_df['isShortSided'] = (avs_skater_advantage < 0).astype(int)

# Even strength: both teams have the same number of skaters on the ice
shot_df['isEvenStrength'] = (avs_skater_advantage == 0).astype(int)

# Power play: the Avs have exactly one more skater on the ice than the opponent
shot_df['isPowerPlay'] = (avs_skater_advantage == 1).astype(int)

# Extra power play: the Avs have at least two more skaters on the ice than the opponent
shot_df['isExtraPowerPlay'] = (avs_skater_advantage >= 2).astype(int)

In [87]:
# Remove features that are no longer necessary: the skater counts, team codes and
# home-team indicator have been folded into the isHomeGame and manpower flag columns.
# All columns go in one call so the dataframe is never left partially pruned.
shot_df.drop(
  columns=['awaySkatersOnIce', 'homeSkatersOnIce', 'awayTeamCode', 'homeTeamCode', 'isHomeTeam'],
  inplace=True,
)

In [88]:
# Preview the first 15 rows with the engineered feature columns in place
shot_df.head(15)

Unnamed: 0,arenaAdjustedShotDistance,arenaAdjustedXCordABS,arenaAdjustedYCordAbs,lastEventCategory,offWing,shootingTeamAverageTimeOnIce,shotAngleAdjusted,shotAngleReboundRoyalRoad,shotRebound,shotRush,shotType,speedFromLastEvent,timeSinceLastEvent,isHomeGame,isGoal,isShortSided,isEvenStrength,isPowerPlay,isExtraPowerPlay
884,30.0,81.0,29.0,FAC,0,30.0,74.931512,0,0,0,WRIST,2.072053,24,1,0,0,0,1,0
885,6.324555,83.0,2.0,SHOT,0,45.0,18.434949,1,0,0,TIP,1.867857,15,1,0,0,0,1,0
886,36.0,59.0,19.0,HIT,0,60.0,32.735226,0,0,0,WRIST,2.443233,14,1,0,0,0,1,0
887,40.0,56.0,23.0,BLOCK,1,67.0,35.362462,0,0,0,WRIST,2.483079,13,1,0,0,1,0,0
888,5.385165,87.0,5.0,SHOT,1,77.0,68.198591,1,0,0,WRIST,3.962323,10,1,0,0,1,0,0
889,62.0,36.0,32.0,FAC,0,11.0,31.429566,0,0,0,WRIST,4.466892,8,1,0,0,1,0,0
890,22.0,88.0,22.0,HIT,1,31.6,87.273689,0,0,0,WRIST,23.493683,8,1,0,0,1,0,0
891,33.0,84.0,33.0,SHOT,0,30.166667,80.537678,0,0,0,WRIST,2.573908,4,1,0,0,0,1,0
892,15.0,85.0,14.0,HIT,1,61.8,72.897271,0,0,0,WRIST,10.0,2,1,0,0,0,1,0
893,28.0,62.0,7.0,MISS,0,67.0,14.036243,0,0,0,TIP,0.2616,81,1,1,0,0,1,0


In [None]:
# TO DO: Make new feature that indicates if the last event was a shot (missed or on goal)

# TO DO: Remove the last event category feature