In [1]:
# Install the hockey web scraper
#!pip install hockey_scraper

In [2]:
#import hockey_scraper

In [3]:
#full_df = hockey_scraper.scrape_seasons([2020, 2021, 2022], True, data_format='Pandas')

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, shutil              # For navigating files
from google.colab import drive # For mounting google drive

# Data Import

In [2]:
# Mount Google Drive at /content/gdrive so the zipped shot data files stored
# there can be read from this notebook
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Un-zip all the data files into Google Drive
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2019.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2020.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2021.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2022.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2023.zip

Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2019.zip
  inflating: shots_2019.csv          
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2020.zip
  inflating: shots_2020.csv          
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2021.zip
  inflating: shots_2021.csv          
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2022.zip
  inflating: shots_2022.csv          
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2023.zip
  inflating: shots_2023.csv          


In [4]:
# Settings used to locate the per-season shot CSV files
CSV_BASE_NAME = 'shots_'    # file name prefix; the season is appended to it
DATA_LOCATION = '/content'  # directory the season CSVs are read from
SELECT_SEASONS = [
  '2019',
  '2020',
  '2021',
  '2022',
  '2023',
]

In [5]:
# Full path to each selected season's shot CSV data file
season_csv_paths = [
  os.path.join(DATA_LOCATION, f'{CSV_BASE_NAME}{season}.csv')
  for season in SELECT_SEASONS
]

# One dataframe per season, in the same order as SELECT_SEASONS
season_dfs = [pd.read_csv(csv_path) for csv_path in season_csv_paths]

# Stack the per-season dataframes into a single one with a fresh row index
shot_df = pd.concat(season_dfs, ignore_index=True)

# Data Cleaning

In [6]:
# Summary statistics of the combined shot data.
# NOTE(review): the recorded output reports a count of 0 for several columns
# (e.g. penaltyLength, playoffGame, wentToShootout), i.e. they hold no non-null
# values here — confirm whether that is expected.
shot_df.describe()

Unnamed: 0,shotID,arenaAdjustedShotDistance,arenaAdjustedXCord,arenaAdjustedXCordABS,arenaAdjustedYCord,arenaAdjustedYCordAbs,averageRestDifference,awayEmptyNet,awayPenalty1Length,awayPenalty1TimeLeft,...,penaltyLength,playoffGame,roadTeamCode,roadTeamScore,shotGoalProbability,shotPlayContinued,timeBetweenEvents,timeLeft,wentToOT,wentToShootout
count,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,53489.831281,34.854274,-0.447524,60.108505,-0.050768,15.98368,-2.607372,0.013615,13.737586,6.486518,...,,,,,,,,,,
std,32236.669789,19.387778,63.065518,19.089801,19.754164,11.608234,12.616024,0.115885,40.431918,22.714443,...,,,,,,,,,,
min,0.0,1.0,-99.0,0.0,-52.0,0.0,-172.6,0.0,0.0,0.0,...,,,,,,,,,,
25%,26046.0,18.0,-63.0,46.0,-15.0,5.0,-6.6,0.0,0.0,0.0,...,,,,,,,,,,
50%,52231.0,34.0,-2.0,63.0,0.0,15.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
75%,78416.0,49.091751,62.0,76.0,14.0,25.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
max,121923.0,99.0,100.0,100.0,54.0,54.0,126.4,1.0,600.0,595.0,...,,,,,,,,,,


In [None]:
# Print the name of every column in the dataframe, one per line
cols = shot_df.columns
for column_name in cols:
  print(column_name)

In [16]:
# List out all the columns that we wish to analyze moving forward
cols_to_keep = ['arenaAdjustedShotDistance', 'arenaAdjustedXCordABS', 'arenaAdjustedYCordAbs', 'awayEmptyNet', 'awaySkatersOnIce', 'awayTeamCode', 'event', 'homeEmptyNet', 'homeSkatersOnIce', 'homeTeamCode', 'isHomeTeam', 'lastEventCategory', 'shootingTeamAverageTimeOnIce', 'shotAngleAdjusted', 'shotAngleReboundRoyalRoad', 'shotRebound', 'shotRush', 'shotType', 'speedFromLastEvent', 'timeSinceLastEvent']

In [25]:
# Report any requested column that is not present in the dataframe, so a
# missing or already-removed column is noticed instead of silently ignored
missing_cols = [col for col in cols_to_keep if col not in shot_df.columns]
if missing_cols:
  print('Columns in cols_to_keep not found in shot_df:', missing_cols)

# Remove all other, nonessential columns in a single call that returns a new
# dataframe (instead of one in-place drop per column); the remaining columns
# keep their original order
nonessential_cols = [col for col in shot_df.columns if col not in cols_to_keep]
shot_df = shot_df.drop(columns=nonessential_cols)

In [33]:
# Keep only shots from games in which the CO Avalanche are the away or the
# home team; rows where neither team code is 'COL' are dropped in place
avs_are_away = shot_df['awayTeamCode'] == 'COL'
avs_are_home = shot_df['homeTeamCode'] == 'COL'
index_non_avs = shot_df.index[~(avs_are_away | avs_are_home)]
shot_df.drop(index_non_avs, inplace=True)

# Preview the first rows of the filtered data
shot_df.head(15)

Unnamed: 0,arenaAdjustedShotDistance,arenaAdjustedXCordABS,arenaAdjustedYCordAbs,awayEmptyNet,awaySkatersOnIce,awayTeamCode,event,homeEmptyNet,homeSkatersOnIce,homeTeamCode,isHomeTeam,lastEventCategory,shotAngleAdjusted,shotAngleReboundRoyalRoad,shotRebound,shotRush,timeSinceLastEvent
884,30.0,81.0,29.0,0,4,CGY,SHOT,0,5,COL,1.0,FAC,74.931512,0,0,0,24
885,6.324555,83.0,2.0,0,4,CGY,MISS,0,5,COL,1.0,SHOT,18.434949,1,0,0,15
886,36.0,59.0,19.0,0,4,CGY,MISS,0,5,COL,1.0,HIT,32.735226,0,0,0,14
887,40.0,56.0,23.0,0,5,CGY,SHOT,0,5,COL,1.0,BLOCK,35.362462,0,0,0,13
888,5.385165,87.0,5.0,0,5,CGY,SHOT,0,5,COL,1.0,SHOT,68.198591,1,0,0,10
889,62.0,36.0,32.0,0,5,CGY,MISS,0,5,COL,1.0,FAC,31.429566,0,0,0,8
890,22.0,88.0,22.0,0,5,CGY,SHOT,0,5,COL,1.0,HIT,87.273689,0,0,0,8
891,33.0,84.0,33.0,0,5,CGY,MISS,1,6,COL,1.0,SHOT,80.537678,0,0,0,4
892,15.0,85.0,14.0,0,4,CGY,MISS,0,5,COL,1.0,HIT,72.897271,0,0,0,2
893,28.0,62.0,7.0,0,4,CGY,GOAL,0,5,COL,1.0,MISS,14.036243,0,0,0,81


In [34]:
# Number of rows currently in shot_df
len(shot_df)

35782

In [None]:
# TO DO list:
# Remove empty-net shots from df
# Remove shots taken by the team opposing the Avs
# Remove shots that were not on goal? Or keep them in? If they are kept, 'event' needs to be recoded to 0 (no goal) and 1 (goal)