In [1]:
# Install the hockey web scraper
#!pip install hockey_scraper

In [2]:
#import hockey_scraper

In [3]:
#full_df = hockey_scraper.scrape_seasons([2020, 2021, 2022], True, data_format='Pandas')

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, shutil              # For navigating files
from google.colab import drive # For mounting google drive

# Data Import

In [2]:
# Mount Google Drive at /content/gdrive so the zipped shot data files stored there can be accessed
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Un-zip all the data files into Google Drive
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2019.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2020.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2021.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2022.zip
!unzip gdrive/MyDrive/MSDS696_Practicum_II/shots_2023.zip

Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2019.zip
  inflating: shots_2019.csv          
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2020.zip
  inflating: shots_2020.csv          
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2021.zip
  inflating: shots_2021.csv          
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2022.zip
  inflating: shots_2022.csv          
Archive:  gdrive/MyDrive/MSDS696_Practicum_II/shots_2023.zip
  inflating: shots_2023.csv          


In [4]:
# Notebook-wide settings used to locate the per-season shot CSV files
CSV_BASE_NAME = 'shots_'    # file-name prefix shared by every season's CSV
DATA_LOCATION = '/content'  # directory the CSV files are read from
SELECT_SEASONS = [str(year) for year in range(2019, 2024)]  # '2019' through '2023'

In [5]:
# Full path to each selected season's shot CSV: DATA_LOCATION/<CSV_BASE_NAME><season>.csv
season_csv_paths = [
    os.path.join(DATA_LOCATION, CSV_BASE_NAME + season + '.csv')
    for season in SELECT_SEASONS
]

# One dataframe per season, in the same order as SELECT_SEASONS
season_dfs = [pd.read_csv(csv_path) for csv_path in season_csv_paths]

# Stack all seasons into a single dataframe with a fresh, continuous index
shot_df = pd.concat(season_dfs, ignore_index=True)

# Data Cleaning

In [6]:
# Display summary statistics for the combined shot dataframe
shot_df.describe()

Unnamed: 0,shotID,arenaAdjustedShotDistance,arenaAdjustedXCord,arenaAdjustedXCordABS,arenaAdjustedYCord,arenaAdjustedYCordAbs,averageRestDifference,awayEmptyNet,awayPenalty1Length,awayPenalty1TimeLeft,...,penaltyLength,playoffGame,roadTeamCode,roadTeamScore,shotGoalProbability,shotPlayContinued,timeBetweenEvents,timeLeft,wentToOT,wentToShootout
count,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,53489.831281,34.854274,-0.447524,60.108505,-0.050768,15.98368,-2.607372,0.013615,13.737586,6.486518,...,,,,,,,,,,
std,32236.669789,19.387778,63.065518,19.089801,19.754164,11.608234,12.616024,0.115885,40.431918,22.714443,...,,,,,,,,,,
min,0.0,1.0,-99.0,0.0,-52.0,0.0,-172.6,0.0,0.0,0.0,...,,,,,,,,,,
25%,26046.0,18.0,-63.0,46.0,-15.0,5.0,-6.6,0.0,0.0,0.0,...,,,,,,,,,,
50%,52231.0,34.0,-2.0,63.0,0.0,15.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
75%,78416.0,49.091751,62.0,76.0,14.0,25.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
max,121923.0,99.0,100.0,100.0,54.0,54.0,126.4,1.0,600.0,595.0,...,,,,,,,,,,


In [None]:
# Print the name of every column in shot_df, one per line
cols = shot_df.columns
for column_name in cols:
  print(column_name)

In [16]:
# Columns retained for the analysis; every other column is treated as nonessential
cols_to_keep = [
    'arenaAdjustedShotDistance',
    'arenaAdjustedXCordABS',
    'arenaAdjustedYCordAbs',
    'awayEmptyNet',
    'awaySkatersOnIce',
    'awayTeamCode',
    'event',
    'homeEmptyNet',
    'homeSkatersOnIce',
    'homeTeamCode',
    'isHomeTeam',
    'lastEventCategory',
    'shotAngleAdjusted',
    'shotAngleReboundRoyalRoad',
    'shotRebound',
    'shotRush',
    'timeSinceLastEvent',
]

In [25]:
# Remove every column that is not listed in cols_to_keep.
# The unwanted labels are collected first and dropped in a single call, so the
# dataframe is rebuilt once instead of once per dropped column.
# Re-running the cell is harmless: the list of columns to drop is then empty.
cols_to_drop = [col for col in shot_df.columns if col not in cols_to_keep]
shot_df = shot_df.drop(columns=cols_to_drop)

In [None]:
# Identify all shots from games that do not involve the CO Avalanche.
# A shot is from a non-Avalanche game when neither the home team code nor the
# away team code is the Avalanche's. Only the row index is collected here; the
# rows themselves are not removed by this cell.
# NOTE(review): 'COL' is assumed to be the Avalanche's code in this data set —
# confirm against shot_df['homeTeamCode'].unique() before relying on the result.
AVS_TEAM_CODE = 'COL'
is_non_avs_game = (shot_df['homeTeamCode'] != AVS_TEAM_CODE) & (shot_df['awayTeamCode'] != AVS_TEAM_CODE)
index_non_avs = shot_df[is_non_avs_game].index

In [30]:
# Preview the first rows of shot_df
shot_df.head()

Unnamed: 0,arenaAdjustedShotDistance,arenaAdjustedXCordABS,arenaAdjustedYCordAbs,awayEmptyNet,awaySkatersOnIce,homeEmptyNet,homeSkatersOnIce,isHomeTeam,shotAngleAdjusted,shotAngleReboundRoyalRoad,shotRebound,shotRush,timeSinceLastEvent
count,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0,523699.0
mean,34.854274,60.108505,15.98368,0.013615,4.886062,0.013531,4.894218,0.513087,31.391882,0.097168,0.061652,0.001846,15.572814
std,19.387778,19.089801,11.608234,0.115885,0.39641,0.115532,0.386012,0.499829,20.793601,0.296187,0.240522,0.042931,15.088564
min,1.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,46.0,5.0,0.0,5.0,0.0,5.0,0.0,15.255119,0.0,0.0,0.0,5.0
50%,34.0,63.0,15.0,0.0,5.0,0.0,5.0,1.0,29.357754,0.0,0.0,0.0,11.0
75%,49.091751,76.0,25.0,0.0,5.0,0.0,5.0,1.0,45.0,0.0,0.0,0.0,21.0
max,99.0,100.0,54.0,1.0,10.0,1.0,8.0,1.0,88.451842,1.0,1.0,1.0,234.0
