# Data Prep
This notebook is designed to get the 3 large datasets small enough to live in a git repository without needing git LFS.

**IF YOU ARE FOLLOWING ALONG, YOU DON'T NEED TO RUN THIS. THIS IS WHAT PRODUCES THE DATA IN THE `/Data` DIRECTORY**

The approach taken is to omit any data that did not occur in 2018 from the data files. 2018 was chosen to be after the addition of the Golden Knights and during a time in which we had `game_penalties.csv` data for that year (2018 is the last year of availability for that and Vegas came about in 2017)

## Download Data for Prep
We want to start with the full dataset which I've zipped up and stored in Azure Blob Storage.

In [20]:
import os

extract_directory = "../DataPrep"

# ensure the extract directory exists
if not os.path.exists(extract_directory):
    os.makedirs(extract_directory)

In [21]:
import requests

# Download the zip file
url = "https://longtermpublic.blob.core.windows.net/data/NHLData.zip"
print("Downloading data file...")
response = requests.get(url)

# Save the zip file
print("Saving data zip file...")
zip_file_path = extract_directory + "/NHLData.zip"
with open(zip_file_path, "wb") as file:
    file.write(response.content)

print("Downloaded complete.")

Downloading data file...
Saving data zip file...
Downloaded complete.


In [22]:
import zipfile

print('Extracting data files...')

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_directory)

print('Done extracting files')

Extracting data files...
Done extracting files


## Data Downsizing

In [23]:
import pandas as pd
import numpy as np

data_directory = '../Data'
year = 2018

In [24]:
# Lets start with game since it has the date
dfGame = pd.read_csv(extract_directory + "/game.csv")

# Add a year column
dfGame['date_time_GMT'] = pd.to_datetime(dfGame['date_time_GMT'])
dfGame['year'] = dfGame['date_time_GMT'].dt.year

dfGame = dfGame[dfGame['year'] == year]

# Save the game data
dfGame.to_csv(data_directory + "/game.csv", index=False)

In [25]:
# Let's go to game_plays next since it's the largest
dfGamePlays = pd.read_csv(extract_directory + "/game_plays.csv")
dfGamePlays['year'] = dfGamePlays['game_id'].astype(str).str[:4].astype(int)
dfGamePlays = dfGamePlays[dfGamePlays['year'] == year]
dfGamePlays.drop(columns=['year'], inplace=True)
dfGamePlays.to_csv(data_directory + "/game_plays.csv", index=False)

In [26]:
# Now we'll hit game_shifts
dfGameShifts = pd.read_csv(extract_directory + "/game_shifts.csv")
dfGameShifts['year'] = dfGameShifts['game_id'].astype(str).str[:4].astype(int)
dfGameShifts = dfGameShifts[dfGameShifts['year'] == year]
dfGameShifts.drop(columns=['year'], inplace=True)
dfGameShifts.to_csv(data_directory + "/game_shifts.csv", index=False)

In [27]:
# Next game plays players
dfGamePlaysPlayers = pd.read_csv(extract_directory + "/game_plays_players.csv")
dfGamePlaysPlayers['year'] = dfGamePlaysPlayers['game_id'].astype(str).str[:4].astype(int)
dfGamePlaysPlayers = dfGamePlaysPlayers[dfGamePlaysPlayers['year'] == year]
dfGamePlaysPlayers.drop(columns=['year'], inplace=True)

dfGamePlaysPlayers.to_csv(data_directory + "/game_plays_players.csv", index=False)

In [28]:
# Game Skater Stats
dfGameSkaterStats = pd.read_csv(extract_directory + "/game_skater_stats.csv")
dfGameSkaterStats['year'] = dfGameSkaterStats['game_id'].astype(str).str[:4].astype(int)
dfGameSkaterStats = dfGameSkaterStats[dfGameSkaterStats['year'] == year]
dfGameSkaterStats.drop(columns=['year'], inplace=True)
dfGameSkaterStats.to_csv(data_directory + "/game_skater_stats.csv", index=False)

In [29]:
# Game penalties
dfGamePenalties = pd.read_csv(extract_directory + "/game_penalties.csv")
dfGamePenalties['year'] = dfGamePenalties['play_id'].astype(str).str[:4].astype(int)
dfGamePenalties = dfGamePenalties[dfGamePenalties['year'] == year]
dfGamePenalties.drop(columns=['year'], inplace=True)
dfGamePenalties.to_csv(data_directory + "/game_penalties.csv", index=False)

In [30]:
# Game goals
dfGameGoals = pd.read_csv(extract_directory + "/game_goals.csv")
dfGameGoals['year'] = dfGameGoals['play_id'].astype(str).str[:4].astype(int)
dfGameGoals = dfGameGoals[dfGameGoals['year'] == year]
dfGameGoals.drop(columns=['year'], inplace=True)
dfGameGoals.to_csv(data_directory + "/game_goals.csv", index=False)

  dfGameGoals = pd.read_csv(extract_directory + "/game_goals.csv")


In [31]:
# Game Scratches
dfGameScratches = pd.read_csv(extract_directory + "/game_scratches.csv")
dfGameScratches['year'] = dfGameScratches['game_id'].astype(str).str[:4].astype(int)
dfGameScratches = dfGameScratches[dfGameScratches['year'] == year]
dfGameScratches.drop(columns=['year'], inplace=True)
dfGameScratches.to_csv(data_directory + "/game_scratches.csv", index=False)

In [32]:
# Player Info
dfPlayerInfo = pd.read_csv(extract_directory + "/player_info.csv")
dfPlayerInfo.to_csv(data_directory + "/player_info.csv", index=False)

In [33]:
# Team Info
dfTeamInfo = pd.read_csv(extract_directory + "/team_info.csv")
dfTeamInfo.to_csv(data_directory + "/team_info.csv", index=False)

In [34]:
# Game Officials
dfGameOfficials = pd.read_csv(extract_directory + "/game_officials.csv")
dfGameOfficials['year'] = dfGameOfficials['game_id'].astype(str).str[:4].astype(int)
dfGameOfficials = dfGameOfficials[dfGameOfficials['year'] == year]
dfGameOfficials.drop(columns=['year'], inplace=True)
dfGameOfficials.to_csv(data_directory + "/game_officials.csv", index=False)

In [35]:
# Game Team Stats
dfGameTeamStats = pd.read_csv(extract_directory + "/game_teams_stats.csv")
dfGameTeamStats['year'] = dfGameTeamStats['game_id'].astype(str).str[:4].astype(int)
dfGameTeamStats = dfGameTeamStats[dfGameTeamStats['year'] == year]
dfGameTeamStats.drop(columns=['year'], inplace=True)
dfGameTeamStats.to_csv(data_directory + "/game_teams_stats.csv", index=False)

In [36]:
# Game Goalie Stats
dfGameGoalieStats = pd.read_csv(extract_directory + "/game_goalie_stats.csv")
dfGameGoalieStats['year'] = dfGameGoalieStats['game_id'].astype(str).str[:4].astype(int)
dfGameGoalieStats = dfGameGoalieStats[dfGameGoalieStats['year'] == year]
dfGameGoalieStats.drop(columns=['year'], inplace=True)
dfGameGoalieStats.to_csv(data_directory + "/game_goalie_stats.csv", index=False)

In [37]:
# Game Skater Stats
dfGameSkaterStats = pd.read_csv(extract_directory + "/game_skater_stats.csv")
dfGameSkaterStats['year'] = dfGameSkaterStats['game_id'].astype(str).str[:4].astype(int)
dfGameSkaterStats = dfGameSkaterStats[dfGameSkaterStats['year'] == year]
dfGameSkaterStats.drop(columns=['year'], inplace=True)
dfGameSkaterStats.to_csv(data_directory + "/game_skater_stats.csv", index=False)