## Data retrieval and cleaning

Don't forget to delete your `json_clean` directory if you make any modifications to `cleaning.py`!


In [19]:
from ift6758.data.acquisition import NHLGameData

# Location of the raw-data cache handed to NHLGameData.
data_path_raw = './../../ift6758/data/json_raw/'
nhl_games_data = NHLGameData(data_path_raw)

# One fetch per season, identified by its starting year (2016 through 2020).
season_start_years = (2016, 2017, 2018, 2019, 2020)
for year in season_start_years:
    nhl_games_data.fetch_season(year)

Loading from cache file ./../../ift6758/data/json_raw/2016/2016-regular.pkl
Found 1230 regular games for season 2016-2017
Loading from cache file ./../../ift6758/data/json_raw/2016/2016-playoff.pkl
Found 102 playoff games for season 2016-2017
Loading from cache file ./../../ift6758/data/json_raw/2017/2017-regular.pkl
Found 1271 regular games for season 2017-2018
Loading from cache file ./../../ift6758/data/json_raw/2017/2017-playoff.pkl
Found 105 playoff games for season 2017-2018
Loading from cache file ./../../ift6758/data/json_raw/2018/2018-regular.pkl
Found 1271 regular games for season 2018-2019
Loading from cache file ./../../ift6758/data/json_raw/2018/2018-playoff.pkl
Found 105 playoff games for season 2018-2019
Loading from cache file ./../../ift6758/data/json_raw/2019/2019-regular.pkl
Found 1271 regular games for season 2019-2020
Loading from cache file ./../../ift6758/data/json_raw/2019/2019-playoff.pkl
Found 105 playoff games for season 2019-2020
Loading from cache file ./..

In [20]:
from ift6758.data.cleaning import DataCleaner

# Path handed to DataCleaner as `data_path_clean`; the raw games come from
# the `nhl_games_data` object built in the retrieval cell.
data_path_clean = './../../ift6758/data/json_clean/'
data_cleaner = DataCleaner(data_raw=nhl_games_data, data_path_clean=data_path_clean)

# The same cleaning options are applied to every season.
# NOTE(review): presumably these keep previous-event columns and leave out
# power-play columns — confirm against cleaning.py.
clean_options = dict(keepPreviousEventInfo=True, includePowerPlay=False)
for year in (2016, 2017, 2018, 2019, 2020):
    data_cleaner.clean_season(year, **clean_options)

In [21]:
import pandas as pd

# Retrieve the cleaned data for 2020 from the cleaner; inspected in the next cells.
data_2020 = data_cleaner.get_cleaned_data(2020)

In [22]:
# Preview the first rows of the cleaned 2020 data.
data_2020.head()

Unnamed: 0,game_id,period,period_time,type,team,x,y,shooter,goalie,shot_type,empty_net,strength,opposite_team_side,prev_type,prev_x,prev_y,time_since_prev,distance_from_prev
0,2020020001,1,00:16,SHOT,Philadelphia Flyers,-74.0,29.0,Travis Konecny,Tristan Jarry,Wrist Shot,False,,left,FACEOFF,0.0,0.0,16,79.48
1,2020020001,1,00:34,SHOT,Pittsburgh Penguins,49.0,-25.0,Evan Rodrigues,Carter Hart,Wrist Shot,False,,right,BLOCKED_SHOT,-58.0,-7.0,6,108.5
2,2020020001,1,01:05,SHOT,Philadelphia Flyers,-52.0,-31.0,Joel Farabee,Tristan Jarry,Backhand,False,,left,HIT,-87.0,-33.0,2,35.06
3,2020020001,1,02:51,SHOT,Pittsburgh Penguins,43.0,39.0,Evan Rodrigues,Carter Hart,Snap Shot,False,,right,FACEOFF,69.0,22.0,42,31.06
4,2020020001,1,03:44,SHOT,Philadelphia Flyers,-53.0,8.0,Claude Giroux,Tristan Jarry,Slap Shot,False,,left,GIVEAWAY,-66.0,4.0,1,13.6


The `NaN` values for previous events are expected: some previous events are not of interest to us, so their columns are left empty.

In [23]:
# Count the missing values in each column of the cleaned 2020 data.
data_2020.isna().sum()

game_id                   0
period                    0
period_time               0
type                      0
team                      0
x                         0
y                         0
shooter                   0
goalie                  277
shot_type                 0
empty_net                 0
strength              50044
opposite_team_side        0
prev_type                 0
prev_x                 2822
prev_y                 2822
time_since_prev           0
distance_from_prev     2822
dtype: int64

## Feature engineering

In [24]:
from ift6758.features import FeatureEng
# Re-declared here with the same value as in the cleaning cell.
data_path_clean = './../../ift6758/data/json_clean/'
# NOTE(review): `w` is a terse name for the FeatureEng instance; the next cell
# calls `w.features_2`, so a rename must be done in both places at once.
w = FeatureEng(data_path_clean)

In [25]:
# Build the feature table for the 2016..2020 range.
# NOTE(review): assumes both bounds of `features_2` are inclusive — TODO confirm.
df = w.features_2(2016,2020)
# Fixed seed so the displayed sample is identical on every Restart & Run All;
# without `random_state` the 10 rows change on each execution.
df.sample(10, random_state=42)

Unnamed: 0,game_id,period,game_seconds,x,y,shot_type,prev_type,prev_x,prev_y,time_since_prev,distance_from_prev,distance_goal,prev_distance_goal,angle_shot,prev_angle_shot,bounce,angle_change,speed
288755,2019020929,2,2713,68.0,0.0,Deflected,MISSED_SHOT,63.0,-4.0,16.0,6.4,22.0,27.29,0.0,-8.43,True,8.43,0.4
215022,2018020994,3,4508,-57.0,6.0,Wrist Shot,HIT,-21.0,-38.0,7.0,56.85,33.54,78.77,10.31,-28.84,False,0.0,8.12
88459,2017020215,3,4476,-62.0,-12.0,Wrist Shot,SHOT,43.0,-7.0,16.0,105.12,30.46,133.18,-23.2,-3.01,True,-20.19,6.57
294307,2019021022,3,4527,-44.0,-17.0,Wrist Shot,HIT,-90.0,-31.0,23.0,48.08,49.04,31.0,-20.28,-90.0,False,0.0,2.09
165191,2018020180,1,1728,72.0,-20.0,Snap Shot,HIT,98.0,-21.0,1.0,26.02,26.91,22.47,-48.01,-69.16,False,0.0,26.02
261236,2019020478,3,3710,-56.0,30.0,Wrist Shot,BLOCKED_SHOT,-66.0,2.0,11.0,29.73,45.34,24.08,41.43,4.76,True,36.67,2.7
237724,2019020094,3,4429,-54.0,-32.0,Wrist Shot,FACEOFF,69.0,-22.0,47.0,123.41,48.17,160.51,-41.63,-7.88,False,0.0,2.63
195056,2018020668,1,1252,-32.0,-10.0,Snap Shot,FACEOFF,-69.0,-22.0,3.0,38.9,58.86,30.41,-9.78,-46.34,False,0.0,12.97
225130,2018021159,3,3925,-78.0,6.0,Wrist Shot,BLOCKED_SHOT,-61.0,-19.0,18.0,30.23,13.42,34.67,26.56,-33.23,True,59.79,1.68
261810,2019020488,2,3325,-69.0,-12.0,Backhand,SHOT,-55.0,2.0,5.0,19.8,24.19,35.06,-29.74,3.27,True,-33.01,3.96


In [26]:
# Count the missing values in each feature column.
df.isna().sum()

game_id                  0
period                   0
game_seconds             0
x                        0
y                        0
shot_type                0
prev_type                0
prev_x                4076
prev_y                4075
time_since_prev          0
distance_from_prev    4076
distance_goal            0
prev_distance_goal    4076
angle_shot               0
prev_angle_shot       4076
bounce                   0
angle_change             2
speed                    0
dtype: int64

## Bad data

Annoying previous events of type `SHOT` that have no coordinates (and therefore no distance or angles).
The second one occurred during a shootout. We still have to decide how to deal with shootout shots that have period times of 0.

In [27]:
# Show the rows whose `angle_change` is missing (NaN).
missing_angle_change = df['angle_change'].isna()
df.loc[missing_angle_change]

Unnamed: 0,game_id,period,game_seconds,x,y,shot_type,prev_type,prev_x,prev_y,time_since_prev,distance_from_prev,distance_goal,prev_distance_goal,angle_shot,prev_angle_shot,bounce,angle_change,speed
48612,2016020801,4,4989,-83.0,4.0,Wrist Shot,SHOT,,,4.0,,8.06,,29.75,,True,,
253373,2019020349,5,6000,-72.0,-1.0,Slap Shot,SHOT,,,0.0,,18.03,,-3.18,,True,,instant


Annoying previous event that only has a y coordinate.

In [28]:
# Show the rows whose previous event has a y coordinate but no x coordinate.
prev_x_missing = df['prev_x'].isna()
prev_y_present = df['prev_y'].notna()
df.loc[prev_x_missing & prev_y_present]

Unnamed: 0,game_id,period,game_seconds,x,y,shot_type,prev_type,prev_x,prev_y,time_since_prev,distance_from_prev,distance_goal,prev_distance_goal,angle_shot,prev_angle_shot,bounce,angle_change,speed
109854,2017020563,1,2110,36.0,-29.0,Snap Shot,HIT,,-17.0,33.0,,61.29,,-28.24,,False,0.0,


## Log data

We log the "Winnipeg vs Washington" game of March 12th, 2018 (game 2017021065) to Comet.ml.

In [32]:
# Keep only the rows of game 2017021065, then preview the first few.
game_df = df.loc[df['game_id'].eq(2017021065)]
game_df.head()

Unnamed: 0,game_id,period,game_seconds,x,y,shot_type,prev_type,prev_x,prev_y,time_since_prev,distance_from_prev,distance_goal,prev_distance_goal,angle_shot,prev_angle_shot,bounce,angle_change,speed
141533,2017021065,1,1311,-50.0,36.0,Snap Shot,HIT,72.0,37.0,11.0,122.0,53.81,166.17,41.99,12.87,False,0.0,11.09
141534,2017021065,1,1315,-85.0,-25.0,Wrist Shot,SHOT,-50.0,36.0,4.0,70.33,25.5,53.81,-78.64,41.99,True,-120.63,17.58
141535,2017021065,1,1324,73.0,-16.0,Backhand,TAKEAWAY,-39.0,-28.0,1.0,112.64,23.35,132.0,-43.25,-12.25,False,0.0,112.64
141536,2017021065,1,1351,-29.0,-6.0,Slap Shot,HIT,10.0,38.0,5.0,58.8,61.29,106.98,-5.62,20.81,False,0.0,11.76
141537,2017021065,1,1359,23.0,-34.0,Wrist Shot,SHOT,-29.0,-6.0,8.0,59.06,75.13,119.15,-26.91,-2.89,True,-24.02,7.38


In [33]:
import os
from comet_ml import Experiment

# Open a Comet.ml experiment. The API key is read from the COMET_API_KEY
# environment variable so that no credential is stored in the notebook.
exp = Experiment(
        api_key=os.environ.get('COMET_API_KEY'),
        workspace='ift6758-a5-nhl',
        project_name='milestone2'
    )

try:
    # Log a profile of the single-game DataFrame under a descriptive name;
    # `dataframe_format='csv'` selects CSV for the logged dataframe.
    exp.log_dataframe_profile(
        game_df,
        name='wpg_v_wsh_2017021065',
        dataframe_format='csv',
        )
finally:
    # Always close the experiment, even if the upload raises, so a failed
    # run does not leave a live experiment behind in the kernel.
    exp.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/ift6758-a5-nhl/milestone2/cb8d64a7a50e4aa78fa7acc6a2577020
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     conda-environment-definition : 1
[1;38;5;39mCOMET INFO:[0m     conda-info                   : 1
[1;38;5;39mCOMET INFO:[0m     conda-specification          : 1
[1;38;5;39mCOMET INFO:[0m     environment details          : 1
[1;38;5;39mCOMET INFO:[0m     filename                     : 1
[1;38;5;39mCOMET INFO:[0m     git metadata                 : 1
[1;38;5;39mCOMET INFO:[0m     git-patch (uncompres

TODO: blog part