## Data retrieval and cleaning

Don't forget to delete your `json_clean` folder if you make any modifications to `cleaning.py`!


In [1]:
from ift6758.data.acquisition import NHLGameData

# Folder holding the raw game data; the fetch output shows per-season .pkl caches under it.
data_path_raw = './../../ift6758/data/json_raw/'
nhl_games_data = NHLGameData(data_path_raw)

# Seasons are keyed by their starting year (2016 -> "2016-2017" in the log output).
for year in range(2016, 2020 + 1):
    nhl_games_data.fetch_season(year)

Loading from cache file ./../../ift6758/data/json_raw/2016/2016-regular.pkl
Found 1230 regular games for season 2016-2017
Loading from cache file ./../../ift6758/data/json_raw/2016/2016-playoff.pkl
Found 102 playoff games for season 2016-2017
Loading from cache file ./../../ift6758/data/json_raw/2017/2017-regular.pkl
Found 1271 regular games for season 2017-2018
Loading from cache file ./../../ift6758/data/json_raw/2017/2017-playoff.pkl
Found 105 playoff games for season 2017-2018
Loading from cache file ./../../ift6758/data/json_raw/2018/2018-regular.pkl
Found 1271 regular games for season 2018-2019
Loading from cache file ./../../ift6758/data/json_raw/2018/2018-playoff.pkl
Found 105 playoff games for season 2018-2019
Loading from cache file ./../../ift6758/data/json_raw/2019/2019-regular.pkl
Found 1271 regular games for season 2019-2020
Loading from cache file ./../../ift6758/data/json_raw/2019/2019-playoff.pkl
Found 105 playoff games for season 2019-2020
Loading from cache file ./..

In [2]:
from ift6758.data.cleaning import DataCleaner

# Folder passed to DataCleaner as `data_path_clean`
# (presumably where cleaned seasons are cached -- see the note at the top of this section).
data_path_clean = './../../ift6758/data/json_clean/'
data_cleaner = DataCleaner(
    data_raw=nhl_games_data,
    data_path_clean=data_path_clean,
)

# Clean every season from 2016 through 2020 with both optional flags switched on.
for year in range(2016, 2020 + 1):
    data_cleaner.clean_season(
        year,
        keepPreviousEventInfo=True,
        includePowerPlay=True,
    )

In [3]:
import pandas as pd

# Cleaned data for the season keyed 2020; inspected in the next cells (head and NaN counts).
data_2020 = data_cleaner.get_cleaned_data(2020)

In [4]:
# Preview the first five rows of the cleaned 2020 data.
data_2020.head(n=5)

Unnamed: 0,game_id,period,period_time,type,team,x,y,shooter,goalie,shot_type,...,opposite_team_side,PPActive,PPTimeElapsed,HomeSkaters,AwaySkaters,prev_type,prev_x,prev_y,time_since_prev,distance_from_prev
0,2020020001,1,00:16,SHOT,Philadelphia Flyers,-74.0,29.0,Travis Konecny,Tristan Jarry,Wrist Shot,...,left,False,0,5,5,FACEOFF,0.0,0.0,16,79.48
1,2020020001,1,00:34,SHOT,Pittsburgh Penguins,49.0,-25.0,Evan Rodrigues,Carter Hart,Wrist Shot,...,right,False,0,5,5,BLOCKED_SHOT,-58.0,-7.0,6,108.5
2,2020020001,1,01:05,SHOT,Philadelphia Flyers,-52.0,-31.0,Joel Farabee,Tristan Jarry,Backhand,...,left,False,0,5,5,HIT,-87.0,-33.0,2,35.06
3,2020020001,1,02:51,SHOT,Pittsburgh Penguins,43.0,39.0,Evan Rodrigues,Carter Hart,Snap Shot,...,right,False,0,5,5,FACEOFF,69.0,22.0,42,31.06
4,2020020001,1,03:44,SHOT,Philadelphia Flyers,-53.0,8.0,Claude Giroux,Tristan Jarry,Slap Shot,...,left,False,0,5,5,GIVEAWAY,-66.0,4.0,1,13.6


The `NaN` values for previous events are expected: some previous events are of no interest to us, so their columns are left empty.

In [5]:
# Number of missing values in each column of the cleaned 2020 data.
data_2020.isna().sum(axis=0)

game_id                   0
period                    0
period_time               0
type                      0
team                      0
x                         0
y                         0
shooter                   0
goalie                  277
shot_type                 0
empty_net                 0
strength              50044
opposite_team_side        0
PPActive                  0
PPTimeElapsed             0
HomeSkaters               0
AwaySkaters               0
prev_type                 0
prev_x                 2822
prev_y                 2822
time_since_prev           0
distance_from_prev     2822
dtype: int64

## Feature engineering

In [6]:
from ift6758.features import FeatureEng
# Same path as the `data_path_clean` passed to `DataCleaner` in the cleaning section.
data_path_clean = './../../ift6758/data/json_clean/'
# Feature-engineering helper built on that folder; `w` is reused by the cells below.
w = FeatureEng(data_path_clean)

In [7]:
# Build the feature table; 2016 and 2020 are the season bounds passed to `features_2`
# (NOTE(review): whether the 2020 bound is inclusive is not visible here -- confirm).
df = w.features_2(2016,2020)

# Fixed seed so the displayed rows are identical on every Restart & Run All;
# without `random_state` each run shows a different sample.
df.sample(10, random_state=42)

Unnamed: 0,game_id,period,game_seconds,x,y,shot_type,PPActive,PPTimeElapsed,HomeSkaters,AwaySkaters,...,prev_y,time_since_prev,distance_from_prev,distance_goal,prev_distance_goal,angle_shot,prev_angle_shot,bounce,angle_change,speed
231500,2018021263,2,2595,63.0,35.0,Wrist Shot,False,0,5,5,...,38.0,1.0,15.3,44.2,39.85,52.36,72.47,False,0.0,15.3
253355,2019020349,3,3976,-84.0,21.0,Backhand,False,0,5,5,...,35.0,61.0,50.96,21.84,65.19,74.06,32.47,False,0.0,0.84
260603,2019020468,2,3421,-59.0,-14.0,Wrist Shot,False,0,5,5,...,-6.0,2.0,68.47,34.01,99.18,-24.31,-3.47,False,0.0,34.24
115004,2017020643,2,3018,72.0,-24.0,Wrist Shot,False,0,5,5,...,-38.0,1.0,81.22,30.0,105.11,-53.13,-21.19,False,0.0,81.22
60153,2016020987,3,4502,79.0,-10.0,Wrist Shot,True,119,5,4,...,7.0,4.0,45.31,14.87,53.46,-42.26,7.52,True,-49.78,11.33
57287,2016020942,2,2786,-58.0,-28.0,Slap Shot,False,0,5,5,...,-22.0,10.0,12.53,42.52,30.41,-41.19,-46.34,False,0.0,1.25
220725,2018021088,2,3394,-78.0,5.0,Deflected,False,0,5,5,...,-10.0,14.0,16.55,13.0,21.47,22.62,-27.76,True,50.38,1.18
185194,2018020502,5,6000,77.0,1.0,Wrist Shot,False,0,5,5,...,-5.0,0.0,157.11,13.04,170.07,4.4,-1.68,True,6.08,instant
128372,2017020855,2,2770,58.0,-29.0,Slap Shot,False,0,5,5,...,-9.0,8.0,156.28,43.19,187.22,-42.18,-2.76,False,0.0,19.54
285800,2019020879,2,2827,-59.0,-10.0,Wrist Shot,False,0,5,5,...,36.0,11.0,51.88,32.57,36.67,-17.88,79.03,False,0.0,4.72


In [8]:
# Number of missing values in each column of the feature table.
df.isna().sum(axis=0)

game_id                  0
period                   0
game_seconds             0
x                        0
y                        0
shot_type                0
PPActive                 0
PPTimeElapsed            0
HomeSkaters              0
AwaySkaters              0
prev_type                0
prev_x                4076
prev_y                4075
time_since_prev          0
distance_from_prev    4076
distance_goal            0
prev_distance_goal    4076
angle_shot               0
prev_angle_shot       4076
bounce                   0
angle_change             2
speed                    0
dtype: int64

Annoying rows whose previous `SHOT` event has no coordinates (and therefore no distance or angles relative to it).
The second one occurs during a shootout. We still have to decide how to deal with shootout shots that have period times of 0.

In [9]:
# Rows of the feature table where `angle_change` is missing.
df.loc[df["angle_change"].isna()]

Unnamed: 0,game_id,period,game_seconds,x,y,shot_type,PPActive,PPTimeElapsed,HomeSkaters,AwaySkaters,...,prev_y,time_since_prev,distance_from_prev,distance_goal,prev_distance_goal,angle_shot,prev_angle_shot,bounce,angle_change,speed
48612,2016020801,4,4989,-83.0,4.0,Wrist Shot,False,0,5,5,...,,4.0,,8.06,,29.75,,True,,
253373,2019020349,5,6000,-72.0,-1.0,Slap Shot,False,0,5,5,...,,0.0,,18.03,,-3.18,,True,,instant


Annoying previous event that has only a `y` coordinate. We keep it for the speed and time between events.

In [10]:
# Rows whose previous event has a y coordinate but no x coordinate.
df.loc[df["prev_x"].isna() & df["prev_y"].notna()]

Unnamed: 0,game_id,period,game_seconds,x,y,shot_type,PPActive,PPTimeElapsed,HomeSkaters,AwaySkaters,...,prev_y,time_since_prev,distance_from_prev,distance_goal,prev_distance_goal,angle_shot,prev_angle_shot,bounce,angle_change,speed
109854,2017020563,1,2110,36.0,-29.0,Snap Shot,False,0,5,5,...,-17.0,33.0,,61.29,,-28.24,,False,0.0,
