# Notebook for feature engineering II

In [16]:
%load_ext autoreload
%autoreload 2

#Imports
import pandas as pd
import requests
import os
from tqdm.notebook import tqdm
import pickle
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from  ift6758.data.data_acquisition import Season
from ift6758.visualizations.simple_visualization import Utilities 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
season2015 = Season(2015,"../ift6758/data")
season2016 = Season(2016,"../ift6758/data")
season2017 = Season(2017,"../ift6758/data")
season2018 = Season(2018,"../ift6758/data")
season2019 = Season(2019,"../ift6758/data")

In [18]:
df_2015 = season2015.clean_data()
df_2016 = season2016.clean_data()
df_2017 = season2017.clean_data()
df_2018 = season2018.clean_data()
df_2019 = season2019.clean_data()

File already Exists, loading from ../ift6758/data/PICKLE//2015_clean.pkl
File already Exists, loading from ../ift6758/data/PICKLE//2016_clean.pkl
File already Exists, loading from ../ift6758/data/PICKLE//2017_clean.pkl
File already Exists, loading from ../ift6758/data/PICKLE//2018_clean.pkl
File already Exists, loading from ../ift6758/data/PICKLE//2019_clean.pkl


In [19]:
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76620 entries, 0 to 76619
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   result.event               76620 non-null  object 
 1   gamePk                     76620 non-null  object 
 2   team.name                  76620 non-null  object 
 3   about.period               76620 non-null  int64  
 4   about.periodTime           76620 non-null  object 
 5   about.periodType           76620 non-null  object 
 6   about.periodTimeRemaining  76620 non-null  object 
 7   coordinates.x              76619 non-null  float64
 8   coordinates.y              76619 non-null  float64
 9   result.secondaryType       76608 non-null  object 
 10  result.emptyNet            7157 non-null   object 
 11  result.strength.name       7361 non-null   object 
 12  shooter                    76620 non-null  object 
 13  goalie                     76251 non-null  obj

In [15]:
 df_2019['result.secondaryType'].sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of 0            Tip-In
1         Snap Shot
2         Snap Shot
3        Wrist Shot
4         Slap Shot
            ...    
76615     Snap Shot
76616    Wrist Shot
76617     Snap Shot
76618    Wrist Shot
76619    Wrist Shot
Name: result.secondaryType, Length: 76620, dtype: object>

## From the already tidied data, include the following features:
* Game seconds
* Game period
* Coordinates (x,y, separate columns)
* Shot distance
* Shot angle
* Shot type
* Empty net


### Here the *Game seconds* feature preprocessing

In [20]:
df_2019['about.periodTime']

0        00:25
1        01:31
2        03:23
3        03:56
4        04:47
         ...  
76615    16:01
76616    17:20
76617    18:50
76618    19:27
76619    19:45
Name: about.periodTime, Length: 76620, dtype: object

In [21]:
df_test = pd.DataFrame()
df_test['GameSeconds'] = pd.to_timedelta('00:' + df_2019['about.periodTime'].astype(str)) #concat '00:' to have the format 'hh:mm:ss'

In [22]:
df_test['GameSeconds'] = df_test['GameSeconds'].dt.total_seconds()

In [23]:
df_test

Unnamed: 0,GameSeconds
0,25.0
1,91.0
2,203.0
3,236.0
4,287.0
...,...
76615,961.0
76616,1040.0
76617,1130.0
76618,1167.0


In [10]:
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76620 entries, 0 to 76619
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   result.event               76620 non-null  object 
 1   gamePk                     76620 non-null  object 
 2   team.name                  76620 non-null  object 
 3   about.period               76620 non-null  int64  
 4   about.periodTime           76620 non-null  object 
 5   about.periodType           76620 non-null  object 
 6   about.periodTimeRemaining  76620 non-null  object 
 7   coordinates.x              76619 non-null  float64
 8   coordinates.y              76619 non-null  float64
 9   result.secondaryType       76608 non-null  object 
 10  result.emptyNet            7157 non-null   object 
 11  result.strength.name       7361 non-null   object 
 12  shooter                    76620 non-null  object 
 13  goalie                     76251 non-null  obj

In [15]:
df_2019

Unnamed: 0,result.event,gamePk,team.name,about.period,about.periodTime,about.periodType,about.periodTimeRemaining,coordinates.x,coordinates.y,result.secondaryType,result.emptyNet,result.strength.name,shooter,goalie
0,Goal,2019020001,Ottawa Senators,1,00:25,REGULAR,19:35,85.0,-1.0,Tip-In,False,Even,Brady Tkachuk,Frederik Andersen
1,Shot,2019020001,Toronto Maple Leafs,1,01:31,REGULAR,18:29,-32.0,-2.0,Snap Shot,,,Morgan Rielly,Craig Anderson
2,Shot,2019020001,Ottawa Senators,1,03:23,REGULAR,16:37,63.0,-6.0,Snap Shot,,,Dylan DeMelo,Frederik Andersen
3,Shot,2019020001,Toronto Maple Leafs,1,03:56,REGULAR,16:04,-59.0,-20.0,Wrist Shot,,,Morgan Rielly,Craig Anderson
4,Shot,2019020001,Toronto Maple Leafs,1,04:47,REGULAR,15:13,-42.0,-29.0,Slap Shot,,,Tyson Barrie,Craig Anderson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76615,Shot,2019030416,Tampa Bay Lightning,3,16:01,REGULAR,03:59,-59.0,-24.0,Snap Shot,,,Anthony Cirelli,Anton Khudobin
76616,Shot,2019030416,Tampa Bay Lightning,3,17:20,REGULAR,02:40,-81.0,3.0,Wrist Shot,,,Blake Coleman,Anton Khudobin
76617,Shot,2019030416,Dallas Stars,3,18:50,REGULAR,01:10,34.0,-6.0,Snap Shot,,,John Klingberg,Andrei Vasilevskiy
76618,Shot,2019030416,Dallas Stars,3,19:27,REGULAR,00:33,31.0,10.0,Wrist Shot,,,John Klingberg,Andrei Vasilevskiy


In [40]:
from ift6758.features.feature_engineering2 import SeasonDataSetTwo
years = [2015,2016,2017,2018]
new_2018 = SeasonDataSetTwo(2018)

In [41]:
df_new_2018 = new_2018.combine_season_periods()

TypeError: 'int' object is not iterable

In [None]:
df_new_2018 = new_2018.get_tidy_data()

In [None]:
df_new_2018.head()