# Notebook for feature engineering II

In [1]:
%load_ext autoreload
%autoreload 2

#Imports
import pandas as pd
import requests
import os
from tqdm.notebook import tqdm
import pickle
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from  ift6758.data.data_acquisition import Season
from ift6758.visualizations.simple_visualization import Utilities 

In [2]:
season2015 = Season(2015,"../ift6758/data")
season2016 = Season(2016,"../ift6758/data")
season2017 = Season(2017,"../ift6758/data")
season2018 = Season(2018,"../ift6758/data")
season2019 = Season(2019,"../ift6758/data")

In [5]:
#data_2015 = season2015.get_season_data()
# data_2016 = season2016.get_season_data()
data_2017 = season2017.get_season_data()
# data_2018 = season2018.get_season_data()
# data_2019 = season2019.get_season_data()

File already Exists, loading from ../ift6758/data/PICKLE//2017.pkl
Len of games_list in 2017 is 1488


## From the already tidied data, include the following features:
Now, to each shot, add information from the previous events. Note that this could be any event, not just shots. This means you will likely have to update your code which produced the tidied data to now also look at other event types. As a reminder, you may find this endpoint useful:

https://statsapi.web.nhl.com/api/v1/playTypes

To each shot, add the following information from the immediately preceding event as four new features:
* Last event type
* Coordinates of the last event (x, y, separate columns)
* Time from the last event (seconds)
* Distance from the last event

With this new information, we will create a few more features which try to quantify a few more interesting things about the state of the play. Create the following three features:
* Rebound (bool): True if the last event was also a shot, otherwise False
* Change in shot angle; only include if the shot is a rebound, otherwise 0.
* “Speed”: defined as the distance from the previous event, divided by the time since the previous event. 

In [47]:
def clean_data_for_fe2(data):
        """
        Cleaning data for the data of season and add features for feature engineering 2 task
        """ 

        df = pd.json_normalize(data,record_path=['liveData','plays','allPlays'],meta=['gamePk'])
        select_columns = ["result.event","gamePk","team.name","about.period","about.periodTime","coordinates.x","coordinates.y"]
        df = df[select_columns]
    
        #shift all relevant info by one to have the last event information
        # keep the gamePK id and the period for a sanity check
        #this is to prevent leaking between games/periods
        df['last.event.gamePk'] = df['gamePk'].shift(1)
        df['last.event.about.period'] = df['about.period'].shift(1)
        df['lastEventType'] = df['result.event'].shift(1)
        df['last.event.about.periodTime'] = df['about.periodTime'].shift(1)
        df['last.event.coordinates.x'] = df['coordinates.x'].shift(1)
        df['last.event.coordinates.y'] = df['coordinates.y'].shift(1)
        
        mask = (df["result.event"]=="Shot") | (df["result.event"]=="Goal") & (df['last.event.gamePk'] == df['gamePk']) & (df['last.event.about.period'] == df['about.period'])
        df_masked = df[mask]
        
        #Calculate time between this event and last event in seconds
        df_masked['timeFromLastEvent'] = pd.to_datetime(df_masked['about.periodTime'], format='%M:%S') - pd.to_datetime(df_masked['last.event.about.periodTime'], format='%M:%S')
       
        #Calculate distance between this event and last event in feet
        df_masked['distanceFromLastEvent'] = df_masked.apply(
            lambda row: np.linalg.norm(np.array([row['last.event.coordinates.x'], row['last.event.coordinates.y']])-np.array([row['coordinates.x'], row['coordinates.y']])),
            axis=1)
        
        #add rebound if last event is shot
        df_masked['Rebound'] = np.where(df_masked['lastEventType']=='Shot', True, False)
        
        #convert timeFromLastEvent column to seconds
        df_masked['timeFromLastEvent'] = df_masked['timeFromLastEvent'].dt.total_seconds()
        
        #add speed = dist/time
        df_masked['Speed'] = df_masked['distanceFromLastEvent'] / df_masked['timeFromLastEvent']
        
        
        #drop unneeded columns
        df_clean = df_masked.drop(columns=["about.periodTime","last.event.gamePk","last.event.about.period"],axis=1).reset_index(drop=True)
        
        return df_clean


In [48]:
df = clean_data_for_fe2(data_2017)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_masked['timeFromLastEvent'] = pd.to_datetime(df_masked['about.periodTime'], format='%M:%S') - pd.to_datetime(df_masked['last.event.about.periodTime'], format='%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_masked['distanceFromLastEvent'] = df_masked.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [50]:
df.head()

Unnamed: 0,result.event,gamePk,team.name,about.period,coordinates.x,coordinates.y,lastEventType,last.event.about.periodTime,last.event.coordinates.x,last.event.coordinates.y,timeFromLastEvent,distanceFromLastEvent,Rebound,Speed
0,Shot,2017010001,Vancouver Canucks,1,-43.0,5.0,Faceoff,05:27,-20.0,22.0,2.0,28.600699,False,14.30035
1,Shot,2017010001,Vancouver Canucks,1,75.0,-13.0,Hit,05:48,-97.0,26.0,6.0,176.366097,False,29.394349
2,Shot,2017010001,Vancouver Canucks,1,84.0,-9.0,Shot,05:54,75.0,-13.0,2.0,9.848858,True,4.924429
3,Shot,2017010001,Los Angeles Kings,1,-70.0,1.0,Shot,05:56,84.0,-9.0,13.0,154.324334,True,11.871103
4,Shot,2017010001,Los Angeles Kings,1,-54.0,-25.0,Shot,06:09,-70.0,1.0,31.0,30.528675,True,0.984796


In [23]:
len(df.index)

92688

In [19]:
df.gamePk.unique()

array([2017010001, 2017010002, 2017010004, ..., 2017040631, 2017040632,
       2017040633], dtype=object)