In [2]:
%load_ext autoreload
%autoreload 2

In [96]:
from  ift6758.data.data_acquisition import Season
import pandas as pd
import numpy as np
import ift6758.features.utilities as utilities
import os
class SeasonDataSetTwo:
    def __init__(self, years):
        """
        param years : array of in : a year seasons
        """
        self.years = years
        self.years.sort()

    def combine_season_periods(self):
        """
        Combine Seasons Info with their periods to get goal coordinates.
        Correct goal coordinates incase they are not existing (Means they are overtime shootouts)
        Can get extra features from periods df as well

        rtype : Pandas DataFrame
        return : The DataFrame With the infos

        @Author : Sai Kalyan (took by Yassir Mamouni for his branch)
        """
        def correctionCoordinates(r):
            if isinstance(r["goalCoordinates"], tuple):
                return r["goalCoordinates"]
            else:
                if r["coordinates.x"]>0:
                    return (89,0)
                else:
                    return (-89,0)
        seasons= []
        periods=[]
        for yr in self.years:
            season = Season(yr,"../ift6758/data")
            df_season = season.clean_data_all_events()
            df_period = season.periodInfo()
            seasons.append(df_season)
            periods.append(df_period)

        df_seasons = pd.concat(seasons).reset_index(drop=True)
        df_periods = pd.concat(periods).reset_index(drop=True)
        map_columns = {"periodType": "about.periodType", "num": "about.period","teamname":"team.name","isHomeTeam":"isHome"}
        df_periods_to_join = df_periods[list(map_columns.keys())+["gamePk","goalCoordinates"]].rename(columns=map_columns)
        df_seasons_periods = df_seasons.merge(df_periods_to_join, how='left',on=["about.periodType","about.period","team.name","gamePk"])
        df_seasons_periods["goalCoordinates"] = df_seasons_periods.apply(lambda r: correctionCoordinates(r),axis=1)

        return df_seasons_periods

    def get_features_bonus(self):
        """
        Getting df with all the features [isGoal,distanceNet,angleNet,emptynet]
        type : Pandas DataFrame
        return : The DataFrame With the basic feature for FE II (4 of Milestone 2) 
        """
        def getPenaltyTime(row,penalty_period_dict):
            gameSec = row["totalGameSeconds"]
            gamePk = row["gamePk"]
            penalty_periods = penalty_period_dict['penaltyTimePeriods'][gamePk]
            for period in penalty_periods:
                if gameSec>period[0] and gameSec<=period[1]:
                    return gameSec-period[0]
                elif gameSec < period[0]:
                    return 0
            return 0
        def time_played(row):
            """
            return time in seconds
            """
            if row['about.period']>3:
                ## Overtime is 5 mins and It can go till Shootouts
                time_secs = 3600 + (row['about.period']-4)*300 + row['gameSeconds']
                return time_secs
            else:
                time_secs =  (row['about.period']-1)*1200 + row['gameSeconds']
                return time_secs
        DIRECTORY  = f"../ift6758/data/PICKLE/"
        file_years = "_".join(map(str,self.years))
        PATH = f"{DIRECTORY}/{file_years}_features_bonus.pkl"
        if os.path.isfile(PATH):
            # print(f"File already Exists, loading from {PATH}")
            # df_clean = pd.read_pickle(PATH)
            # return df_clean
            df_seasons_periods =pd.read_pickle(PATH)
        else:
            df_seasons_periods = self.combine_season_periods()
        # df_seasons_periods =df_seasons_periods[df_seasons_periods["gamePk"]==2017021065]
            df_seasons_periods.to_pickle(PATH)
        # df_seasons_periods =df_seasons_periods[df_seasons_periods["gamePk"]==2017020003]



        #GameSeconds
        df_seasons_periods['gameSeconds'] = pd.to_timedelta('00:' + df_seasons_periods['about.periodTime'].astype(str)) #concat '00:' to have the format 'hh:mm:ss'
        df_seasons_periods['gameSeconds'] = df_seasons_periods['gameSeconds'].dt.total_seconds()
        df_seasons_periods["totalGameSeconds"] = df_seasons_periods[["gameSeconds","about.period"]].apply(lambda r: time_played(r),axis=1)

        ##Bonus
        print(f"Doing Bonus Features")
        dfpenaltyGoals = df_seasons_periods.loc[(df_seasons_periods["result.event"]=="Goal")|(df_seasons_periods["result.event"]=="Penalty")]
        dfgameevent_group=dfpenaltyGoals[["gamePk","result.event","about.periodType","isHome","totalGameSeconds","result.penaltyMinutes"]].groupby(["gamePk","result.event"]).agg(lambda x: list(x)).reset_index()
        dfgame_group = dfgameevent_group.groupby(["gamePk"]).agg(lambda x: list(x)).reset_index()
        dfgame_group["penaltyAdditions"]=dfgame_group.apply(lambda row: utilities.penalty_time_dict(row),axis=1 )


        dfgame_group["penaltyTimePeriods"] = dfgame_group["penaltyAdditions"].apply(lambda row: utilities.getPenaltyTimePeriods(row))
        penalty_add_dict=dfgame_group[["gamePk","penaltyAdditions"]].set_index(["gamePk"]).to_dict()["penaltyAdditions"]
        penalty_period_dict=dfgame_group[["gamePk","penaltyTimePeriods"]].set_index(["gamePk"]).to_dict()
        self.penalty_period_dict = penalty_period_dict

        penalty_add_df = pd.DataFrame.from_dict({(i,j): penalty_add_dict[i][j] 
                                for i in penalty_add_dict.keys() 
                                for j in penalty_add_dict[i].keys()},
                            orient='index')
                            
        penalty_add_df.index = penalty_add_df.index.set_names(['gamePk','totalGameSeconds'])

        penalty_add_df.reset_index(inplace=True)
        df_seasons_periods = pd.concat([df_seasons_periods,penalty_add_df],axis=0,ignore_index=True)
        df_seasons_periods = df_seasons_periods.sort_values(['gamePk','totalGameSeconds'])
        df_seasons_periods.loc[:,["awayAddition","homeAddition"]]=df_seasons_periods.loc[:,["awayAddition","homeAddition"]].fillna(0)
        df_seasons_periods[["awayCum","homeCum"]]= df_seasons_periods.groupby(["gamePk"])[["awayAddition","homeAddition"]].cumsum()
        df_seasons_periods["homePlayers"] = df_seasons_periods[["homeCum","about.periodType"]].apply(lambda r: 5+r["homeCum"] if r["about.periodType"]=="REGULAR" else 3+r["homeCum"],axis=1)
        df_seasons_periods["awayPlayers"] = df_seasons_periods[["awayCum","about.periodType"]].apply(lambda r: 5+r["awayCum"] if r["about.periodType"]=="REGULAR" else 3+r["awayCum"],axis=1)
        df_seasons_periods.dropna(subset=["gameSeconds"],inplace=True)
        df_seasons_periods["penaltyTime"] = df_seasons_periods.apply(lambda r: getPenaltyTime(r,penalty_period_dict),axis=1)
        df_seasons_periods["friendlyPlayers"] = df_seasons_periods.apply(lambda r: r["homePlayers"]if r["isHome"] else r["awayPlayers"],axis=1)
        df_seasons_periods["opposingPlayers"] = df_seasons_periods.apply(lambda r:  r["awayPlayers"]if r["isHome"] else r["homePlayers"],axis=1)
        print(f"Done Bonus Features")

        #We already have Game Period, Coordinates, Shot Type,
        df_seasons_periods["result.emptyNet"] = df_seasons_periods["result.emptyNet"].fillna(0)
        df_seasons_periods["distanceNet"]= df_seasons_periods[['coordinates.x','coordinates.y','goalCoordinates']].apply(lambda r: utilities.distance(r["goalCoordinates"],(r["coordinates.x"],r["coordinates.y"])), axis=1)
        df_seasons_periods["angleNet"]= df_seasons_periods[['coordinates.x','coordinates.y','goalCoordinates']].apply(lambda r: utilities.angle(r["goalCoordinates"],(r["coordinates.x"],r["coordinates.y"])), axis=1)
        df_seasons_periods["isGoal"] =df_seasons_periods[["result.event"]].apply(lambda r: 1 if (r["result.event"]=="Goal") else 0,axis=1) ## Add ISGOAL
        #df_seasons_periods.dropna(subset=['coordinates.x','coordinates.y'],inplace=True) ## Sometimes X-cooridnates and Y-cordinates are Nans removing them, need to understand why they are mssing later

        ###################################
        ### ALA's PART: last event features
        ###################################

        df = df_seasons_periods
    
        #shift all relevant info by one to have the last event information
        # keep the gamePK id and the period for a sanity check
        #this is to prevent leaking between games/periods
        df['last.event.gamePk'] = df['gamePk'].shift(1)
        df['last.event.about.period'] = df['about.period'].shift(1)
        df['lastEventType'] = df['result.event'].shift(1)
        df['last.event.about.periodTime'] = df['about.periodTime'].shift(1)
        df['last.event.coordinates.x'] = df['coordinates.x'].shift(1)
        df['last.event.coordinates.y'] = df['coordinates.y'].shift(1)
        ##
        df['last.event.angleNet'] = df['angleNet'].shift(1)
        
        mask = ((df["result.event"]=="Shot") | (df["result.event"]=="Goal")) & (df['last.event.gamePk'] == df['gamePk']) & (df['last.event.about.period'] == df['about.period'])
        df_masked = df[mask]
        
        #Calculate time between this event and last event in seconds
        df_masked['timeFromLastEvent'] = pd.to_datetime(df_masked['about.periodTime'], format='%M:%S') - pd.to_datetime(df_masked['last.event.about.periodTime'], format='%M:%S')
       
        #Calculate distance between this event and last event in feet
        df_masked['distanceFromLastEvent'] = df_masked.apply(
            lambda row: np.linalg.norm(np.array([row['last.event.coordinates.x'], row['last.event.coordinates.y']])-np.array([row['coordinates.x'], row['coordinates.y']])),
            axis=1)
        
        #add rebound if last event is shot
        df_masked['Rebound'] = np.where(df_masked['lastEventType']=='Shot', True, False)
        
        #convert timeFromLastEvent column to seconds
        df_masked['timeFromLastEvent'] = df_masked['timeFromLastEvent'].dt.total_seconds()
        
        #add speed = dist/time
        df_masked['Speed'] = df_masked['distanceFromLastEvent'] / df_masked['timeFromLastEvent']

        #calculate angle difference
        df_masked['changeInShotAngle'] = np.where(df_masked['Rebound']==True, np.abs(df_masked['angleNet']-df_masked['last.event.angleNet']) , 0)

        #Angle Speed
        df_masked['angleSpeed'] = df_masked['changeInShotAngle'] / df_masked['timeFromLastEvent']

        #drop unneeded columns
        df_clean = df_masked.drop(columns=["result.event","about.periodTime","about.periodType","about.periodTimeRemaining","goalCoordinates","last.event.gamePk","last.event.about.period","last.event.about.periodTime","last.event.angleNet","result.strength.name","result.penaltySeverity","result.penaltyMinutes",'awayAddition', 'homeAddition', 'awayCum', 'homeCum',"homePlayers","awayPlayers"],axis=1).reset_index(drop=True)
        
        df_clean = df_clean.rename({'about.period': 'gamePeriod', 'result.emptyNet': 'emptyNet', 'coordinates.x': 'coordinatesX', 'coordinates.y': 'coordinatesY', 'distanceNet': 'shotDistance', 'angleNet': 'shotAngle', 'result.secondaryType': 'shotType', 'last.event.coordinates.x': 'lastEventCoordinatesX', 'last.event.coordinates.y': 'lastEventCoordinatesY', 'Rebound':'rebound', 'Speed':'speed'}, axis='columns', errors='raise')

        df_clean.to_pickle(PATH)


        return df_clean


In [97]:
# from ift6758.features.feature_engineering2 import SeasonDataSetTwo
years = [2017]
data = SeasonDataSetTwo(years)

In [98]:
# data.penalty_period_dict

In [100]:
df = data.get_features_bonus()

Doing Bonus Features
Done Bonus Features


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_masked['timeFromLastEvent'] = pd.to_datetime(df_masked['about.periodTime'], format='%M:%S') - pd.to_datetime(df_masked['last.event.about.periodTime'], format='%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_masked['distanceFromLastEvent'] = df_masked.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [102]:
pd.set_option('display.max_columns', None)

subset_df = df[df['gamePk']==2017021065].drop(["gamePk","team.name","emptyNet"],axis=1)
subset_df.head(5)

Unnamed: 0,gamePeriod,coordinatesX,coordinatesY,shotType,isHome,gameSeconds,totalGameSeconds,penaltyTime,friendlyPlayers,opposingPlayers,shotDistance,shotAngle,isGoal,lastEventType,lastEventCoordinatesX,lastEventCoordinatesY,timeFromLastEvent,distanceFromLastEvent,rebound,speed,changeInShotAngle,angleSpeed
68667,1.0,-50.0,36.0,Snap Shot,True,111.0,111.0,0.0,5.0,5.0,53.075418,42.70939,0,Hit,72.0,37.0,11.0,122.004098,False,11.091282,0.0,0.0
68668,1.0,-85.0,-25.0,Wrist Shot,True,115.0,115.0,0.0,5.0,5.0,25.317978,-80.909723,0,Shot,-50.0,36.0,4.0,70.327804,True,17.581951,123.619113,30.904778
68669,1.0,73.0,-16.0,Backhand,False,124.0,124.0,0.0,5.0,5.0,22.627417,45.0,0,Takeaway,-39.0,-28.0,1.0,112.641023,False,112.641023,0.0,0.0
68670,1.0,-29.0,-6.0,Slap Shot,True,151.0,151.0,0.0,5.0,5.0,60.299254,-5.710593,0,Hit,10.0,38.0,5.0,58.796258,False,11.759252,0.0,0.0
68671,1.0,23.0,-34.0,Wrist Shot,False,159.0,159.0,0.0,5.0,5.0,74.242845,27.255328,0,Shot,-29.0,-6.0,8.0,59.059292,True,7.382412,32.965922,4.12074


In [105]:
subset_df.columns

Index(['gamePeriod', 'coordinatesX', 'coordinatesY', 'shotType', 'isHome',
       'gameSeconds', 'totalGameSeconds', 'penaltyTime', 'friendlyPlayers',
       'opposingPlayers', 'shotDistance', 'shotAngle', 'isGoal',
       'lastEventType', 'lastEventCoordinatesX', 'lastEventCoordinatesY',
       'timeFromLastEvent', 'distanceFromLastEvent', 'rebound', 'speed',
       'changeInShotAngle', 'angleSpeed'],
      dtype='object')

In [103]:
import os
from comet_ml import Experiment

'COMET_API_KEY' in os.environ

True

In [104]:
experiment = Experiment(
    api_key=os.environ.get('COMET_API_KEY'),
    project_name='feature_engineering_data',
    workspace='morph-e',
)



COMET INFO: Experiment is live on comet.ml https://www.comet.ml/morph-e/feature-engineering-data/0a9c5160f85e4b97b90ce2902d446206



In [106]:
experiment.log_dataframe_profile(
subset_df, 
name='wpg_v_wsh_2017021065',  # keep this name
dataframe_format='csv'  # ensure you set this flag!
)

  x = asanyarray(arr - arrmean)
Summarize dataset: 100%|██████████| 263/263 [01:18<00:00,  3.35it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:17<00:00, 17.82s/it]
Render HTML: 100%|██████████| 1/1 [00:09<00:00,  9.52s/it]


{'profile': {'web': 'https://www.comet.ml/api/asset/download?assetId=9adb7930843b4f4b97064e08432eae84&experimentKey=0a9c5160f85e4b97b90ce2902d446206',
  'api': 'https://www.comet.ml/api/rest/v2/experiment/asset/get-asset?assetId=9adb7930843b4f4b97064e08432eae84&experimentKey=0a9c5160f85e4b97b90ce2902d446206',
  'assetId': '9adb7930843b4f4b97064e08432eae84'},
 'dataframe': {'web': 'https://www.comet.ml/api/asset/download?assetId=d7a6673691184079b3720f5215a2d5e9&experimentKey=0a9c5160f85e4b97b90ce2902d446206',
  'api': 'https://www.comet.ml/api/rest/v2/experiment/asset/get-asset?assetId=d7a6673691184079b3720f5215a2d5e9&experimentKey=0a9c5160f85e4b97b90ce2902d446206',
  'assetId': 'd7a6673691184079b3720f5215a2d5e9'}}

In [107]:
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/morph-e/feature-engineering-data/0a9c5160f85e4b97b90ce2902d446206
COMET INFO:   Uploads:
COMET INFO:     dataframe                : 1 (12.97 KB)
COMET INFO:     dataframe-profile        : 1 (6.93 MB)
COMET INFO:     environment details      : 1
COMET INFO:     filename                 : 1
COMET INFO:     git metadata             : 1
COMET INFO:     git-patch (uncompressed) : 1 (312.95 KB)
COMET INFO:     installed packages       : 1
COMET INFO:     notebook                 : 1
COMET INFO:     source_code              : 1
COMET INFO: ---------------------------
COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)
COMET INFO: The Python SDK has 3600 seconds to finish before aborting...
COME

In [88]:
df[df["gamePk"]==2017021065]
# .to_csv("dummy.csv")


Unnamed: 0,gamePk,team.name,gamePeriod,coordinatesX,coordinatesY,shotType,emptyNet,isHome,gameSeconds,totalGameSeconds,...,isGoal,lastEventType,lastEventCoordinatesX,lastEventCoordinatesY,timeFromLastEvent,distanceFromLastEvent,rebound,speed,changeInShotAngle,angleSpeed
68667,2017021065,Washington Capitals,1.0,-50.0,36.0,Snap Shot,0,True,111.0,111.0,...,0,Hit,72.0,37.0,11.0,122.004098,False,11.091282,0.000000,0.000000
68668,2017021065,Washington Capitals,1.0,-85.0,-25.0,Wrist Shot,0,True,115.0,115.0,...,0,Shot,-50.0,36.0,4.0,70.327804,True,17.581951,123.619113,30.904778
68669,2017021065,Winnipeg Jets,1.0,73.0,-16.0,Backhand,0,False,124.0,124.0,...,0,Takeaway,-39.0,-28.0,1.0,112.641023,False,112.641023,0.000000,0.000000
68670,2017021065,Washington Capitals,1.0,-29.0,-6.0,Slap Shot,0,True,151.0,151.0,...,0,Hit,10.0,38.0,5.0,58.796258,False,11.759252,0.000000,0.000000
68671,2017021065,Winnipeg Jets,1.0,23.0,-34.0,Wrist Shot,0,False,159.0,159.0,...,0,Shot,-29.0,-6.0,8.0,59.059292,True,7.382412,32.965922,4.120740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68733,2017021065,Washington Capitals,4.0,71.0,-8.0,Snap Shot,0,True,95.0,3695.0,...,0,Shot,76.0,-5.0,7.0,5.830952,True,0.832993,2.924978,0.417854
68734,2017021065,Washington Capitals,4.0,58.0,10.0,Slap Shot,0,True,106.0,3706.0,...,0,Shot,71.0,-8.0,11.0,22.203603,True,2.018509,41.841186,3.803744
68735,2017021065,Washington Capitals,4.0,60.0,23.0,Slap Shot,0,True,134.0,3734.0,...,0,Hit,-61.0,-32.0,14.0,132.913506,False,9.493822,0.000000,0.000000
68736,2017021065,Washington Capitals,4.0,74.0,1.0,Wrist Shot,0,True,207.0,3807.0,...,0,Hit,57.0,-12.0,2.0,21.400935,False,10.700467,0.000000,0.000000


In [64]:
df[df["gamePK"]==2017021065]
# .to_csv("dummy.csv")


Unnamed: 0,gamePk,team.name,gamePeriod,coordinatesX,coordinatesY,shotType,emptyNet,isHome,gameSeconds,totalGameSeconds,...,isGoal,lastEventType,lastEventCoordinatesX,lastEventCoordinatesY,timeFromLastEvent,distanceFromLastEvent,rebound,speed,changeInShotAngle,angleSpeed
0,2017021065,Washington Capitals,1.0,-50.0,36.0,Snap Shot,0,True,111.0,111.0,...,0,Hit,72.0,37.0,11.0,122.004098,False,11.091282,0.000000,0.000000
1,2017021065,Washington Capitals,1.0,-85.0,-25.0,Wrist Shot,0,True,115.0,115.0,...,0,Shot,-50.0,36.0,4.0,70.327804,True,17.581951,123.619113,30.904778
2,2017021065,Winnipeg Jets,1.0,73.0,-16.0,Backhand,0,False,124.0,124.0,...,0,Takeaway,-39.0,-28.0,1.0,112.641023,False,112.641023,0.000000,0.000000
3,2017021065,Washington Capitals,1.0,-29.0,-6.0,Slap Shot,0,True,151.0,151.0,...,0,Hit,10.0,38.0,5.0,58.796258,False,11.759252,0.000000,0.000000
4,2017021065,Winnipeg Jets,1.0,23.0,-34.0,Wrist Shot,0,False,159.0,159.0,...,0,Shot,-29.0,-6.0,8.0,59.059292,True,7.382412,32.965922,4.120740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2017021065,Washington Capitals,4.0,71.0,-8.0,Snap Shot,0,True,95.0,3695.0,...,0,Shot,76.0,-5.0,7.0,5.830952,True,0.832993,2.924978,0.417854
67,2017021065,Washington Capitals,4.0,58.0,10.0,Slap Shot,0,True,106.0,3706.0,...,0,Shot,71.0,-8.0,11.0,22.203603,True,2.018509,41.841186,3.803744
68,2017021065,Washington Capitals,4.0,60.0,23.0,Slap Shot,0,True,134.0,3734.0,...,0,Hit,-61.0,-32.0,14.0,132.913506,False,9.493822,0.000000,0.000000
69,2017021065,Washington Capitals,4.0,74.0,1.0,Wrist Shot,0,True,207.0,3807.0,...,0,Hit,57.0,-12.0,2.0,21.400935,False,10.700467,0.000000,0.000000


In [14]:
DIRECTORY  = f"../ift6758/data/PICKLE/"
file_years = "2017"
PATH = f"{DIRECTORY}/{file_years}_features_bonus.pkl"
if os.path.isfile(PATH):
    print(f"File already Exists, loading from {PATH}")
    # df_clean = pd.read_pickle(PATH)
    # return df_clean
    df_seasons_periods =pd.read_pickle(PATH)


File already Exists, loading from ../ift6758/data/PICKLE//2017_features_bonus.pkl


In [15]:
df_seasons_periods

Unnamed: 0,result.event,result.penaltySeverity,result.penaltyMinutes,gamePk,team.name,about.period,about.periodTime,about.periodType,about.periodTimeRemaining,coordinates.x,coordinates.y,result.secondaryType,result.emptyNet,result.strength.name,isHome,goalCoordinates
