In [42]:
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams["font.size"] = 16

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
    RandomizedSearchCV,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant

### Data Cleaning and Preprocessing

In [43]:
# Read the game-log CSV into a DataFrame and show its (rows, columns) shape.
gamelog_path = "data/gamelogs.csv"
gamelog_df = pd.read_csv(gamelog_path)
gamelog_df.shape

(7314, 67)

We first drop `goalsFor` and `goalsAgainst` since they are essentially what we are predicting. Furthermore, some of the features are redundant and will be perfectly collinear with one another, since a few features are a combination of context-specific features (e.g. xGoalsFor is a combination of powerplay, shorthanded, other and 5v5 xgoals). Thus, we will drop `lowDangerxGoalsFor`, `mediumDangerxGoalsFor`, `highDangerxGoalsFor`, `lowDangerxGoalsAgainst`, `mediumDangerxGoalsAgainst`, `highDangerxGoalsAgainst`, `lowDangerGoalsFor`, `mediumDangerGoalsFor`, `highDangerGoalsFor`, `lowDangerGoalsAgainst`, `mediumDangerGoalsAgainst`, `highDangerGoalsAgainst`, `lowDangerShotsFor`, `mediumDangerShotsFor`, `highDangerShotsFor`, `lowDangerShotsAgainst`, `mediumDangerShotsAgainst`, and `highDangerShotsAgainst`, since expected goals already accounts for shot quality and shot-quality-specific data would therefore be redundant. Moreover, we also drop `savedShotsOnGoalFor` and `savedShotsOnGoalAgainst` since they are dependent on shots on goal for/against. We also remove all corsi and fenwick percentage columns as they are a combination of shots on goal, shots blocked and other attempted shots; however, blocked and missed shot attempts likely do not have strong predictive power for whether a team wins or loses, thus we will focus on shots on goal and blocked shots specifically.

In [45]:
# Columns removed from the raw game log, grouped by the kind of column they are.
# Final score columns.
score_columns = ["goalsFor", "goalsAgainst"]

# Corsi / Fenwick percentage columns (all situations and five-on-five).
shot_share_columns = [
    "fenwickPercentage",
    "corsiPercentage",
    "fiveOnfiveFenwickPercentage",
    "fiveOnfiveCorsiPercentage",
]

# Expected goals split by danger level.
danger_xgoals_columns = [
    "lowDangerxGoalsFor",
    "mediumDangerxGoalsFor",
    "highDangerxGoalsFor",
    "lowDangerxGoalsAgainst",
    "mediumDangerxGoalsAgainst",
    "highDangerxGoalsAgainst",
]

# Goals split by danger level.
danger_goals_columns = [
    "lowDangerGoalsFor",
    "mediumDangerGoalsFor",
    "highDangerGoalsFor",
    "lowDangerGoalsAgainst",
    "mediumDangerGoalsAgainst",
    "highDangerGoalsAgainst",
]

# Shots split by danger level.
danger_shots_columns = [
    "lowDangerShotsFor",
    "mediumDangerShotsFor",
    "highDangerShotsFor",
    "lowDangerShotsAgainst",
    "mediumDangerShotsAgainst",
    "highDangerShotsAgainst",
]

# Saved shots on goal.
saved_shot_columns = ["savedShotsOnGoalFor", "savedShotsOnGoalAgainst"]

redundant_columns = (
    shot_share_columns
    + score_columns
    + danger_xgoals_columns
    + danger_goals_columns
    + danger_shots_columns
    + saved_shot_columns
)

# drop() returns a new frame; gamelog_df itself is left untouched.
gamelog_features = gamelog_df.drop(columns=redundant_columns)

gamelog_features

Unnamed: 0,Team,Season,gameId,OpposingTeam,gameDate,home_or_away,situation,xGoalsFor,shotsOnGoalFor,blockedShotAttemptsFor,...,last_10_avg_xgoals_for,last_10_avg_xgoals_against,result,home_away,last_10_avg_goals_for,last_10_avg_goals_against,opp_last_10_goals_for,opp_last_10_goals_against,opp_last_10_xgoals_against,opp_last_10_xgoals_for
0,NYR,2022,2022020003,TBL,20221011,HOME,all,5.589,39,22,...,,,1,1,,,,,,
1,NYR,2022,2022020017,MIN,20221013,AWAY,all,3.165,35,13,...,5.58900,2.754000,1,0,3.000000,1.000000,,,,
2,NYR,2022,2022020023,WPG,20221014,AWAY,all,2.814,41,13,...,4.37700,3.605500,0,0,5.000000,2.000000,,,,
3,NYR,2022,2022020039,ANA,20221017,HOME,all,3.741,43,12,...,3.85600,3.810667,1,1,3.666667,2.666667,3.0,5.5,4.4490,2.3725
4,NYR,2022,2022020064,SJS,20221020,HOME,all,2.970,23,11,...,3.82725,3.265000,0,1,4.250000,3.000000,1.6,3.8,3.1180,2.5904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7309,CAR,2023,2023030222,NYR,20240507,AWAY,all,4.333,57,29,...,3.83100,2.475300,0,0,3.800000,2.700000,3.3,2.2,2.8051,3.7086
7310,CAR,2023,2023030223,NYR,20240509,HOME,all,4.027,47,17,...,3.97880,2.813800,0,1,3.700000,3.000000,3.2,2.3,3.0097,3.7584
7311,CAR,2023,2023030224,NYR,20240511,HOME,all,3.661,31,18,...,3.96580,2.848500,1,1,3.400000,3.100000,3.3,2.1,3.0895,3.6717
7312,CAR,2023,2023030225,NYR,20240513,AWAY,all,3.172,28,25,...,3.80200,2.943400,1,0,3.400000,3.200000,3.5,2.1,3.2085,3.5165


In [3]:
# Build a new dataframe holding, for every game, the rolling average of each
# statistic over the team's previous games in the same season, alongside the
# identifying / categorical columns that are copied through unchanged.

# Number of previous games averaged; also used in the generated column names
# (e.g. "shotsOnGoalFor_last_5_games").
rolling_window = 5

# Pass-through columns, selected by position in gamelog_df.
# NOTE(review): presumably Team..gameDate plus result/home_away, judging by the
# displayed frame - confirm against the CSV header if its column order changes.
features = gamelog_df.columns[0:5].tolist() + gamelog_df.columns[59:61].tolist()

# Statistics to average, also selected by position.
columns_to_avg = gamelog_df.columns[7:57].tolist()

# Both the grouping and the first-game mask are identical for every column,
# so compute them once instead of once per loop iteration.
team_season_groups = gamelog_df.groupby(["Team", "Season"], group_keys=False)
first_game_mask = team_season_groups.cumcount() == 0

rolling_avg_df = gamelog_df[features].copy()

for column in columns_to_avg:
    avg_column = f"{column}_last_{rolling_window}_games"

    # shift(1) excludes the current game so a row only sees earlier games;
    # min_periods=1 lets games 2..rolling_window average whatever history exists.
    rolling_avg_df[avg_column] = team_season_groups[column].apply(
        lambda x: x.shift(1).rolling(window=rolling_window, min_periods=1).mean()
    )

    # A team's first game of a season has no history (NaN after the shift);
    # fill it with 0 in the SAME column that was just created.
    rolling_avg_df.loc[first_game_mask, avg_column] = 0

Some of the features are redundant and will be perfectly collinear with one another, since a few features are a combination of context-specific features (e.g. xGoalsFor_last_5_games is a combination of powerplay, shorthanded, other and 5v5 xgoals). Thus, we will drop `xGoalsFor_last_5_games`, `xGoalsAgainst_last_5_games`, `lowDangerxGoalsFor_last_5_games`, `mediumDangerxGoalsFor_last_5_games`, `highDangerxGoalsFor_last_5_games`, `lowDangerxGoalsAgainst_last_5_games`, `mediumDangerxGoalsAgainst_last_5_games`, `highDangerxGoalsAgainst_last_5_games`, `lowDangerGoalsFor_last_5_games`, `mediumDangerGoalsFor_last_5_games`, `highDangerGoalsFor_last_5_games`, `lowDangerGoalsAgainst_last_5_games`, `mediumDangerGoalsAgainst_last_5_games`, `highDangerGoalsAgainst_last_5_games`, `lowDangerShotsFor_last_5_games`, `mediumDangerShotsFor_last_5_games`, `highDangerShotsFor_last_5_games`, `lowDangerShotsAgainst_last_5_games`, `mediumDangerShotsAgainst_last_5_games`, and `highDangerShotsAgainst_last_5_games`, since expected goals already accounts for shot quality and shot-quality-specific data would therefore be redundant. Furthermore, we also drop `savedShotsOnGoalFor_last_5_games` and `savedShotsOnGoalAgainst_last_5_games` since they are dependent on shots on goal for/against. We also remove all corsi and fenwick percentage columns as they are a combination of shots on goal, shots blocked and other attempted shots; however, blocked and missed shot attempts likely do not have strong predictive power for whether a team wins or loses, thus we will focus on shots on goal and blocked shots specifically.

In [32]:
# Drop the redundant rolling-average columns from rolling_avg_df.
#
# The names are built from the underlying statistic name plus one shared suffix
# instead of being typed out in full: the hand-written list contained
# "fiveOnfiveCorsi_last_5_games", which does not correspond to the
# "fiveOnfiveCorsiPercentage" statistic and would make drop() raise KeyError.
rolling_suffix = "_last_5_games"

redundant_stats = [
    # Corsi / Fenwick percentage columns (all situations and five-on-five).
    "fenwickPercentage", "corsiPercentage",
    "fiveOnfiveFenwickPercentage", "fiveOnfiveCorsiPercentage",
    # Overall expected goals.
    "xGoalsFor", "xGoalsAgainst",
    # Expected goals split by danger level.
    "lowDangerxGoalsFor", "mediumDangerxGoalsFor", "highDangerxGoalsFor",
    "lowDangerxGoalsAgainst", "mediumDangerxGoalsAgainst", "highDangerxGoalsAgainst",
    # Goals split by danger level.
    "lowDangerGoalsFor", "mediumDangerGoalsFor", "highDangerGoalsFor",
    "lowDangerGoalsAgainst", "mediumDangerGoalsAgainst", "highDangerGoalsAgainst",
    # Shots split by danger level.
    "lowDangerShotsFor", "mediumDangerShotsFor", "highDangerShotsFor",
    "lowDangerShotsAgainst", "mediumDangerShotsAgainst", "highDangerShotsAgainst",
    # Saved shots on goal.
    "savedShotsOnGoalFor", "savedShotsOnGoalAgainst",
]

redundant_avg_columns = [f"{stat}{rolling_suffix}" for stat in redundant_stats]

# drop() returns a new frame; rolling_avg_df keeps every column. A missing name
# still raises KeyError, so a mismatch with the rolling-average cell is loud.
gamelog_avg_df = rolling_avg_df.drop(columns=redundant_avg_columns)

gamelog_avg_df

Unnamed: 0,Team,Season,gameId,OpposingTeam,gameDate,result,home_away,shotsOnGoalFor_last_5_games,blockedShotAttemptsFor_last_5_games,goalsFor_last_5_games,...,takeawaysAgainst_last_5_games,giveawaysAgainst_last_5_games,powerPlayxGoalsFor_last_5_games,powerPlayxGoalsAgainst_last_5_games,shortHandedxGoalsAgainst_last_5_games,otherxGoalsFor_last_5_games,otherxGoalsAgainst_last_5_games,shortHandedxGoalsFor_last_5_games,fiveOnFivexGoalsFor_last_5_games,fiveOnFivexGoalsAgainst_last_5_games
0,NYR,2022,2022020003,TBL,20221011,1,1,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0000
1,NYR,2022,2022020017,MIN,20221013,1,0,39.000000,22.0,3.000000,...,8.000000,15.000000,2.2300,0.000000,0.088000,0.875000,1.38700,0.372000,2.112000,1.2790
2,NYR,2022,2022020023,WPG,20221014,0,0,37.000000,17.5,5.000000,...,8.500000,10.500000,1.5580,0.074000,0.490000,0.742500,1.48850,0.194000,1.882500,1.5530
3,NYR,2022,2022020039,ANA,20221017,1,1,38.333333,16.0,3.666667,...,9.666667,12.666667,1.0820,0.049333,0.486333,0.617333,1.35300,0.149333,2.007333,1.9220
4,NYR,2022,2022020064,SJS,20221020,0,1,39.500000,15.0,4.250000,...,8.250000,12.000000,1.0880,0.039500,0.370500,0.463000,1.01475,0.120500,2.155500,1.8405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7309,CAR,2023,2023030222,NYR,20240507,0,0,33.600000,26.4,3.800000,...,5.200000,13.000000,0.9152,0.041000,0.404000,0.910200,0.51700,0.017800,2.058000,1.9198
7310,CAR,2023,2023030223,NYR,20240509,0,1,37.200000,24.4,3.400000,...,5.200000,14.000000,0.6062,0.014800,0.623600,0.755400,0.36020,0.069400,2.211000,2.4112
7311,CAR,2023,2023030224,NYR,20240511,1,1,42.200000,23.2,3.200000,...,5.200000,12.000000,0.9514,0.076400,0.718200,0.840400,0.35220,0.069800,2.368000,2.2708
7312,CAR,2023,2023030225,NYR,20240513,1,0,39.600000,22.2,3.600000,...,7.000000,7.800000,0.7176,0.114600,0.620200,0.811200,0.35940,0.069800,2.403600,2.0404


### Exploratory Analysis

In [33]:
# Variance inflation factor (VIF) for every remaining column, computed on the
# matrix with an added constant column.
non_feature_columns = ["Team", "Season", "gameId", "OpposingTeam", "gameDate", "result"]
features = gamelog_avg_df.drop(columns=non_feature_columns)
X = add_constant(features)

# One VIF per column of X, in column order.
design_matrix = X.values
vif_values = []
for column_position in range(X.shape[1]):
    vif_values.append(variance_inflation_factor(design_matrix, column_position))

vif_df = pd.DataFrame({"Feature": X.columns, "VIF": vif_values})

# Largest VIF first.
vif_df = vif_df.sort_values(by="VIF", ascending=False)

print(vif_df)

                                    Feature        VIF
0                                     const  64.551347
2               shotsOnGoalFor_last_5_games   4.232934
10          shotsOnGoalAgainst_last_5_games   4.180189
5            penaltyMinutesFor_last_5_games   3.590475
13       penaltyMinutesAgainst_last_5_games   3.539186
25     fiveOnFivexGoalsAgainst_last_5_games   2.894380
24         fiveOnFivexGoalsFor_last_5_games   2.796693
17            giveawaysAgainst_last_5_games   2.787074
9                 giveawaysFor_last_5_games   2.774549
6               faceOffsWonFor_last_5_games   2.573582
14          faceOffsWonAgainst_last_5_games   2.343221
15                 hitsAgainst_last_5_games   1.870044
3       blockedShotAttemptsFor_last_5_games   1.867396
7                      hitsFor_last_5_games   1.752700
16            takeawaysAgainst_last_5_games   1.742636
8                 takeawaysFor_last_5_games   1.734410
12                goalsAgainst_last_5_games   1.646415
4         

Based on the VIFs computed above, there are no features which have concerning levels of multicollinearity with other features.

In [37]:
# Remove the identifier columns and `home_away` from gamelog_avg_df; every other
# column is kept. Note that `result` is not in the drop list.
excluded_columns = ["Team", "Season", "gameId", "OpposingTeam", "gameDate", "home_away"]
numerical_feat = gamelog_avg_df.drop(columns=excluded_columns)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
# NOTE(review): this splits rolling_avg_df (every rolling-average column), not
# the pruned gamelog_avg_df built earlier - confirm that is intended.
train_df, test_df = train_test_split(
    rolling_avg_df,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Display the training rows ordered by their index labels. The sorted copy is
# only shown, not assigned, so train_df itself keeps its current order.
train_df.sort_index(axis=0, ascending=True)