In [42]:
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams["font.size"] = 16

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
    RandomizedSearchCV,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

### Data Cleaning and Preprocessing

In [43]:
# Read the raw game logs from disk and report (rows, columns) as the cell output.
gamelog_csv_path = "data/gamelogs.csv"
gamelog_df = pd.read_csv(gamelog_csv_path)
gamelog_df.shape

(7314, 67)

We first drop `goalsFor` and `goalsAgainst`, since they are essentially what we are predicting. Furthermore, some of the features are redundant and will be perfectly collinear with one another, since a few features are combinations of context-specific features (e.g. `xGoalsFor` is a combination of power-play, short-handed, other, and 5v5 xGoals). Thus, we will drop `lowDangerxGoalsFor`, `mediumDangerxGoalsFor`, `highDangerxGoalsFor`, `lowDangerxGoalsAgainst`, `mediumDangerxGoalsAgainst`, `highDangerxGoalsAgainst`, `lowDangerGoalsFor`, `mediumDangerGoalsFor`, `highDangerGoalsFor`, `lowDangerGoalsAgainst`, `mediumDangerGoalsAgainst`, `highDangerGoalsAgainst`, `lowDangerShotsFor`, `mediumDangerShotsFor`, `highDangerShotsFor`, `lowDangerShotsAgainst`, `mediumDangerShotsAgainst`, and `highDangerShotsAgainst`: expected goals already accounts for shot quality, so shot-quality-specific data would be redundant. Moreover, we also drop `savedShotsOnGoalFor` and `savedShotsOnGoalAgainst`, since they are dependent on shots on goal for/against. We also remove all Corsi and Fenwick percentage columns, as they are a combination of shots on goal, blocked shots, and other attempted shots; however, blocked and missed shot attempts likely do not have strong predictive power for whether a team wins or loses, so we will focus on shots on goal and blocked shots specifically.

In [54]:
# Columns excluded from the feature frame, grouped by kind so the drop list is
# easier to audit against the explanation in the markdown cell.
context_cols = ["home_or_away", "situation"]

possession_pct_cols = [
    "fenwickPercentage",
    "corsiPercentage",
    "fiveOnfiveFenwickPercentage",
    "fiveOnfiveCorsiPercentage",
]

actual_goal_cols = ["goalsFor", "goalsAgainst"]

danger_split_cols = [
    # expected goals by danger level
    "lowDangerxGoalsFor",
    "mediumDangerxGoalsFor",
    "highDangerxGoalsFor",
    "lowDangerxGoalsAgainst",
    "mediumDangerxGoalsAgainst",
    "highDangerxGoalsAgainst",
    # actual goals by danger level
    "lowDangerGoalsFor",
    "mediumDangerGoalsFor",
    "highDangerGoalsFor",
    "lowDangerGoalsAgainst",
    "mediumDangerGoalsAgainst",
    "highDangerGoalsAgainst",
    # shots by danger level
    "lowDangerShotsFor",
    "mediumDangerShotsFor",
    "highDangerShotsFor",
    "lowDangerShotsAgainst",
    "mediumDangerShotsAgainst",
    "highDangerShotsAgainst",
]

saved_shot_cols = ["savedShotsOnGoalFor", "savedShotsOnGoalAgainst"]

gamelog_drop_cols = (
    context_cols
    + possession_pct_cols
    + actual_goal_cols
    + danger_split_cols
    + saved_shot_cols
)

# drop() returns a new frame; gamelog_df itself is left untouched.
gamelog_features = gamelog_df.drop(columns=gamelog_drop_cols)

gamelog_features

Unnamed: 0,Team,Season,gameId,OpposingTeam,gameDate,xGoalsFor,shotsOnGoalFor,blockedShotAttemptsFor,penaltyMinutesFor,faceOffsWonFor,...,last_10_avg_xgoals_for,last_10_avg_xgoals_against,result,home_away,last_10_avg_goals_for,last_10_avg_goals_against,opp_last_10_goals_for,opp_last_10_goals_against,opp_last_10_xgoals_against,opp_last_10_xgoals_for
0,NYR,2022,2022020003,TBL,20221011,5.589,39,22,12,30,...,,,1,1,,,,,,
1,NYR,2022,2022020017,MIN,20221013,3.165,35,13,17,29,...,5.58900,2.754000,1,0,3.000000,1.000000,,,,
2,NYR,2022,2022020023,WPG,20221014,2.814,41,13,6,28,...,4.37700,3.605500,0,0,5.000000,2.000000,,,,
3,NYR,2022,2022020039,ANA,20221017,3.741,43,12,2,30,...,3.85600,3.810667,1,1,3.666667,2.666667,3.0,5.5,4.4490,2.3725
4,NYR,2022,2022020064,SJS,20221020,2.970,23,11,6,24,...,3.82725,3.265000,0,1,4.250000,3.000000,1.6,3.8,3.1180,2.5904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7309,CAR,2023,2023030222,NYR,20240507,4.333,57,29,16,50,...,3.83100,2.475300,0,0,3.800000,2.700000,3.3,2.2,2.8051,3.7086
7310,CAR,2023,2023030223,NYR,20240509,4.027,47,17,14,39,...,3.97880,2.813800,0,1,3.700000,3.000000,3.2,2.3,3.0097,3.7584
7311,CAR,2023,2023030224,NYR,20240511,3.661,31,18,4,23,...,3.96580,2.848500,1,1,3.400000,3.100000,3.3,2.1,3.0895,3.6717
7312,CAR,2023,2023030225,NYR,20240513,3.172,28,25,6,17,...,3.80200,2.943400,1,0,3.400000,3.200000,3.5,2.1,3.2085,3.5165


In [None]:
# Build `rolling_avg_df`: pass-through identifier/categorical columns from
# `gamelog_df` plus, for every stat column, the mean of that stat over the
# team's previous games in the same season.

# Number of prior games averaged for each row.
rolling_window = 5

# Columns copied through unchanged. Selected by position, so this assumes the
# column order of the loaded CSV -- TODO confirm if the file layout changes.
features = gamelog_df.columns[0:5].tolist() + gamelog_df.columns[59:61].tolist()

# Columns replaced by their rolling average (also selected by position).
columns_to_avg = gamelog_df.columns[7:57].tolist()

team_season_games = gamelog_df.groupby(["Team", "Season"], group_keys=False)

# shift(1) drops the current game from its own window, so each row only
# averages games that come before it within the same team and season;
# min_periods=1 yields a value as soon as one earlier game exists.
rolling_means = team_season_games[columns_to_avg].transform(
    lambda stat: stat.shift(1).rolling(window=rolling_window, min_periods=1).mean()
)

# The first game of each team-season has no earlier games, so its averages are
# NaN; set them to 0. The mask is computed once and applied to the averaged
# columns themselves (the previous version wrote to f'{column} ' -- with a
# trailing space -- which created stray columns and left the NaNs in place).
first_game_mask = team_season_games.cumcount() == 0
rolling_means.loc[first_game_mask, columns_to_avg] = 0

# Assemble the result in one concat instead of inserting columns one at a time.
rolling_avg_df = pd.concat([gamelog_df[features], rolling_means], axis=1)

There are some null values in the data frame; however, they arise because no earlier games exist yet in that season, which leaves the corresponding rolling averages null. Thus, we fill all of those missing values with 0: there were no prior games that season, and using rolling averages from the previous season does not make sense because teams have different rosters each season.

In [55]:
# Replace the NaNs (rolling averages with no prior games) with 0.
# The result is rebound to the name instead of using inplace=True, so the frame
# is not mutated behind the back of any other reference to it.
gamelog_features = gamelog_features.fillna(0)

gamelog_features

Unnamed: 0,Team,Season,gameId,OpposingTeam,gameDate,xGoalsFor,shotsOnGoalFor,blockedShotAttemptsFor,penaltyMinutesFor,faceOffsWonFor,...,last_10_avg_xgoals_for,last_10_avg_xgoals_against,result,home_away,last_10_avg_goals_for,last_10_avg_goals_against,opp_last_10_goals_for,opp_last_10_goals_against,opp_last_10_xgoals_against,opp_last_10_xgoals_for
0,NYR,2022,2022020003,TBL,20221011,5.589,39,22,12,30,...,0.00000,0.000000,1,1,0.000000,0.000000,0.0,0.0,0.0000,0.0000
1,NYR,2022,2022020017,MIN,20221013,3.165,35,13,17,29,...,5.58900,2.754000,1,0,3.000000,1.000000,0.0,0.0,0.0000,0.0000
2,NYR,2022,2022020023,WPG,20221014,2.814,41,13,6,28,...,4.37700,3.605500,0,0,5.000000,2.000000,0.0,0.0,0.0000,0.0000
3,NYR,2022,2022020039,ANA,20221017,3.741,43,12,2,30,...,3.85600,3.810667,1,1,3.666667,2.666667,3.0,5.5,4.4490,2.3725
4,NYR,2022,2022020064,SJS,20221020,2.970,23,11,6,24,...,3.82725,3.265000,0,1,4.250000,3.000000,1.6,3.8,3.1180,2.5904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7309,CAR,2023,2023030222,NYR,20240507,4.333,57,29,16,50,...,3.83100,2.475300,0,0,3.800000,2.700000,3.3,2.2,2.8051,3.7086
7310,CAR,2023,2023030223,NYR,20240509,4.027,47,17,14,39,...,3.97880,2.813800,0,1,3.700000,3.000000,3.2,2.3,3.0097,3.7584
7311,CAR,2023,2023030224,NYR,20240511,3.661,31,18,4,23,...,3.96580,2.848500,1,1,3.400000,3.100000,3.3,2.1,3.0895,3.6717
7312,CAR,2023,2023030225,NYR,20240513,3.172,28,25,6,17,...,3.80200,2.943400,1,0,3.400000,3.200000,3.5,2.1,3.2085,3.5165


### Exploratory Analysis

In [56]:
# Variance inflation factor for every candidate predictor.
# Identifier columns and the target (`result`) are excluded first.
non_predictor_cols = ["Team", "Season", "gameId", "OpposingTeam", "gameDate", "result"]
variables = gamelog_features.drop(columns=non_predictor_cols)

# add_constant supplies the `const` column that appears in the VIF table.
X = add_constant(variables)

design_matrix = X.values
vif_df = pd.DataFrame(
    {
        "Feature": X.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
).sort_values(by="VIF", ascending=False)  # highest VIF first

print(vif_df)

                       Feature           VIF
1                    xGoalsFor  3.005272e+06
9                xGoalsAgainst  3.005272e+06
24     fiveOnFivexGoalsAgainst  1.438085e+06
23         fiveOnFivexGoalsFor  1.438085e+06
17          powerPlayxGoalsFor  8.489691e+05
19    shortHandedxGoalsAgainst  8.489691e+05
21          otherxGoalsAgainst  7.278115e+05
20              otherxGoalsFor  7.278115e+05
22        shortHandedxGoalsFor  4.656233e+04
18      powerPlayxGoalsAgainst  4.656233e+04
0                        const  1.715191e+02
4            penaltyMinutesFor  2.778489e+00
12       penaltyMinutesAgainst  2.778489e+00
2               shotsOnGoalFor  2.002197e+00
10          shotsOnGoalAgainst  2.002197e+00
25      last_10_avg_xgoals_for  1.866031e+00
33      opp_last_10_xgoals_for  1.866031e+00
32  opp_last_10_xgoals_against  1.832617e+00
26  last_10_avg_xgoals_against  1.832617e+00
28       last_10_avg_goals_for  1.652683e+00
30       opp_last_10_goals_for  1.652683e+00
29   last_

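Based on the VIFs computed above, the expected-goals features (`xGoalsFor`, `xGoalsAgainst`, and their 5v5, power-play, short-handed, and other components) have extremely large VIFs (roughly $10^4$ to $10^6$), which indicates near-perfect multicollinearity among them, as expected given that the overall xGoals columns are sums of the situational ones. The remaining features shown have VIFs below 3 and do not exhibit concerning levels of multicollinearity with other features.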

In [37]:
# Keep the numeric columns of the rolling-average frame by dropping the
# identifier columns and the home/away flag.
# NOTE(review): this cell previously read from `gamelog_avg_df`, which is not
# defined in the visible notebook (it fails on Restart & Run All). It now reads
# from `rolling_avg_df`, the frame the train/test split uses -- confirm this is
# the intended source.
numerical_feat = rolling_avg_df.drop(
    columns=["Team", "Season", "gameId", "OpposingTeam", "gameDate", "home_away"]
)

In [None]:
# Hold out 20% of the rolling-average rows as the test set; the fixed
# random_state makes the split repeatable across runs.
train_df, test_df = train_test_split(
    rolling_avg_df,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Show the training rows ordered by their original row index
# (sort_index returns a new frame; train_df itself is unchanged).
train_df.sort_index(axis=0, ascending=True)