In [2]:
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams["font.size"] = 16

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
    RandomizedSearchCV,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

### Data Cleaning and Preprocessing

In [3]:
# Read the raw game logs from disk and report the frame's (rows, columns).
GAMELOG_CSV = "data/gamelogs.csv"
gamelog_df = pd.read_csv(GAMELOG_CSV)
gamelog_df.shape

(7314, 67)

We first drop `goalsFor` and `goalsAgainst`, since they are essentially what we are predicting. Furthermore, some of the features are redundant and will be perfectly collinear with one another, since a few features are a combination of context-specific features (e.g. `xGoalsFor` is a combination of power-play, short-handed, other, and 5v5 xGoals). Thus, we will drop `lowDangerxGoalsFor`, `mediumDangerxGoalsFor`, `highDangerxGoalsFor`, `lowDangerxGoalsAgainst`, `mediumDangerxGoalsAgainst`, `highDangerxGoalsAgainst`, `lowDangerGoalsFor`, `mediumDangerGoalsFor`, `highDangerGoalsFor`, `lowDangerGoalsAgainst`, `mediumDangerGoalsAgainst`, `highDangerGoalsAgainst`, `lowDangerShotsFor`, `mediumDangerShotsFor`, `highDangerShotsFor`, `lowDangerShotsAgainst`, `mediumDangerShotsAgainst`, and `highDangerShotsAgainst`, since expected goals already account for shot quality, so shot-quality-specific data would be redundant. Moreover, we also drop `savedShotsOnGoalFor` and `savedShotsOnGoalAgainst`, since they are dependent on shots on goal for/against. We also remove all Corsi and Fenwick percentage columns, as they are a combination of shots on goal, blocked shots, and other attempted shots; however, blocked and missed shot attempts likely do not have strong predictive power for whether a team wins or loses, so we will focus on shots on goal and blocked shots specifically.

In [4]:
# Columns excluded from the feature set, grouped by what they describe.
columns_to_drop = [
    # Context columns; the frame keeps the separate home_away column.
    "home_or_away", "situation",
    # Corsi / Fenwick percentages (all situations and five-on-five).
    "fenwickPercentage", "corsiPercentage",
    "fiveOnfiveFenwickPercentage", "fiveOnfiveCorsiPercentage",
    # Aggregate expected goals and their rolling averages.
    "xGoalsFor", "xGoalsAgainst",
    "last_10_avg_xgoals_for", "last_10_avg_xgoals_against",
    # Goals scored / conceded in the game itself.
    "goalsFor", "goalsAgainst",
    # Danger-level breakdowns of expected goals.
    "lowDangerxGoalsFor", "mediumDangerxGoalsFor", "highDangerxGoalsFor",
    "lowDangerxGoalsAgainst", "mediumDangerxGoalsAgainst", "highDangerxGoalsAgainst",
    # Danger-level breakdowns of goals.
    "lowDangerGoalsFor", "mediumDangerGoalsFor", "highDangerGoalsFor",
    "lowDangerGoalsAgainst", "mediumDangerGoalsAgainst", "highDangerGoalsAgainst",
    # Danger-level breakdowns of shots.
    "lowDangerShotsFor", "mediumDangerShotsFor", "highDangerShotsFor",
    "lowDangerShotsAgainst", "mediumDangerShotsAgainst", "highDangerShotsAgainst",
    # Saved shots on goal.
    "savedShotsOnGoalFor", "savedShotsOnGoalAgainst",
]

gamelog_features = gamelog_df.drop(columns=columns_to_drop)

gamelog_features

Unnamed: 0,Team,Season,gameId,OpposingTeam,gameDate,shotsOnGoalFor,blockedShotAttemptsFor,penaltyMinutesFor,faceOffsWonFor,hitsFor,...,fiveOnFivexGoalsFor,fiveOnFivexGoalsAgainst,result,home_away,last_10_avg_goals_for,last_10_avg_goals_against,opp_last_10_goals_for,opp_last_10_goals_against,opp_last_10_xgoals_against,opp_last_10_xgoals_for
0,NYR,2022,2022020003,TBL,20221011,39,22,12,30,24,...,2.112,1.279,1,1,,,,,,
1,NYR,2022,2022020017,MIN,20221013,35,13,17,29,27,...,1.653,1.827,1,0,3.000000,1.000000,,,,
2,NYR,2022,2022020023,WPG,20221014,41,13,6,28,25,...,2.257,2.660,0,0,5.000000,2.000000,,,,
3,NYR,2022,2022020039,ANA,20221017,43,12,2,30,27,...,2.600,1.596,1,1,3.666667,2.666667,3.0,5.5,4.4490,2.3725
4,NYR,2022,2022020064,SJS,20221020,23,11,6,24,33,...,2.120,1.668,0,1,4.250000,3.000000,1.6,3.8,3.1180,2.5904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7309,CAR,2023,2023030222,NYR,20240507,57,29,16,50,29,...,3.463,3.196,0,0,3.800000,2.700000,3.3,2.2,2.8051,3.7086
7310,CAR,2023,2023030223,NYR,20240509,47,17,14,39,31,...,1.780,1.507,0,1,3.700000,3.000000,3.2,2.3,3.0097,3.7584
7311,CAR,2023,2023030224,NYR,20240511,31,18,4,23,28,...,3.370,2.290,1,1,3.400000,3.100000,3.3,2.1,3.0895,3.6717
7312,CAR,2023,2023030225,NYR,20240513,28,25,6,17,23,...,2.419,1.627,1,0,3.400000,3.200000,3.5,2.1,3.2085,3.5165


In [5]:
# Peek at the names of the columns sitting at positions 27 through 34 of the trimmed frame.
features = list(gamelog_features.columns[27:35])

features

['result',
 'home_away',
 'last_10_avg_goals_for',
 'last_10_avg_goals_against',
 'opp_last_10_goals_for',
 'opp_last_10_goals_against',
 'opp_last_10_xgoals_against',
 'opp_last_10_xgoals_for']

In [6]:
# Build the modelling frame: identifier and already-prepared columns are carried over unchanged,
# and every remaining per-game stat is replaced by its mean over the team's previous games
# (up to ROLLING_WINDOW of them) within the same season.
ROLLING_WINDOW = 10  # games per window; matches the "last_10_avg_" prefix used below

# Columns are selected by name rather than by position so the cell keeps working
# if the list of dropped columns in the earlier cell changes.
id_columns = ["Team", "Season", "gameId", "OpposingTeam", "gameDate"]
prepared_columns = [
    "result",
    "home_away",
    "last_10_avg_goals_for",
    "last_10_avg_goals_against",
    "opp_last_10_goals_for",
    "opp_last_10_goals_against",
    "opp_last_10_xgoals_against",
    "opp_last_10_xgoals_for",
]
columns = id_columns + prepared_columns

# Every other column is a per-game stat that must be turned into a rolling average.
columns_to_avg = [column for column in gamelog_features.columns if column not in columns]

# Sort chronologically first so the window always covers *earlier* games, whatever the row
# order of the CSV. shift(1) excludes the current game from its own average, and
# min_periods=1 yields a value as soon as one earlier game exists (NaN for a season opener).
rolling_means = (
    gamelog_features
    .sort_values(["gameDate", "gameId"], kind="stable")
    .groupby(["Team", "Season"])[columns_to_avg]
    .transform(lambda stat: stat.shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean())
    .add_prefix("last_10_avg_")
)

# join aligns on the index, so rows keep the original order of gamelog_features.
rolling_avg_df = gamelog_features[columns].join(rolling_means)

rolling_avg_df

Unnamed: 0,Team,Season,gameId,OpposingTeam,gameDate,result,home_away,last_10_avg_goals_for,last_10_avg_goals_against,opp_last_10_goals_for,...,last_10_avg_takeawaysAgainst,last_10_avg_giveawaysAgainst,last_10_avg_powerPlayxGoalsFor,last_10_avg_powerPlayxGoalsAgainst,last_10_avg_shortHandedxGoalsAgainst,last_10_avg_otherxGoalsFor,last_10_avg_otherxGoalsAgainst,last_10_avg_shortHandedxGoalsFor,last_10_avg_fiveOnFivexGoalsFor,last_10_avg_fiveOnFivexGoalsAgainst
0,NYR,2022,2022020003,TBL,20221011,1,1,,,,...,,,,,,,,,,
1,NYR,2022,2022020017,MIN,20221013,1,0,3.000000,1.000000,,...,8.000000,15.000000,2.2300,0.000000,0.088000,0.875000,1.38700,0.372000,2.112000,1.2790
2,NYR,2022,2022020023,WPG,20221014,0,0,5.000000,2.000000,,...,8.500000,10.500000,1.5580,0.074000,0.490000,0.742500,1.48850,0.194000,1.882500,1.5530
3,NYR,2022,2022020039,ANA,20221017,1,1,3.666667,2.666667,3.0,...,9.666667,12.666667,1.0820,0.049333,0.486333,0.617333,1.35300,0.149333,2.007333,1.9220
4,NYR,2022,2022020064,SJS,20221020,0,1,4.250000,3.000000,1.6,...,8.250000,12.000000,1.0880,0.039500,0.370500,0.463000,1.01475,0.120500,2.155500,1.8405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7309,CAR,2023,2023030222,NYR,20240507,0,0,3.800000,2.700000,3.3,...,6.500000,11.300000,0.7531,0.025300,0.371400,0.789800,0.27510,0.057800,2.230400,1.8038
7310,CAR,2023,2023030223,NYR,20240509,0,1,3.700000,3.000000,3.2,...,6.400000,10.200000,0.7762,0.025300,0.496800,0.807300,0.31420,0.060800,2.334800,1.9777
7311,CAR,2023,2023030224,NYR,20240511,1,1,3.400000,3.100000,3.3,...,5.900000,10.600000,0.9437,0.056100,0.544000,0.675200,0.30740,0.062000,2.285200,1.9411
7312,CAR,2023,2023030225,NYR,20240513,1,0,3.400000,3.200000,3.5,...,6.300000,9.000000,0.8008,0.076300,0.485000,0.590100,0.31100,0.043300,2.368100,2.0712


There are some null values in the data frame; however, they arise because no previous games exist yet in that season, which leaves the corresponding rolling averages null. Thus, we will fill all those missing values with 0, since there were no prior games that season and using rolling averages from the previous season does not make sense because teams have different rosters each season.

In [28]:
# Replace the missing values with 0. The result is rebound to the same name instead of
# using inplace=True, so the frame object displayed by the previous cell is not mutated
# behind the reader's back.
rolling_avg_df = rolling_avg_df.fillna(0)

rolling_avg_df

Unnamed: 0,Team,Season,gameId,OpposingTeam,gameDate,result,home_away,last_10_avg_goals_for,last_10_avg_goals_against,opp_last_10_goals_for,...,last_10_avg_takeawaysAgainst,last_10_avg_giveawaysAgainst,last_10_avg_powerPlayxGoalsFor,last_10_avg_powerPlayxGoalsAgainst,last_10_avg_shortHandedxGoalsAgainst,last_10_avg_otherxGoalsFor,last_10_avg_otherxGoalsAgainst,last_10_avg_shortHandedxGoalsFor,last_10_avg_fiveOnFivexGoalsFor,last_10_avg_fiveOnFivexGoalsAgainst
0,NYR,2022,2022020003,TBL,20221011,1,1,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0000
1,NYR,2022,2022020017,MIN,20221013,1,0,3.000000,1.000000,0.0,...,8.000000,15.000000,2.2300,0.000000,0.088000,0.875000,1.38700,0.372000,2.112000,1.2790
2,NYR,2022,2022020023,WPG,20221014,0,0,5.000000,2.000000,0.0,...,8.500000,10.500000,1.5580,0.074000,0.490000,0.742500,1.48850,0.194000,1.882500,1.5530
3,NYR,2022,2022020039,ANA,20221017,1,1,3.666667,2.666667,3.0,...,9.666667,12.666667,1.0820,0.049333,0.486333,0.617333,1.35300,0.149333,2.007333,1.9220
4,NYR,2022,2022020064,SJS,20221020,0,1,4.250000,3.000000,1.6,...,8.250000,12.000000,1.0880,0.039500,0.370500,0.463000,1.01475,0.120500,2.155500,1.8405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7309,CAR,2023,2023030222,NYR,20240507,0,0,3.800000,2.700000,3.3,...,6.500000,11.300000,0.7531,0.025300,0.371400,0.789800,0.27510,0.057800,2.230400,1.8038
7310,CAR,2023,2023030223,NYR,20240509,0,1,3.700000,3.000000,3.2,...,6.400000,10.200000,0.7762,0.025300,0.496800,0.807300,0.31420,0.060800,2.334800,1.9777
7311,CAR,2023,2023030224,NYR,20240511,1,1,3.400000,3.100000,3.3,...,5.900000,10.600000,0.9437,0.056100,0.544000,0.675200,0.30740,0.062000,2.285200,1.9411
7312,CAR,2023,2023030225,NYR,20240513,1,0,3.400000,3.200000,3.5,...,6.300000,9.000000,0.8008,0.076300,0.485000,0.590100,0.31100,0.043300,2.368100,2.0712


### Exploratory Analysis

In [8]:
# Collinearity check: identifiers and the target are removed, an intercept column is added,
# and a variance inflation factor is computed for every column of the resulting design matrix.
features = rolling_avg_df.drop(columns=["Team", "Season", "gameId", "OpposingTeam", "gameDate","result"])
X = add_constant(features)

design_matrix = X.values
vif_scores = [
    variance_inflation_factor(design_matrix, column_position)
    for column_position in range(len(X.columns))
]

# One row per design-matrix column, highest VIF first.
vif_df = pd.DataFrame({"Feature": X.columns, "VIF": vif_scores}).sort_values(by="VIF", ascending=False)

print(vif_df)

                                   Feature        VIF
0                                    const  75.906931
8               last_10_avg_shotsOnGoalFor   5.823555
15          last_10_avg_shotsOnGoalAgainst   5.697130
17       last_10_avg_penaltyMinutesAgainst   4.226468
10           last_10_avg_penaltyMinutesFor   4.151741
21            last_10_avg_giveawaysAgainst   4.045575
14                last_10_avg_giveawaysFor   3.968646
29     last_10_avg_fiveOnFivexGoalsAgainst   3.749685
28         last_10_avg_fiveOnFivexGoalsFor   3.552351
11              last_10_avg_faceOffsWonFor   3.408885
18          last_10_avg_faceOffsWonAgainst   2.948344
9       last_10_avg_blockedShotAttemptsFor   2.305547
20            last_10_avg_takeawaysAgainst   2.136201
13                last_10_avg_takeawaysFor   2.117899
19                 last_10_avg_hitsAgainst   2.075747
2                    last_10_avg_goals_for   1.936196
3                last_10_avg_goals_against   1.845998
12                     last_

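Based on the VIFs computed above, no feature shows a concerning level of multicollinearity with the other features: apart from the intercept (`const`), the largest VIF is about 5.8, which is below the commonly used cutoff of 10.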

### Training The Model

We will first split the data into training and test sets.

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the shuffle repeatable.
# NOTE(review): the split is random over rows, not chronological, and the two rows that appear
# to describe the same gameId (one per team) can land on different sides -- confirm this is intended.
train_df, test_df = train_test_split(
    rolling_avg_df,
    test_size=0.2,
    random_state=123,
)

In [None]:
# The target plus the identifier columns are stripped from both predictor matrices.
non_predictor_columns = ['result', "Team", "Season", "gameId","OpposingTeam", "gameDate"]

X_train, y_train = train_df.drop(columns=non_predictor_columns), train_df['result']
X_test, y_test = test_df.drop(columns=non_predictor_columns), test_df['result']

In [None]:
# numpy (np) and pandas (pd) are already imported in the first cell; only statsmodels.api is new here.
import statsmodels.api as sm

# Box-Tidwell style linearity check: add an x * log(x + 1) term for every continuous predictor,
# fit a logistic regression on the training rows, and inspect the p-values of the added terms
# in the printed summary.
X = train_df.drop(columns=['result', "Team", "Season", "gameId","OpposingTeam", "gameDate"])
y = train_df['result']  # Binary target

# Only genuinely continuous predictors get an interaction term. For a 0/1 indicator such as
# home_away, x * log(x + 1) equals log(2) * x -- an exact multiple of the column itself --
# which would make the design matrix singular and break the Logit fit.
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
continuous_columns = [col for col in numeric_columns if X[col].nunique() > 2]

# The + 1 inside the log keeps the term defined for zero values.
# All interaction columns are built in one step instead of being inserted one at a time.
log_interactions = pd.DataFrame(
    {f'{col}_log_interact': X[col] * np.log(X[col] + 1) for col in continuous_columns},
    index=X.index,
)

X = sm.add_constant(pd.concat([X, log_interactions], axis=1))
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())

Based on the results of the Box-Tidwell test, no feature violates the linearity assumption, as none of the coefficients for the interactions between the features and their log-transformed counterparts is statistically significant (all p-values are greater than 0.05).

Now 

In [None]:
# Display the training rows ordered by index label; the sorted frame is not assigned,
# so train_df itself is left unchanged.
train_df.sort_index()