In [1]:
DROP_COLS = [
    # Identifiers / labels
    "game_id",
    "date",
    "season",
    "home_team",
    "away_team",
    "home_team_abbrev",
    "away_team_abbrev",
    "matchup",

    # Target + post-game results
    "home_win",
    "home_gf",
    "away_gf",
    "home_ga",
    "away_ga",

    # H2H raw totals (dangerous leakage unless strictly pre-game)
    "home_h2h_gf",
    "away_h2h_gf",
]


In [2]:
TEAM_L5_COLS = [
    "home_gf_per_game_l5", "away_gf_per_game_l5",
    "home_ga_per_game_l5", "away_ga_per_game_l5",
    "home_sog_per_game_l5", "away_sog_per_game_l5",
    "home_win_pct_l5", "away_win_pct_l5",
    "home_powerplay_pct_l5", "away_powerplay_pct_l5",
    "home_penalty_kill_pct_l5", "away_penalty_kill_pct_l5",
    "home_faceoffwin_pct_l5", "away_faceoffwin_pct_l5",
    "home_hits_l5", "away_hits_l5",
    "home_blockedshots_l5", "away_blockedshots_l5",
    "home_giveaways_l5", "away_giveaways_l5",
    "home_takeaways_l5", "away_takeaways_l5",
]

GOALIE_L5_COLS = [
    "home_goalie_save_pct_l5", "away_goalie_save_pct_l5",
    "home_goalie_ga_l5", "away_goalie_ga_l5",
    "home_goalie_saves_l5", "away_goalie_saves_l5",
    "home_goalie_ev_sa_l5", "away_goalie_ev_sa_l5",
    "home_goalie_pp_sa_l5", "away_goalie_pp_sa_l5",
    "home_goalie_sh_sa_l5", "away_goalie_sh_sa_l5",
    "home_goalie_ev_ga_l5", "away_goalie_ev_ga_l5",
    "home_goalie_pp_ga_l5", "away_goalie_pp_ga_l5",
]

SEASON_COLS = [
    "home_win_pct_season", "away_win_pct_season",
    "home_gf_per_game_season", "away_gf_per_game_season",
    "home_pointPctg_season", "away_pointPctg_season",
    "home_home_win_pct", "away_away_win_pct",
]

DIFF_COLS = [
    "pointPctg_diff",
    "home_goal_diff_l5",
    "home_shot_diff_l5",
    "home_h2h_wins_diff",
]

REST_COLS = [
    "home_win_streak", "away_win_streak",
    "home_rest_days", "away_rest_days",
    "home_goalie_rest_days", "away_goalie_rest_days",
]

FEATURE_COLS = (
    TEAM_L5_COLS +
    GOALIE_L5_COLS +
    SEASON_COLS +
    DIFF_COLS +
    REST_COLS
)

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [4]:
df = pd.read_csv("./generated/data/nhl_games_data.csv", parse_dates=["date"])
df = df.sort_values("date").reset_index(drop=True)

In [5]:
TARGET = "home_win"

X = df[FEATURE_COLS]
y = df[TARGET]

In [6]:
numeric_features = FEATURE_COLS

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median", add_indicator=True))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ]
)


In [7]:
split_date = "2025-01-01"

train_idx = df["date"] < split_date
test_idx  = df["date"] >= split_date

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]


In [8]:
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    min_samples_leaf=20,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1
)

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", rf)
])


In [9]:
model.fit(X_train, y_train)


In [10]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))


Accuracy: 0.8339719029374202
ROC AUC: 0.9294057237090056
              precision    recall  f1-score   support

           0       0.84      0.78      0.81       709
           1       0.83      0.88      0.85       857

    accuracy                           0.83      1566
   macro avg       0.84      0.83      0.83      1566
weighted avg       0.83      0.83      0.83      1566



In [11]:
importances = model.named_steps["classifier"].feature_importances_

feature_names = model.named_steps["preprocessor"] \
    .transformers_[0][1] \
    .named_steps["imputer"] \
    .get_feature_names_out(FEATURE_COLS)

fi = pd.Series(importances, index=feature_names)
fi.sort_values(ascending=False).head(20)


away_win_streak            0.237861
home_win_streak            0.195754
home_home_win_pct          0.075252
away_away_win_pct          0.061964
pointPctg_diff             0.053939
away_win_pct_season        0.032486
away_pointPctg_season      0.032229
home_win_pct_season        0.031477
home_pointPctg_season      0.025425
away_gf_per_game_season    0.020937
home_gf_per_game_season    0.017595
away_blockedshots_l5       0.009265
away_sog_per_game_l5       0.008741
away_gf_per_game_l5        0.008063
home_goal_diff_l5          0.007896
home_win_pct_l5            0.007788
home_goalie_save_pct_l5    0.007307
home_shot_diff_l5          0.006976
home_goalie_ev_sa_l5       0.006728
away_faceoffwin_pct_l5     0.006727
dtype: float64

In [12]:
df_eval = df.copy()

df_eval["pred_proba"] = model.predict_proba(df_eval[FEATURE_COLS])[:, 1]
df_eval["pred_home_win"] = (df_eval["pred_proba"] >= 0.5).astype(int)


In [13]:
last_10 = (
    df_eval
    .sort_values("date")
    .tail(20)
    [["date",
      "home_team_abbrev",
      "away_team_abbrev",
      "home_win",
      "pred_home_win",
      "pred_proba"]]
)

last_10


Unnamed: 0,date,home_team_abbrev,away_team_abbrev,home_win,pred_home_win,pred_proba
3457,2026-01-27,STL,DAL,0,0,0.15956
3458,2026-01-27,MIN,CHI,1,1,0.683867
3460,2026-01-28,CBJ,PHI,1,1,0.826463
3461,2026-01-28,OTT,COL,1,1,0.648742
3462,2026-01-28,NYI,NYR,1,1,0.847962
3472,2026-01-29,TBL,WPG,1,1,0.894967
3473,2026-01-29,DET,WSH,0,1,0.560042
3476,2026-01-29,NYR,NYI,0,0,0.163588
3475,2026-01-29,BUF,LAK,1,1,0.843415
3471,2026-01-29,NJD,NSH,1,1,0.579197
