In [57]:
DROP_COLS = [
    # Identifiers (not predictive)
    'game_id', 'date', 'season', 'home_team', 'away_team', 
    'home_team_abbrev', 'away_team_abbrev', 'matchup',
    
    # Target variable
    'home_win',
    
    # Game outcome stats (only known AFTER the game - data leakage)
    'home_gf', 'away_gf', 'home_ga', 'away_ga', 'home_sog', 'away_sog',
    'home_faceoffwin_pct', 'away_faceoffwin_pct',
    'home_powerplays', 'away_powerplays', 'home_powerplay_pct', 'away_powerplay_pct',
    'home_pk', 'away_pk', 'home_pk_pct', 'away_pk_pct',
    'home_pims', 'away_pims', 'home_hits', 'away_hits',
    'home_blockedshots', 'away_blockedshots', 'home_takeaways', 'away_takeaways',
    'home_giveaways', 'away_giveaways',
    
    # Goalie stats from THIS game
    'home_save_pct', 'away_save_pct',
    'home_goalie_save_pct', 'away_goalie_save_pct',
    'home_goalie_ga', 'away_goalie_ga',
    'home_goalie_saves', 'away_goalie_saves',
    'home_goalie_evenStrengthShotsAgainst', 'away_goalie_evenStrengthShotsAgainst',
    'home_goalie_powerPlayShotsAgainst', 'away_goalie_powerPlayShotsAgainst',
    'home_goalie_shorthandedShotsAgainst', 'away_goalie_shorthandedShotsAgainst',
    'home_goalie_evenStrengthGoalsAgainst', 'away_goalie_evenStrengthGoalsAgainst',
    'home_goalie_powerPlayGoalsAgainst', 'away_goalie_powerPlayGoalsAgainst',
]


In [58]:
HOME_TEAM_L5_COLS = [
    'home_gf_per_game_l5', 'home_ga_per_game_l5', 'home_sog_per_game_l5',
    'home_wins_l5', 'home_win_pct_l5', 'home_powerplay_pct_l5',
    'home_penalty_kill_pct_l5', 'home_powerplay_opps_l5', 'home_pk_opps_l5',
    'home_faceoffwin_pct_l5', 'home_pims_l5', 'home_hits_l5',
    'home_blockedshots_l5', 'home_giveaways_l5', 'home_takeaways_l5',
]

AWAY_TEAM_L5_COLS = [
    'away_gf_per_game_l5', 'away_ga_per_game_l5', 'away_sog_per_game_l5',
    'away_wins_l5', 'away_win_pct_l5', 'away_powerplay_pct_l5',
    'away_penalty_kill_pct_l5', 'away_powerplay_opps_l5', 'away_pk_opps_l5',
    'away_faceoffwin_pct_l5', 'away_pims_l5', 'away_hits_l5',
    'away_blockedshots_l5', 'away_giveaways_l5', 'away_takeaways_l5',
]

GOALIE_L5_COLS = [
    'home_goalie_save_pct_l5', 'home_goalie_ga_l5', 'home_goalie_saves_l5',
    'home_goalie_ev_sa_l5', 'home_goalie_pp_sa_l5', 'home_goalie_sh_sa_l5',
    'home_goalie_ev_ga_l5', 'home_goalie_pp_ga_l5', 
    'away_goalie_save_pct_l5', 'away_goalie_ga_l5', 'away_goalie_saves_l5',
    'away_goalie_ev_sa_l5', 'away_goalie_pp_sa_l5', 'away_goalie_sh_sa_l5',
    'away_goalie_ev_ga_l5', 'away_goalie_pp_ga_l5',
]

TEAM_GOALIE_PERFORMANCE = [
    'home_team_save_pct_l5', 'away_team_save_pct_l5',
]

SEASON_COLS = [
    'home_win_pct_season', 'away_win_pct_season',
    'home_home_win_pct', 'away_away_win_pct',
    'home_gf_per_game_season', 'away_gf_per_game_season',
    'home_pointPctg_season', 'away_pointPctg_season', 'pointPctg_diff',
]

DIFF_COLS = [
    'home_goal_diff_l5', 'home_ga_diff_l5', 'home_shot_diff_l5',
]

STREAKS_AND_REST = [
    'home_win_streak', 'away_win_streak',
    'home_rest_days', 'away_rest_days',
    'home_goalie_rest_days', 'away_goalie_rest_days',
]

HEAD_TO_HEAD = [
    'home_h2h_wins', 'home_h2h_gf', 'away_h2h_wins', 
    'away_h2h_gf', 'home_h2h_wins_diff',
]

FEATURE_COLS = (
    HOME_TEAM_L5_COLS +
    AWAY_TEAM_L5_COLS +
    GOALIE_L5_COLS +
    TEAM_GOALIE_PERFORMANCE +
    SEASON_COLS +
    DIFF_COLS +
    STREAKS_AND_REST +
    HEAD_TO_HEAD
)


In [59]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


In [60]:
df = pd.read_csv("../scripts/generated/data/nhl_data.csv", parse_dates=["date"])
df = df.sort_values("date").reset_index(drop=True)

In [61]:
# Create a cleaned feature list without goalie starter columns
FINAL_FEATURE_COLS = [f for f in FEATURE_COLS if f not in ['home_goalie_starter', 'away_goalie_starter']]

In [62]:
TARGET = "home_win"

X = df[FINAL_FEATURE_COLS]
y = df[TARGET]

In [63]:
numeric_features = FINAL_FEATURE_COLS

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median", add_indicator=True))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ]
)


In [64]:
split_date = "2025-04-17"

train_idx = df["date"] < split_date
test_idx  = df["date"] >= split_date

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]


In [65]:
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    min_samples_leaf=20,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1
)

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", rf)
])


In [66]:
model.fit(X_train, y_train)


In [67]:
# ============================================================
# EVALUATION
# ============================================================

def evaluate_model_comprehensive(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print("Accuracy    :", accuracy_score(y_test, y_pred))
    print("ROC-AUC     :", roc_auc_score(y_test, y_proba))
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred, target_names=["Away Win", "Home Win"]))

    buckets = [
        ("Very High (>70%)", y_proba > 0.70),
        ("High (60–70%)", (y_proba > 0.60) & (y_proba <= 0.70)),
        ("Medium (55–60%)", (y_proba > 0.55) & (y_proba <= 0.60)),
        ("Low (<55%)", y_proba <= 0.55),
    ]

    print("\nConfidence Buckets:")
    for label, mask in buckets:
        if mask.sum() > 0:
            print(f"{label:20s}: {accuracy_score(y_test[mask], y_pred[mask]):.3f} ({mask.sum()} games)")
        else:
            print(f"{label:20s}: N/A")

    return y_proba


test_probas = evaluate_model_comprehensive(model, X_test, y_test)


Accuracy    : 0.8407281001137656
ROC-AUC     : 0.9364841456710473

Classification Report:

              precision    recall  f1-score   support

    Away Win       0.87      0.78      0.82       421
    Home Win       0.82      0.90      0.85       458

    accuracy                           0.84       879
   macro avg       0.85      0.84      0.84       879
weighted avg       0.84      0.84      0.84       879


Confidence Buckets:
Very High (>70%)    : 0.989 (263 games)
High (60–70%)       : 0.724 (105 games)
Medium (55–60%)     : 0.543 (70 games)
Low (<55%)          : 0.828 (441 games)


In [68]:
# ============================================================
# FEATURE IMPORTANCE
# ============================================================

def analyze_feature_importance(model, feature_names, top_n=25):
    clf = model.named_steps["classifier"]
    imputer = model.named_steps["preprocessor"].named_transformers_["num"].named_steps["imputer"]

    expanded_features = imputer.get_feature_names_out(feature_names)

    fi = pd.DataFrame({
        "feature": expanded_features,
        "importance": clf.feature_importances_
    }).sort_values("importance", ascending=False)

    print("\nTop Features:")
    for _, row in fi.head(top_n).iterrows():
        print(f"{row.feature:55s} {row.importance:.4f}")

    return fi


fi_df = analyze_feature_importance(model, FINAL_FEATURE_COLS)



Top Features:
away_win_streak                                         0.2390
home_win_streak                                         0.2089
home_home_win_pct                                       0.0576
away_away_win_pct                                       0.0572
pointPctg_diff                                          0.0469
home_win_pct_season                                     0.0339
away_win_pct_season                                     0.0320
home_pointPctg_season                                   0.0265
away_pointPctg_season                                   0.0245
away_gf_per_game_season                                 0.0184
home_gf_per_game_season                                 0.0154
home_goalie_save_pct_l5                                 0.0076
away_blockedshots_l5                                    0.0073
home_win_pct_l5                                         0.0070
away_sog_per_game_l5                                    0.0068
home_team_save_pct_l5                   

In [None]:
df_eval = df.copy()
df_eval["pred_proba"] = model.predict_proba(df_eval[FINAL_FEATURE_COLS])[:, 1]
df_eval["pred_home_win"] = (df_eval["pred_proba"] >= 0.5).astype(int)

last_20 = (
    df_eval
    .sort_values("date")
    .tail(20)
    [["date", "home_team_abbrev", "away_team_abbrev",
      "home_win", "pred_home_win", "pred_proba"]]
)

last_20

Unnamed: 0,date,home_team_abbrev,away_team_abbrev,home_win,pred_home_win,pred_proba
3468,2026-01-29,VAN,ANA,1,0,0.41333
3467,2026-01-29,SEA,TOR,1,1,0.778532
3465,2026-01-29,STL,FLA,1,0,0.451335
3472,2026-01-29,BUF,LAK,1,1,0.848895
3480,2026-01-30,CHI,CBJ,0,0,0.202441
3493,2026-01-31,VGK,SEA,0,0,0.305518
3492,2026-01-31,PHI,LAK,0,1,0.542594
3491,2026-01-31,DET,COL,0,0,0.456066
3490,2026-01-31,PIT,NYR,1,1,0.79124
3489,2026-01-31,CGY,SJS,1,1,0.533841
