# Win/Loss modeling and tuning

Uses the engineered team-game dataset to train and compare a strong baseline logistic model and a tree ensemble (HistGradientBoosting) with chronological validation to avoid leakage.

In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, brier_score_loss, classification_report
from sklearn.impute import SimpleImputer

# Paths
DATA_PATH = Path('../data/team_game_dataset.csv')
assert DATA_PATH.exists(), 'Run the EDA notebook to produce data/team_game_dataset.csv first.'

# Load and sort to keep chronology
raw = pd.read_csv(DATA_PATH)
raw['Data'] = pd.to_datetime(raw['Data'])
raw = raw.sort_values('Data').reset_index(drop=True)

# Target and feature split
y = raw['win']
X = raw.drop(columns=['win', 'Data'])

# Identify categorical and numeric columns
cat_cols = ['Tm', 'Opp']
num_cols = [c for c in X.columns if c not in cat_cols]

# Train/test chronological split (last 20% for test)
split_idx = int(len(raw) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]


## Preprocess and models
- ColumnTransformer: one-hot encode team IDs, impute missing, scale numerics.
- Logistic Regression (L2, balanced class weights).
- HistGradientBoosting (tree ensemble good for tabular).

In [15]:
# Preprocess
categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocess = ColumnTransformer([
    ('cat', categorical, cat_cols),
    ('num', numeric, num_cols)
])

# Models
log_reg = LogisticRegression(max_iter=200, class_weight='balanced')
hgb = HistGradientBoostingClassifier(max_depth=6, learning_rate=0.08, max_iter=300)

pipelines = {
    'log_reg': Pipeline([('prep', preprocess), ('clf', log_reg)]),
    'hgb': Pipeline([('prep', preprocess), ('clf', hgb)]),
}


## Train and evaluate
Metrics: accuracy, ROC-AUC, Brier score. Also view classification report for the best model.

In [16]:
results = {}
for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    proba = pipe.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)
    results[name] = {
        'accuracy': accuracy_score(y_test, pred),
        'roc_auc': roc_auc_score(y_test, proba),
        'brier': brier_score_loss(y_test, proba)
    }

results_df = pd.DataFrame(results).T.sort_values('roc_auc', ascending=False)
print('Model comparison (higher accuracy/roc_auc, lower brier):')
print(results_df)

best_name = results_df.index[0]
best_pipe = pipelines[best_name]
best_pred = (best_pipe.predict_proba(X_test)[:, 1] >= 0.5).astype(int)
print(f"Best model: {best_name}")
print(classification_report(y_test, best_pred, digits=3))


Model comparison (higher accuracy/roc_auc, lower brier):
         accuracy  roc_auc         brier
log_reg       1.0      1.0  4.873060e-03
hgb           1.0      1.0  2.478946e-12
Best model: log_reg
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       153
           1      1.000     1.000     1.000       154

    accuracy                          1.000       307
   macro avg      1.000     1.000     1.000       307
weighted avg      1.000     1.000     1.000       307



## Grid search (time-aware)

Tune HistGradientBoosting hyperparameters with TimeSeriesSplit and select by ROC-AUC; evaluate the best on the held-out test split.


In [17]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

tscv = TimeSeriesSplit(n_splits=4)
param_grid_hgb = {
    'clf__max_depth': [4, 6, 8],
    'clf__learning_rate': [0.03, 0.05, 0.08, 0.12],
    'clf__max_iter': [200, 300, 500],
    'clf__min_samples_leaf': [20, 50, 100]
}

hgb_base = Pipeline([('prep', preprocess), ('clf', HistGradientBoostingClassifier())])

gs = GridSearchCV(
    estimator=hgb_base,
    param_grid=param_grid_hgb,
    cv=tscv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

gs.fit(X_train, y_train)
print('Best params:', gs.best_params_)
print('Best CV ROC-AUC:', gs.best_score_)

best_hgb = gs.best_estimator_
proba = best_hgb.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)
print('Test ROC-AUC:', roc_auc_score(y_test, proba))
print('Test Accuracy:', accuracy_score(y_test, pred))
print('Test Brier:', brier_score_loss(y_test, proba))
print(classification_report(y_test, pred, digits=3))


Fitting 4 folds for each of 108 candidates, totalling 432 fits
Best params: {'clf__learning_rate': 0.03, 'clf__max_depth': 4, 'clf__max_iter': 200, 'clf__min_samples_leaf': 20}
Best CV ROC-AUC: 1.0
Test ROC-AUC: 1.0
Test Accuracy: 1.0
Test Brier: 1.5042070228258165e-06
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       153
           1      1.000     1.000     1.000       154

    accuracy                          1.000       307
   macro avg      1.000     1.000     1.000       307
weighted avg      1.000     1.000     1.000       307



##  Save best model

In [18]:
from joblib import dump
dump(best_pipe, '../models/win_classifier.joblib')
print('Saved best model to models/win_classifier.joblib')


Saved best model to models/win_classifier.joblib
