In [None]:
# compare_models.py
import os
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score,
    log_loss,
    accuracy_score,
    roc_curve,
)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import xgboost as xgb

# --- import your model cores ---
from lstm import StatSeqConfig, StatFromScratchBinary
from tft import (
    TFTConfig,
    build_tft_model,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ======================================================
# 1. Shared ECE / PCE utilities (same for all models)
# ======================================================

def calculate_ece(y_true, y_pred_proba, n_bins=10):
    """
    Expected Calibration Error (ECE) over [0,1] with equal-width bins.
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    bins = np.linspace(0, 1, n_bins + 1)
    bin_indices = np.digitize(y_pred_proba, bins) - 1
    bin_indices = np.clip(bin_indices, 0, n_bins - 1)

    ece = 0.0
    for i in range(n_bins):
        mask = bin_indices == i
        if mask.sum() == 0:
            continue
        bin_size = mask.sum()
        bin_conf = y_pred_proba[mask].mean()
        bin_acc = y_true[mask].mean()
        ece += (bin_size / len(y_true)) * abs(bin_conf - bin_acc)
    return ece


def calculate_pce(y_true, y_pred_proba, n_bins=10):
    """
    Positive Calibration Error (ECE computed on probs >= 0.5).
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    mask_pos = y_pred_proba >= 0.5
    if mask_pos.sum() == 0:
        return 0.0

    return calculate_ece(y_true[mask_pos], y_pred_proba[mask_pos], n_bins=n_bins)


# ======================================================
# 2. Data prep: SAME DATASET for all models
# ======================================================

def prepare_player_prop_data(df, stat_col='REC', prop_threshold=5.0, additional_stat_cols=None):
    """
    This is copied from your XGBoost prep logic & used for ALL models.
    We return:
      - prepared_df with engineered features
      - feature_cols used as inputs
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(['athlete_id', 'date']).reset_index(drop=True)

    if additional_stat_cols is None:
        additional_stat_cols = ['REC', 'YDS', 'TD', 'TGTS']

    for col in additional_stat_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.dropna(subset=[stat_col])
    df['prop_outcome'] = (df[stat_col] > prop_threshold).astype(int)

    # Label encoders
    le_team = LabelEncoder()
    le_opp = LabelEncoder()
    le_pos = LabelEncoder()

    df['team_encoded'] = le_team.fit_transform(df['team'].fillna('UNK'))
    df['opposing_team_encoded'] = le_opp.fit_transform(df['opposing_team'].fillna('UNK'))
    df['position_encoded'] = le_pos.fit_transform(df['position'].fillna('UNK'))
    df['home_away_encoded'] = (df['home_away'] == 'home').astype(int)

    feature_cols = ['team_encoded', 'opposing_team_encoded', 'position_encoded',
                    'home_away_encoded', 'season']

    stat_lower = stat_col.lower()

    # Rolling for target stat
    for window in [3, 5, 8]:
        col_name = f'{stat_lower}_rolling_{window}'
        df[col_name] = df.groupby('athlete_id')[stat_col].transform(
            lambda x: x.rolling(window, min_periods=1).mean().shift(1)
        )
        feature_cols.append(col_name)

    # Rolling for additional stats
    for add_stat in additional_stat_cols:
        if add_stat != stat_col and add_stat in df.columns:
            for window in [3, 8]:
                col_name = f'{add_stat.lower()}_rolling_{window}'
                df[col_name] = df.groupby('athlete_id')[add_stat].transform(
                    lambda x: x.rolling(window, min_periods=1).mean().shift(1)
                )
                feature_cols.append(col_name)

    # Hit rate
    df['prop_hit_rate_8'] = df.groupby('athlete_id')['prop_outcome'].transform(
        lambda x: x.rolling(8, min_periods=1).mean().shift(1)
    )
    feature_cols.append('prop_hit_rate_8')

    # Season avg
    df[f'{stat_lower}_season_avg'] = df.groupby(['athlete_id', 'season'])[stat_col].transform(
        lambda x: x.expanding().mean().shift(1)
    )
    feature_cols.append(f'{stat_lower}_season_avg')

    # Season prop hit rate
    df['prop_season_hit_rate'] = df.groupby(['athlete_id', 'season'])['prop_outcome'].transform(
        lambda x: x.expanding().mean().shift(1)
    )
    feature_cols.append('prop_season_hit_rate')

    # Volatility
    df[f'{stat_lower}_std_8'] = df.groupby('athlete_id')[stat_col].transform(
        lambda x: x.rolling(8, min_periods=2).std().shift(1)
    )
    feature_cols.append(f'{stat_lower}_std_8')

    # Trend
    df['trend'] = df[f'{stat_lower}_rolling_3'] - df[f'{stat_lower}_season_avg']
    feature_cols.append('trend')

    # Game counters
    df['game_num'] = df.groupby(['athlete_id', 'season']).cumcount() + 1
    feature_cols.append('game_num')

    df['career_games'] = df.groupby('athlete_id').cumcount()
    feature_cols.append('career_games')

    df = df.fillna(0)
    feature_cols = list(dict.fromkeys(feature_cols))  # dedupe

    encoders = {
        'team': le_team,
        'opponent': le_opp,
        'position': le_pos,
    }

    return df, feature_cols, encoders




Prepared rows: 22412, features: 21
Train size: 17929, Test size: 4483

===== TRAINING XGBOOST =====
XGBoost metrics: {'log_loss': 0.2929649594389824, 'accuracy': 0.8860138300245372, 'roc_auc': 0.8209194705809622, 'ece': np.float64(0.01717255837939466), 'pce': np.float64(0.05027068028083215)}

===== TRAINING LSTM =====
[LSTM] Epoch 1/10 - Train Loss: 0.3758
[LSTM] Epoch 2/10 - Train Loss: 0.3377
[LSTM] Epoch 3/10 - Train Loss: 0.3285
[LSTM] Epoch 4/10 - Train Loss: 0.3187
[LSTM] Epoch 5/10 - Train Loss: 0.3184
[LSTM] Epoch 6/10 - Train Loss: 0.3127
[LSTM] Epoch 7/10 - Train Loss: 0.3116
[LSTM] Epoch 8/10 - Train Loss: 0.3083
[LSTM] Epoch 9/10 - Train Loss: 0.3082
[LSTM] Epoch 10/10 - Train Loss: 0.3074
LSTM metrics: {'log_loss': 0.3042736849709536, 'accuracy': 0.8802141423154138, 'roc_auc': 0.8159678471280348, 'ece': np.float64(0.030727734433176685), 'pce': 0.0}

===== TRAINING TFT =====
[TFT] Epoch 1/10 - Train Loss: 0.3736
[TFT] Epoch 2/10 - Train Loss: 0.3431
[TFT] Epoch 3/10 - Train

In [14]:
# ======================================================
# 3. XGBoost training on shared dataset
# ======================================================

def train_xgb_on_shared_data(X_train, y_train, X_test, y_test):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    params = {
        'objective': 'binary:logistic',
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'eval_metric': ['logloss', 'auc'],
        'seed': 42,
    }

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtest, 'test')],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    y_pred_proba = model.predict(dtest)
    y_pred = (y_pred_proba > 0.5).astype(int)

    metrics = {
        'log_loss': log_loss(y_test, y_pred_proba),
        'accuracy': accuracy_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
        'ece': calculate_ece(y_test, y_pred_proba),
        'pce': calculate_pce(y_test, y_pred_proba),
    }
    return model, metrics, y_pred_proba

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [None]:
# ======================================================
# 4. LSTM dataset + training on SAME rows
# ======================================================

class LSTMTabularDataset(Dataset):
    """
    Each sample is treated as a sequence of length 1 with features X[i].
    This lets us reuse your custom LSTM while using the exact same rows.
    """
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        x = self.X[idx]  # (F,)
        # Make it (T=1, F)
        x_seq = x.unsqueeze(0)
        length = torch.tensor(1, dtype=torch.long)
        y = self.y[idx]
        return x_seq, length, y


def train_lstm_on_shared_data(X_train, y_train, X_test, y_test, hidden_size=128, n_epochs=10, batch_size=64, lr=1e-3):
    train_ds = LSTMTabularDataset(X_train, y_train)
    test_ds = LSTMTabularDataset(X_test, y_test)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    input_size = X_train.shape[1]
    model = StatFromScratchBinary(input_size=input_size, hidden_size=hidden_size).to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(1, n_epochs + 1):
        model.train()
        running_loss = 0.0

        for x_seq, lengths, y in train_dl:
            x_seq = x_seq.to(device)         # (B,1,F)
            lengths = lengths.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            logits = model(x_seq, lengths)
            loss = criterion(logits, y)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            running_loss += loss.item() * y.size(0)

        avg_loss = running_loss / len(train_ds)
        print(f"[LSTM] Epoch {epoch}/{n_epochs} - Train Loss: {avg_loss:.4f}")

    # --- Evaluation ---
    model.eval()
    all_probs = []
    all_labels = []
    with torch.no_grad():
        for x_seq, lengths, y in test_dl:
            x_seq = x_seq.to(device)
            lengths = lengths.to(device)
            y = y.to(device)

            logits = model(x_seq, lengths)
            probs = torch.sigmoid(logits)

            all_probs.append(probs.cpu().numpy())
            all_labels.append(y.cpu().numpy())

    y_pred_proba = np.concatenate(all_probs)
    y_test_np = np.concatenate(all_labels)
    y_pred = (y_pred_proba > 0.5).astype(int)

    metrics = {
        'log_loss': log_loss(y_test_np, y_pred_proba),
        'accuracy': accuracy_score(y_test_np, y_pred),
        'roc_auc': roc_auc_score(y_test_np, y_pred_proba),
        'ece': calculate_ece(y_test_np, y_pred_proba),
        'pce': calculate_pce(y_test_np, y_pred_proba),
    }

    return model, metrics, y_pred_proba


In [None]:
# ======================================================
# 5. TFT dataset + training on SAME rows
# ======================================================

class TFTTabularDataset(Dataset):
    """
    Uses the same rows as X,y, but decomposes them into:
      - past_obs:   (1, n_past)
      - fut_known:  (1, n_future)
      - static_cat: (n_static_cat,)
      - y
    """
    def __init__(
        self,
        df: pd.DataFrame,
        feature_cols: List[str],
        static_cols: List[str],
        future_cols: List[str],
        target_col: str = "prop_outcome",
    ):
        self.df = df.reset_index(drop=True)
        self.feature_cols = feature_cols
        self.static_cols = static_cols
        self.future_cols = future_cols
        self.target_col = target_col

        # define past features as: all features not in static or future
        self.past_cols = [
            c for c in feature_cols
            if c not in static_cols and c not in future_cols
        ]

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        past_obs_vec = row[self.past_cols].to_numpy(dtype=np.float32)
        fut_vec = row[self.future_cols].to_numpy(dtype=np.float32)
        static_vec = row[self.static_cols].to_numpy(dtype=np.int64)
        y = np.float32(row[self.target_col])

        # make them "sequence" shaped: (T=1, feats)
        past_obs = torch.tensor(past_obs_vec, dtype=torch.float32).unsqueeze(0)  # (1,n_past)
        fut_known = torch.tensor(fut_vec, dtype=torch.float32).unsqueeze(0)      # (1,n_future)
        static_cat = torch.tensor(static_vec, dtype=torch.long)                  # (n_static_cat,)

        return past_obs, fut_known, static_cat, torch.tensor(y, dtype=torch.float32)


def train_tft_on_shared_data(
    prepared_df: pd.DataFrame,
    feature_cols: List[str],
    split_idx: int,
    n_epochs: int = 10,
    batch_size: int = 64,
    hidden_size: int = 64,
    num_heads: int = 2,
    dropout: float = 0.1,
):
    """
    Train TFT on the exact same prepared_df rows, with same train/test split index.
    """

    # 1. define which columns are static & future
    static_cols = ["team_encoded", "opposing_team_encoded", "position_encoded", "home_away_encoded"]
    future_cols = ["season", "team_encoded", "opposing_team_encoded", "home_away_encoded", "position_encoded"]

    # cardinalities for static embeddings
    static_cardinalities = [prepared_df[c].nunique() for c in static_cols]

    # 2. split df into train/test by index (same split as XGB & LSTM)
    df_train = prepared_df.iloc[:split_idx].reset_index(drop=True)
    df_test = prepared_df.iloc[split_idx:].reset_index(drop=True)

    train_ds = TFTTabularDataset(df_train, feature_cols, static_cols, future_cols, target_col="prop_outcome")
    test_ds = TFTTabularDataset(df_test, feature_cols, static_cols, future_cols, target_col="prop_outcome")

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    # 3. build TFT model from your core
    tft_cfg = TFTConfig(
        hidden_size=hidden_size,
        num_heads=num_heads,
        dropout=dropout,
    )

    model = build_tft_model(
        cfg=tft_cfg,
        n_static_cat=len(static_cols),
        n_future=len(future_cols),
        n_past=len(train_ds.past_cols),
        static_cardinalities=static_cardinalities,
        device=device,
    )

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(1, n_epochs + 1):
        model.train()
        running_loss = 0.0

        for past_obs, fut_known, static_cat, y in train_dl:
            past_obs = past_obs.to(device)   # (B,1,n_past)
            fut_known = fut_known.to(device) # (B,1,n_future)
            static_cat = static_cat.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            logits = model(past_obs, fut_known, static_cat)
            loss = criterion(logits, y)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            running_loss += loss.item() * y.size(0)

        avg_loss = running_loss / len(train_ds)
        print(f"[TFT] Epoch {epoch}/{n_epochs} - Train Loss: {avg_loss:.4f}")

    # --- Evaluation on test ---
    model.eval()
    all_probs = []
    all_labels = []
    with torch.no_grad():
        for past_obs, fut_known, static_cat, y in test_dl:
            past_obs = past_obs.to(device)
            fut_known = fut_known.to(device)
            static_cat = static_cat.to(device)
            y = y.to(device)

            logits = model(past_obs, fut_known, static_cat)
            probs = torch.sigmoid(logits)

            all_probs.append(probs.cpu().numpy())
            all_labels.append(y.cpu().numpy())

    y_pred_proba = np.concatenate(all_probs)
    y_test_np = np.concatenate(all_labels)
    y_pred = (y_pred_proba > 0.5).astype(int)

    metrics = {
        'log_loss': log_loss(y_test_np, y_pred_proba),
        'accuracy': accuracy_score(y_test_np, y_pred),
        'roc_auc': roc_auc_score(y_test_np, y_pred_proba),
        'ece': calculate_ece(y_test_np, y_pred_proba),
        'pce': calculate_pce(y_test_np, y_pred_proba),
    }

    return model, metrics, y_pred_proba

In [17]:
pd.read_csv("data/fumbles_2019_2023.csv")

Unnamed: 0,game_id,date,season,team,home_away,opposing_team,athlete_id,display_name,position,FUM,LOST,REC
0,401127972,2019-09-08,2019,Indianapolis Colts,away,Los Angeles Chargers,2578570,Jacoby Brissett,QB,1,0,0
1,401127972,2019-09-08,2019,Indianapolis Colts,away,Los Angeles Chargers,3050199,George Odum,S,0,0,1
2,401127972,2019-09-08,2019,Los Angeles Chargers,home,Indianapolis Colts,3040145,Desmond King II,CB,1,1,0
3,401127972,2019-09-08,2019,Los Angeles Chargers,home,Indianapolis Colts,5529,Philip Rivers,QB,1,0,0
4,401127995,2019-09-08,2019,Cincinnati Bengals,away,Seattle Seahawks,14012,Andy Dalton,QB,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4801,401547645,2024-01-07,2023,Kansas City Chiefs,away,Los Angeles Chargers,3122899,Richie James,WR,1,0,1
4802,401547645,2024-01-07,2023,Kansas City Chiefs,away,Los Angeles Chargers,3155647,Mike Edwards,S,0,0,1
4803,401547645,2024-01-07,2023,Los Angeles Chargers,home,Kansas City Chiefs,3120590,Easton Stick,QB,1,1,0
4804,401547645,2024-01-07,2023,Los Angeles Chargers,home,Kansas City Chiefs,3068267,Austin Ekeler,RB,1,0,0


In [18]:
pd.read_csv("data/passing_2019_2023.csv")

Unnamed: 0,game_id,date,season,team,home_away,opposing_team,athlete_id,display_name,position,C_ATT,YDS,AVG,TD,INT,SACKS,QBR,RTG
0,401127972,2019-09-08,2019,Indianapolis Colts,away,Los Angeles Chargers,2578570,Jacoby Brissett,QB,21/27,190,7.0,2,0,2-17,60.5,120.7
1,401127972,2019-09-08,2019,Los Angeles Chargers,home,Indianapolis Colts,5529,Philip Rivers,QB,25/34,333,9.8,3,1,4-23,53.0,121.3
2,401127995,2019-09-08,2019,Cincinnati Bengals,away,Seattle Seahawks,14012,Andy Dalton,QB,35/51,418,8.2,2,0,5-23,56.2,106.5
3,401127995,2019-09-08,2019,Seattle Seahawks,home,Cincinnati Bengals,14881,Russell Wilson,QB,14/20,195,9.8,2,0,4-35,51.2,134.4
4,401127954,2019-09-08,2019,Baltimore Ravens,away,Miami Dolphins,3916387,Lamar Jackson,QB,17/20,324,16.2,5,0,1-1,99.5,158.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3445,401547650,2024-01-07,2023,Philadelphia Eagles,away,New York Giants,4040715,Jalen Hurts,QB,7/16,55,3.4,0,1,2-26,13.9,26.8
3446,401547650,2024-01-07,2023,New York Giants,home,Philadelphia Eagles,14163,Tyrod Taylor,QB,23/32,297,9.3,1,1,1-5,48.2,98.0
3447,401547650,2024-01-07,2023,New York Giants,home,Philadelphia Eagles,4240391,Tommy DeVito,QB,2/2,14,7.0,0,0,1-0,35.3,95.8
3448,401547645,2024-01-07,2023,Kansas City Chiefs,away,Los Angeles Chargers,13987,Blaine Gabbert,QB,15/30,154,5.1,0,1,1-9,52.3,51.2


In [31]:
pd.read_csv("data/punting_2019_2023.csv")

Unnamed: 0,game_id,date,season,team,home_away,opposing_team,athlete_id,display_name,position,NO,YDS,AVG,TB,IN_20,LONG
0,401127972,2019-09-08,2019,Indianapolis Colts,away,Los Angeles Chargers,3914922,Rigoberto Sanchez,P,3,108,36.0,0,0,49
1,401127972,2019-09-08,2019,Los Angeles Chargers,home,Indianapolis Colts,2582324,Ty Long,P,2,98,49.0,0,2,54
2,401127995,2019-09-08,2019,Cincinnati Bengals,away,Seattle Seahawks,12669,Kevin Huber,P,4,177,44.3,1,1,51
3,401127995,2019-09-08,2019,Seattle Seahawks,home,Cincinnati Bengals,3929851,Michael Dickson,P,8,376,47.0,0,2,60
4,401127954,2019-09-08,2019,Baltimore Ravens,away,Miami Dolphins,9789,Sam Koch,P,1,56,56.0,0,1,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2740,401547653,2024-01-07,2023,Washington Commanders,home,Dallas Cowboys,16166,Tress Way,P,1,50,50.0,0,0,50
2741,401547650,2024-01-07,2023,Philadelphia Eagles,away,New York Giants,4035239,Braden Mann,P,4,201,50.3,0,1,55
2742,401547650,2024-01-07,2023,New York Giants,home,Philadelphia Eagles,3936185,Jamie Gillan,P,5,228,45.6,0,4,55
2743,401547645,2024-01-07,2023,Kansas City Chiefs,away,Los Angeles Chargers,3915398,Tommy Townsend,P,6,274,45.7,2,3,61


In [None]:
dir = "data"

training = [
    "receiving_2019_2023.csv",
    "rushing_2019_2023.csv",
    "passing_2019_2023.csv",
    "defensive_2019_2023.csv",
    "fumbles_2019_2023.csv",
    "interceptions_2019_2023.csv",
    "kickreturns_2019_2023.csv",
    "puntreturn_2019_2023.csv",
    "kicking_2019_2023.csv",
    "punting_2019_2023.csv",
]

stats = {
    "receiving": ["REC", "YDS", "AVG","TD","LONG","TGTS"],
    "rushing":   ["CAR","YDS","AVG","TD","LONG"],
    "passing":   ["C_ATT",	"YDS","AVG","TD","INT","SACKS","QBR","RTG"],
    "defensive": ["TOT","SOLO","SACKS","TFL","PD","QB_HTS","TD"],
    "fumbles":   ["FUM", "LOST","REC"],
    "interceptions": ["INT","YDS","TD"],
    "kickreturns": ["NO","YDS","AVG","LONG","TD"],
    "puntreturn": ["NO","YDS","AVG","LONG","TD"],
    "kicking": ["FG","PCT","LONG","XP","PTS"],
    "punting":  ["NO","YDS","AVG","TB","IN_20","LONG"],
}

results = []


In [44]:
pd.read_csv("data/passing_2019_2023.csv")

Unnamed: 0,game_id,date,season,team,home_away,opposing_team,athlete_id,display_name,position,C_ATT,YDS,AVG,TD,INT,SACKS,QBR,RTG
0,401127972,2019-09-08,2019,Indianapolis Colts,away,Los Angeles Chargers,2578570,Jacoby Brissett,QB,21/27,190,7.0,2,0,2-17,60.5,120.7
1,401127972,2019-09-08,2019,Los Angeles Chargers,home,Indianapolis Colts,5529,Philip Rivers,QB,25/34,333,9.8,3,1,4-23,53.0,121.3
2,401127995,2019-09-08,2019,Cincinnati Bengals,away,Seattle Seahawks,14012,Andy Dalton,QB,35/51,418,8.2,2,0,5-23,56.2,106.5
3,401127995,2019-09-08,2019,Seattle Seahawks,home,Cincinnati Bengals,14881,Russell Wilson,QB,14/20,195,9.8,2,0,4-35,51.2,134.4
4,401127954,2019-09-08,2019,Baltimore Ravens,away,Miami Dolphins,3916387,Lamar Jackson,QB,17/20,324,16.2,5,0,1-1,99.5,158.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3445,401547650,2024-01-07,2023,Philadelphia Eagles,away,New York Giants,4040715,Jalen Hurts,QB,7/16,55,3.4,0,1,2-26,13.9,26.8
3446,401547650,2024-01-07,2023,New York Giants,home,Philadelphia Eagles,14163,Tyrod Taylor,QB,23/32,297,9.3,1,1,1-5,48.2,98.0
3447,401547650,2024-01-07,2023,New York Giants,home,Philadelphia Eagles,4240391,Tommy DeVito,QB,2/2,14,7.0,0,0,1-0,35.3,95.8
3448,401547645,2024-01-07,2023,Kansas City Chiefs,away,Los Angeles Chargers,13987,Blaine Gabbert,QB,15/30,154,5.1,0,1,1-9,52.3,51.2


In [42]:
dir = "data"

training = [
    "receiving_2019_2023.csv",
    "rushing_2019_2023.csv",
    "passing_2019_2023.csv",
]

stats = {
    "receiving": ["REC", "YDS", "AVG","TD","LONG","TGTS"],
    "rushing":   ["CAR","YDS","AVG","TD","LONG"],
    "passing":   ["C_ATT",	"YDS","AVG","TD","INT","SACKS","QBR","RTG"],
}

results = []

for filename in training:

    data_path = os.path.join(dir, filename)
    category = filename.split("_")[0]


    print("\n" + "="*80)
    print(f"LOADING: {filename}   (category: {category})")
    print("="*80)

    df_raw = pd.read_csv(data_path)
    
    if data_path == os.path.join(dir, "passing_2019_2023.csv"):
        df_raw = pd.read_csv(data_path)
        df_raw['C_ATT'] = pd.to_float(df_raw['C_ATT'], errors='coerce')


    
    available_stats = [col for col in df_raw.columns if col in stats[category]]

    print(f"Stats available in file: {available_stats}")

    for stat_col in available_stats:

        threshold = df_raw[stat_col].median()

        print(f"\n--- TRAINING PROP: {category.upper()} {stat_col} OVER {threshold:.1f} ---")

        try:
            prepared_df, feature_cols, _ = prepare_player_prop_data(
                df_raw,
                stat_col=stat_col,
                prop_threshold=threshold,
                additional_stat_cols=None, 
            )
        except Exception as e:
            print(f"Skipping {stat_col}: preparation error: {e}")
            continue

        if len(prepared_df) < 200:
            print(f"Skipping {stat_col}: only {len(prepared_df)} usable rows")
            continue

        X = prepared_df[feature_cols].values
        y = prepared_df["prop_outcome"].values.astype(np.float32)

        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]

        try:
            xgb_model, xgb_metrics, _ = train_xgb_on_shared_data(
                X_train, y_train, X_test, y_test
            )
            results.append({
                "category": category,
                "stat": stat_col,
                "threshold": threshold,
                "model": "XGBoost",
                **xgb_metrics,
            })
        except Exception as e:
            print(f"XGBoost failed: {e}")


        try:
            lstm_model, lstm_metrics, _ = train_lstm_on_shared_data(
                X_train, y_train, X_test, y_test,
                hidden_size=128,
                n_epochs=10,
                batch_size=64,
                lr=1e-3,
            )
            results.append({
                "category": category,
                "stat": stat_col,
                "threshold": threshold,
                "model": "LSTM",
                **lstm_metrics,
            })
        except Exception as e:
            print(f"LSTM failed: {e}")

        try:
            tft_model, tft_metrics, _ = train_tft_on_shared_data(
                prepared_df=prepared_df,
                feature_cols=feature_cols,
                split_idx=split_idx,
                n_epochs=10,
                batch_size=64,
                hidden_size=64,
                num_heads=2,
                dropout=0.1,
            )
            results.append({
                "category": category,
                "stat": stat_col,
                "threshold": threshold,
                "model": "TFT",
                **tft_metrics,
            })
        except Exception as e:
            print(f"TFT failed: {e}")


results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("TRAINING COMPLETE — SUMMARY")
print("="*80)
print(results_df.to_markdown(index=False, floatfmt=".4f"))


LOADING: receiving_2019_2023.csv   (category: receiving)
Stats available in file: ['REC', 'YDS', 'AVG', 'TD', 'LONG', 'TGTS']

--- TRAINING PROP: RECEIVING REC OVER 2.0 ---
[LSTM] Epoch 1/10 - Train Loss: 0.6100
[LSTM] Epoch 2/10 - Train Loss: 0.5809
[LSTM] Epoch 3/10 - Train Loss: 0.5741
[LSTM] Epoch 4/10 - Train Loss: 0.5717
[LSTM] Epoch 5/10 - Train Loss: 0.5715
[LSTM] Epoch 6/10 - Train Loss: 0.5733
[LSTM] Epoch 7/10 - Train Loss: 0.5710
[LSTM] Epoch 8/10 - Train Loss: 0.5695
[LSTM] Epoch 9/10 - Train Loss: 0.5698
[LSTM] Epoch 10/10 - Train Loss: 0.5713
[TFT] Epoch 1/10 - Train Loss: 0.6222
[TFT] Epoch 2/10 - Train Loss: 0.6009
[TFT] Epoch 3/10 - Train Loss: 0.5978


KeyboardInterrupt: 

In [68]:
# ===============================================
#  FULL SIDE-BY-SIDE MODEL COMPARISON PIPELINE
# ===============================================

import os
import re
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import xgboost as xgb

from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from lstm import StatFromScratchBinary
from tft import build_tft_model, TFTConfig
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ======================================================
#  HELPER: Compute ECE (Expected Calibration Error)
# ======================================================
def calculate_ece(y_true, y_pred_proba, n_bins=10):
    bins = np.linspace(0, 1, n_bins + 1)
    bin_ids = np.digitize(y_pred_proba, bins) - 1
    ece = 0.0
    for b in range(n_bins):
        mask = bin_ids == b
        if mask.sum() == 0:
            continue
        acc = y_true[mask].mean()
        conf = y_pred_proba[mask].mean()
        ece += (mask.sum() / len(y_true)) * abs(acc - conf)
    return float(ece)


# ======================================================
#  HELPER: Infer LSTM input_size from checkpoint
# ======================================================
def load_lstm_with_inferred_size(path):
    ckpt = torch.load(path, map_location="cpu")

    W = ckpt["sequence.cell.W.weight"]  # shape: (4H, H+F)
    fourH, HplusF = W.shape
    hidden_size = fourH // 4
    input_size = HplusF - hidden_size

    print(f"Loading LSTM → inferred input_size={input_size}, hidden_size={hidden_size}")

    model = StatFromScratchBinary(
        input_size=input_size,
        hidden_size=hidden_size
    )
    model.load_state_dict(ckpt)
    model.to(device)
    model.eval()
    return model


# ======================================================
#  HELPER: Evaluate model predictions
# ======================================================
def evaluate_metrics(y_true, y_prob):
    y_pred = (y_prob >= 0.5).astype(int)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_prob),
        "log_loss": log_loss(y_true, y_prob),
        "ece": calculate_ece(y_true, y_prob),
    }


# ======================================================
#  LOAD MODELS FROM DIR
# ======================================================
model_dir = "trained_models"
model_files = sorted(os.listdir(model_dir))

results = []

# ======================================================
#  FOR EACH MODEL, PARSE: category, stat, threshold
# ======================================================
pattern = r"(lstm|xgb|tft)_(\w+)_(\w+)_over_([0-9.]+)"

for fname in model_files:
    m = re.match(pattern, fname)
    if not m:
        continue

    mtype, category, stat_col, threshold_str = m.groups()
    threshold = pd.to_numeric(threshold_str, errors='coerce')
    if pd.isna(threshold):
        print(f"⚠️ Could not parse threshold from '{threshold_str}' — skipping model.")
        continue

    path = os.path.join(model_dir, fname)
    print("="*70)
    print(f"Evaluating {fname}")

    # --------------------------------------------------
    # Load the raw data associated with this model
    # --------------------------------------------------
    raw_path = os.path.join("data2", f"{category}_2019_2023.csv")
    if not os.path.exists(raw_path):
        print(f"Data not found: {raw_path}")
        continue

    df_raw = pd.read_csv(raw_path)

    # Ensure numeric
    df_raw[stat_col] = pd.to_numeric(df_raw[stat_col], errors="coerce")

    # --------------------------------------------------
    # Apply same preprocessing used during training
    # --------------------------------------------------
    from compare_models import prepare_player_prop_data  # Import your function

    prepared_df, feature_cols, _ = prepare_player_prop_data(
        df_raw,
        stat_col=stat_col,
        threshold=threshold
    )

    if len(prepared_df) < 200:
        print("Skipping — not enough rows.")
        continue

    X = prepared_df[feature_cols].values
    y = prepared_df["prop_outcome"].values.astype(float)

    split = int(0.8 * len(X))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    # --------------------------------------------------
    # MODEL TYPE: XGBoost
    # --------------------------------------------------
    if mtype == "xgb":
        model = xgb.Booster()
        model.load_model(path)
        y_prob = model.predict(xgb.DMatrix(X_test))
        metrics = evaluate_metrics(y_test, y_prob)

    # --------------------------------------------------
    # MODEL TYPE: LSTM
    # --------------------------------------------------
    elif mtype == "lstm":
        model = load_lstm_with_inferred_size(path)
        with torch.no_grad():
            xt = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1).to(device)
            lengths = torch.ones(len(xt), dtype=torch.long).to(device)
            y_prob = torch.sigmoid(model(xt, lengths)).cpu().numpy()
        metrics = evaluate_metrics(y_test, y_prob)

    # --------------------------------------------------
    # MODEL TYPE: TFT
    # --------------------------------------------------
    elif mtype == "tft":
        # reconstruct TFT automatically:
        static_cols = ["team_encoded", "opposing_team_encoded", "position_encoded", "home_away_encoded"]
        future_cols = ["season", "team_encoded", "opposing_team_encoded", "home_away_encoded", "position_encoded"]

        from compare_models import TFTTabularDataset

        dataset = TFTTabularDataset(prepared_df, feature_cols, static_cols, future_cols)
        X_test_ds = TFTTabularDataset(prepared_df.iloc[split:], feature_cols, static_cols, future_cols)

        dl = torch.utils.data.DataLoader(X_test_ds, batch_size=64)

        static_cardinalities = [prepared_df[c].nunique() for c in static_cols]

        cfg = TFTConfig(hidden_size=64, dropout=0.1, num_heads=2)
        model = build_tft_model(cfg, len(static_cols), len(future_cols), len(dataset.past_cols), static_cardinalities)
        model.load_state_dict(torch.load(path, map_location=device))
        model.to(device)
        model.eval()

        probs = []
        with torch.no_grad():
            for past_obs, fut_known, static_cat, _ in dl:
                past_obs = past_obs.to(device)
                fut_known = fut_known.to(device)
                static_cat = static_cat.to(device)
                logits = model(past_obs, fut_known, static_cat)
                probs.append(torch.sigmoid(logits).cpu().numpy())
        y_prob = np.concatenate(probs)
        metrics = evaluate_metrics(y_test, y_prob)

    # --------------------------------------------------
    # Final: store results
    # --------------------------------------------------
    results.append({
        "category": category,
        "stat": stat_col,
        "threshold": threshold,
        "model": mtype.upper(),
        **metrics
    })


# ======================================================
#  RESULTS TABLE
# ======================================================
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(["category", "stat", "threshold", "model"])

print("\n" + "="*80)
print("SIDE-BY-SIDE MODEL COMPARISON")
print("="*80)
print(results_df.to_markdown(index=False, floatfmt=".4f"))

⚠️ Could not parse threshold from '6.8.' — skipping model.
⚠️ Could not parse threshold from '0.0.' — skipping model.
⚠️ Could not parse threshold from '88.2.' — skipping model.
⚠️ Could not parse threshold from '1.0.' — skipping model.
⚠️ Could not parse threshold from '213.0.' — skipping model.
⚠️ Could not parse threshold from '8.7.' — skipping model.
⚠️ Could not parse threshold from '12.0.' — skipping model.
⚠️ Could not parse threshold from '2.0.' — skipping model.
⚠️ Could not parse threshold from '0.0.' — skipping model.
⚠️ Could not parse threshold from '3.0.' — skipping model.
⚠️ Could not parse threshold from '20.0.' — skipping model.
⚠️ Could not parse threshold from '3.8.' — skipping model.
⚠️ Could not parse threshold from '4.0.' — skipping model.
⚠️ Could not parse threshold from '8.0.' — skipping model.
⚠️ Could not parse threshold from '0.0.' — skipping model.
⚠️ Could not parse threshold from '16.0.' — skipping model.
⚠️ Could not parse threshold from '6.8.' — skippin

KeyError: 'category'

In [3]:
import os

model_dir = "trained_models"
data_dir = "data"



Model file: lstm_passing_AVG_over_6.8.pt
Model file: tft_receiving_AVG_over_8.7.pt
Model file: xgb_receiving_TD_over_0.0.json
Model file: tft_receiving_YDS_over_20.0.pt
Model file: xgb_rushing_YDS_over_16.0.json
Model file: xgb_receiving_LONG_over_12.0.json
Model file: xgb_receiving_REC_over_2.0.json
Model file: lstm_receiving_AVG_over_8.7.pt
Model file: xgb_passing_TD_over_1.0.json
Model file: tft_passing_INT_over_0.0.pt
Model file: xgb_passing_AVG_over_6.8.json
Model file: tft_passing_RTG_over_88.2.pt
Model file: tft_rushing_AVG_over_3.8.pt
Model file: lstm_rushing_AVG_over_3.8.pt
Model file: tft_rushing_LONG_over_8.0.pt
Model file: lstm_rushing_TD_over_0.0.pt
Model file: xgb_receiving_TGTS_over_3.0.json
Model file: lstm_passing_INT_over_0.0.pt
Model file: tft_receiving_TD_over_0.0.pt
Model file: tft_rushing_TD_over_0.0.pt
Model file: lstm_receiving_TGTS_over_3.0.pt
Model file: xgb_passing_YDS_over_213.0.json
Model file: tft_passing_AVG_over_6.8.pt
Model file: xgb_passing_QBR_over_52