In [10]:
import torch
import numpy as np
import pandas as pd

import os, json
from datetime import datetime

from lstm import train_dual_head_classifier, TrainConfig
from tft import train_tft_classifier, TrainConfig
from data_prep import add_over_under_label, prepare_receiving_sequences
from metrics import compute_ece, compute_pace

from sklearn.metrics import roc_auc_score
from player_utils import predict_player_over_prob

from core_logic import compute_parlay_prob, load_data


In [15]:
train_df = pd.read_csv("data/receiving_2019_2023.csv")
test_df  = pd.read_csv("data/receiving_24tocurrent.csv")

LINE_VALUE = 80      
N_PAST_GAMES = 5
HIDDEN_SIZE = 128
STAT_COL  = "YDS"
D_MODEL = 128


train_df = add_over_under_label(train_df, stat_col=STAT_COL, line_value=LINE_VALUE, new_col="over_label")
test_df  = add_over_under_label(test_df,  stat_col=STAT_COL, line_value=LINE_VALUE, new_col="over_label")

# LSTM TEST

In [3]:
X_train, y_train, lengths_train, meta_train = prepare_receiving_sequences(
    train_df,
    n_past_games=N_PAST_GAMES,
    target_col="over_label",
)

X_test, y_test, lengths_test, meta_test = prepare_receiving_sequences(
    test_df,
    n_past_games=N_PAST_GAMES,
    target_col="over_label",
)

cfg = TrainConfig(
    n_epochs=10,
    batch_size=64,
    lr=1e-3,
    device="auto",
    verbose=True,
)

train_result = train_dual_head_classifier(
    X=X_train,
    y=y_train,
    lengths=lengths_train,
    hidden_size=HIDDEN_SIZE,
    cfg=cfg,
)

model = train_result["model"]
history = train_result["history"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
len_train_t = torch.tensor(lengths_train, dtype=torch.long).to(device)

with torch.no_grad():
    y_reg_train, logits_train = model(X_train_t, len_train_t)
    probs_train = torch.sigmoid(logits_train).cpu().numpy()

y_true_train = np.asarray(y_train)

auc_train   = roc_auc_score(y_true_train, probs_train)
ece_train   = compute_ece(y_true_train, probs_train)
pace2_train = compute_pace(y_true_train, probs_train, L=2)

print("\n=== Train Metrics (Single-Leg + Parlay) ===")
print(f"AUC_train   : {auc_train:.4f}")
print(f"ECE_train   : {ece_train:.4f}")
print(f"PaCE2_train : {pace2_train:.4f}  (random 2-leg parlays)")

X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)
len_test_t = torch.tensor(lengths_test, dtype=torch.long).to(device)

with torch.no_grad():
    y_reg_test, logits_test = model(X_test_t, len_test_t)
    probs_test = torch.sigmoid(logits_test).cpu().numpy()

y_true_test = np.asarray(y_test)

auc_test   = roc_auc_score(y_true_test, probs_test)
ece_test   = compute_ece(y_true_test, probs_test)
pace2_test = compute_pace(y_true_test, probs_test, L=2)

print("\n=== Test Metrics (Single-Leg + Parlay) ===")
print(f"AUC_test   : {auc_test:.4f}")
print(f"ECE_test   : {ece_test:.4f}")
print(f"PaCE2_test : {pace2_test:.4f}  (random 2-leg parlays)")


os.makedirs("models", exist_ok=True)
os.makedirs("metrics", exist_ok=True)

model_tag = f"lstm_dual_receiving_{STAT_COL.lower()}_line_{LINE_VALUE:.1f}_past{N_PAST_GAMES}_hid{HIDDEN_SIZE}"

model_path   = os.path.join("models",  model_tag + ".pt")
metrics_path = os.path.join("metrics", model_tag + "_metrics.json")

model_cpu = model.to("cpu")
torch.save(model_cpu.state_dict(), model_path)
print(f"Saved model to {model_path}")

metrics_payload = {
    "timestamp": datetime.now().isoformat(),
    "STAT_COL": STAT_COL,
    "line_value": LINE_VALUE,
    "n_past_games": N_PAST_GAMES,
    "hidden_size": HIDDEN_SIZE,
    "train_cfg": {
        "n_epochs": cfg.n_epochs,
        "batch_size": cfg.batch_size,
        "lr": cfg.lr,
        "device": cfg.device,
    },
    "train_history": history,  # per-epoch losses
    "train_metrics": {
        "auc": float(auc_train),
        "ece": float(ece_train),
        "pace2": float(pace2_train),
        "n_train": int(len(y_true_train)),
    },
    "test_metrics": {
        "auc": float(auc_test),
        "ece": float(ece_test),
        "pace2": float(pace2_test),
        "n_test": int(len(y_true_test)),
    },
}

with open(metrics_path, "w") as f:
    json.dump(metrics_payload, f, indent=2)

print(f"Saved metrics to {metrics_path}")

Epoch 01 | Train BCE loss: 0.2583
Epoch 02 | Train BCE loss: 0.2435
Epoch 03 | Train BCE loss: 0.2429
Epoch 04 | Train BCE loss: 0.2418
Epoch 05 | Train BCE loss: 0.2404
Epoch 06 | Train BCE loss: 0.2404
Epoch 07 | Train BCE loss: 0.2394
Epoch 08 | Train BCE loss: 0.2391
Epoch 09 | Train BCE loss: 0.2395
Epoch 10 | Train BCE loss: 0.2379

=== Train Metrics (Single-Leg + Parlay) ===
AUC_train   : 0.8419
ECE_train   : 0.0060
PaCE2_train : 0.0161  (random 2-leg parlays)

=== Test Metrics (Single-Leg + Parlay) ===
AUC_test   : 0.8161
ECE_test   : 0.0107
PaCE2_test : 0.0163  (random 2-leg parlays)
Saved model to models/lstm_dual_receiving_yds_line_80.0_past5_hid128.pt
Saved metrics to metrics/lstm_dual_receiving_yds_line_80.0_past5_hid128_metrics.json


## LSTM TEST for single leg probabilities

In [4]:
player_name = "George Kittle"

prob = predict_player_over_prob(
    model=model,
    df=test_df,              
    player_name=player_name,
    stat_col=STAT_COL,
    line_value=LINE_VALUE,
    n_past_games=N_PAST_GAMES,
)

print(f"Player: {player_name}")
print(f"Prop: {STAT_COL} over {LINE_VALUE}")
print(f"Predicted probability (model): {prob:.3f}")

Player: George Kittle
Prop: YDS over 80
Predicted probability (model): 0.018


## LSTM TEST for multi leg probabilities

In [5]:
yard_type = "Receiving" 

train_df, test_df, full_df = load_data(yard_type)

In [6]:
parlay_legs = [
    {
        "player": "George Kittle",
        "stat_col": "YDS",
        "line_value": 55.5,
    },
    {
        "player": "Brandon Aiyuk",
        "stat_col": "YDS",
        "line_value": 60.5,
    },
]

In [7]:
parlay_model_choice = "LSTM"  # or "TFT" or "XGBoost"

parlay_prob, leg_probs = compute_parlay_prob(
    parlay_legs=parlay_legs,
    yard_type=yard_type,
    parlay_model_choice=parlay_model_choice,
    train_df=train_df,
    test_df=test_df,
    full_df=full_df,
)

print(f"\nParlay model family: {parlay_model_choice}")
for leg, p in leg_probs:
    print(
        f"Leg: {leg['player']} – {leg['stat_col']} > {leg['line_value']} "
        f"→ P(hit) = {p:.3f}"
    )

print(f"\nP(all legs hit) = {parlay_prob:.3f}")


Parlay model family: LSTM
Leg: George Kittle – YDS > 55.5 → P(hit) = 0.156
Leg: Brandon Aiyuk – YDS > 60.5 → P(hit) = 0.448

P(all legs hit) = 0.070


# TFT TEST

In [16]:
train_df_labeled = add_over_under_label(
    df=train_df,
    stat_col=STAT_COL,
    line_value=LINE_VALUE,
    new_col="over_label",
)

test_df_labeled = add_over_under_label(
    df=test_df,
    stat_col=STAT_COL,
    line_value=LINE_VALUE,
    new_col="over_label",
)

# 2) Build sequences using that label
X_train, y_train, lengths_train, meta_train = prepare_receiving_sequences(
    train_df_labeled,
    n_past_games=N_PAST_GAMES,
    target_col="over_label",
)

X_test, y_test, lengths_test, meta_test = prepare_receiving_sequences(
    test_df_labeled,
    n_past_games=N_PAST_GAMES,
    target_col="over_label",
)

# 3) Train TFT classifier
cfg = TrainConfig(
    n_epochs=10,
    batch_size=64,
    lr=1e-3,
    device="auto",
    verbose=True,
)

train_result = train_tft_classifier(
    X=X_train,
    y=y_train,
    lengths=lengths_train,
    d_model=D_MODEL,
    n_heads=4,
    num_layers=2,
    dropout=0.1,
    cfg=cfg,
)

model   = train_result["model"]
history = train_result["history"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# 4) Train metrics
X_train_t  = torch.tensor(X_train, dtype=torch.float32).to(device)
len_train_t = torch.tensor(lengths_train, dtype=torch.long).to(device)

with torch.no_grad():
    y_reg_train, logits_train = model(X_train_t, len_train_t)
    probs_train = torch.sigmoid(logits_train).cpu().numpy()

y_true_train = np.asarray(y_train)

auc_train   = roc_auc_score(y_true_train, probs_train)
ece_train   = compute_ece(y_true_train, probs_train)
pace2_train = compute_pace(y_true_train, probs_train, L=2)

print("\n=== TFT Train Metrics (Single-Leg + Parlay) ===")
print(f"AUC_train   : {auc_train:.4f}")
print(f"ECE_train   : {ece_train:.4f}")
print(f"PaCE2_train : {pace2_train:.4f}  (random 2-leg parlays)")

# 5) Test metrics
X_test_t   = torch.tensor(X_test, dtype=torch.float32).to(device)
len_test_t = torch.tensor(lengths_test, dtype=torch.long).to(device)

with torch.no_grad():
    y_reg_test, logits_test = model(X_test_t, len_test_t)
    probs_test = torch.sigmoid(logits_test).cpu().numpy()

y_true_test = np.asarray(y_test)

auc_test   = roc_auc_score(y_true_test, probs_test)
ece_test   = compute_ece(y_true_test, probs_test)
pace2_test = compute_pace(y_true_test, probs_test, L=2)

print("\n=== TFT Test Metrics (Single-Leg + Parlay) ===")
print(f"AUC_test   : {auc_test:.4f}")
print(f"ECE_test   : {ece_test:.4f}")
print(f"PaCE2_test : {pace2_test:.4f}  (random 2-leg parlays)")

# 6) Save model + metrics
os.makedirs("models", exist_ok=True)
os.makedirs("metrics", exist_ok=True)

model_tag   = f"tft_dual_receiving_{STAT_COL.lower()}_line_{LINE_VALUE:.1f}_past{N_PAST_GAMES}_dmodel{D_MODEL}"
model_path  = os.path.join("models",  model_tag + ".pt")
metrics_path = os.path.join("metrics", model_tag + "_metrics.json")

model_cpu = model.to("cpu")
torch.save(model_cpu.state_dict(), model_path)
print(f"Saved TFT model to {model_path}")

metrics_payload = {
    "timestamp": datetime.now().isoformat(),
    "stat_col": STAT_COL,
    "line_value": LINE_VALUE,
    "n_past_games": N_PAST_GAMES,
    "d_model": D_MODEL,
    "train_cfg": {
        "n_epochs": cfg.n_epochs,
        "batch_size": cfg.batch_size,
        "lr": cfg.lr,
        "device": cfg.device,
    },
    "train_history": history,
    "train_metrics": {
        "auc": float(auc_train),
        "ece": float(ece_train),
        "pace2": float(pace2_train),
        "n_train": int(len(y_true_train)),
    },
    "test_metrics": {
        "auc": float(auc_test),
        "ece": float(ece_test),
        "pace2": float(pace2_test),
        "n_test": int(len(y_true_test)),
    },
}

with open(metrics_path, "w") as f:
    json.dump(metrics_payload, f, indent=2)

print(f"Saved TFT metrics to {metrics_path}")



[TFT] Epoch 01 | Train BCE loss: 0.2693
[TFT] Epoch 02 | Train BCE loss: 0.2530
[TFT] Epoch 03 | Train BCE loss: 0.2513
[TFT] Epoch 04 | Train BCE loss: 0.2495
[TFT] Epoch 05 | Train BCE loss: 0.2486
[TFT] Epoch 06 | Train BCE loss: 0.2489
[TFT] Epoch 07 | Train BCE loss: 0.2467
[TFT] Epoch 08 | Train BCE loss: 0.2470
[TFT] Epoch 09 | Train BCE loss: 0.2475
[TFT] Epoch 10 | Train BCE loss: 0.2476

=== TFT Train Metrics (Single-Leg + Parlay) ===
AUC_train   : 0.8315
ECE_train   : 0.0299
PaCE2_train : 0.0201  (random 2-leg parlays)

=== TFT Test Metrics (Single-Leg + Parlay) ===
AUC_test   : 0.8075
ECE_test   : 0.0315
PaCE2_test : 0.0267  (random 2-leg parlays)
Saved TFT model to models/tft_dual_receiving_yds_line_80.0_past5_dmodel128.pt
Saved TFT metrics to metrics/tft_dual_receiving_yds_line_80.0_past5_dmodel128_metrics.json


## TFT TEST for single leg probabilities

In [17]:
player_name = "George Kittle"

prob = predict_player_over_prob(
    model=model,
    df=test_df,
    player_name=player_name,
    stat_col=STAT_COL,
    line_value=LINE_VALUE,
    n_past_games=N_PAST_GAMES,
    model_type="tft"
)

print(f"\n[TFT] Player: {player_name}")
print(f"Prop: {STAT_COL} over {LINE_VALUE}")
print(f"Predicted probability (model): {prob:.3f}")


[TFT] Player: George Kittle
Prop: YDS over 80
Predicted probability (model): 0.067


## TFT TEST for multi leg probabilities

In [18]:
yard_type = "Receiving" 

train_df, test_df, full_df = load_data(yard_type)

In [19]:
parlay_legs = [
    {
        "player": "George Kittle",
        "STAT_COL": "YDS",
        "line_value": 55.5,
    },
    {
        "player": "Brandon Aiyuk",
        "STAT_COL": "YDS",
        "line_value": 60.5,
    },
]

In [None]:
parlay_model_choice = "TFT" 

parlay_prob, leg_probs = compute_parlay_prob(
    parlay_legs=parlay_legs,
    yard_type=yard_type,
    parlay_model_choice=parlay_model_choice,
    train_df=train_df,
    test_df=test_df,
    full_df=full_df,
)

print(f"\nParlay model family: {parlay_model_choice}")
for leg, p in leg_probs:
    print(
        f"Leg: {leg['player']} – {leg['STAT_COL']} > {leg['line_value']} "
        f"→ P(hit) = {p:.3f}"
    )

print(f"\nP(all legs hit) = {parlay_prob:.3f}")

KeyError: 'stat_col'

# XGBoost TEST

In [None]:
train_df_labeled = add_over_under_label(
    df=train_df,
    stat_col=STAT_COL,
    line_value=LINE_VALUE,
    new_col="over_label",
)

test_df_labeled = add_over_under_label(
    df=test_df,
    stat_col=STAT_COL,
    line_value=LINE_VALUE,
    new_col="over_label",
)

X_train, y_train, lengths_train, meta_train = prepare_receiving_sequences(
    train_df_labeled,
    n_past_games=N_PAST_GAMES,
    target_col="over_label",
)

X_test, y_test, lengths_test, meta_test = prepare_receiving_sequences(
    test_df_labeled,
    n_past_games=N_PAST_GAMES,
    target_col="over_label",
)


xgb_cfg = XGBTrainConfig(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    verbose=False,
)

xgb_result = train_xgb_classifier(
    X=X_train,
    y=y_train,
    lengths=lengths_train,
    cfg=xgb_cfg,
)

xgb_model   = xgb_result["model"]
xgb_history = xgb_result["history"]


X_train_flat = flatten_sequences(X_train)

y_train_np = np.asarray(y_train, dtype=float)
train_probs = xgb_model.predict_proba(X_train_flat)[:, 1]

auc_train   = roc_auc_score(y_train_np, train_probs)
ece_train   = compute_ece(y_train_np, train_probs)
pace2_train = compute_pace(y_train_np, train_probs, L=2)

print("\n=== XGB Train Metrics (Single-Leg + Parlay) ===")
print(f"AUC_train   : {auc_train:.4f}")
print(f"ECE_train   : {ece_train:.4f}")
print(f"PaCE2_train : {pace2_train:.4f}  (random 2-leg parlays)")


# =======================================================
# 5. TEST METRICS
# =======================================================
X_test_flat = flatten_sequences(X_test)

y_test_np = np.asarray(y_test, dtype=float)
test_probs = xgb_model.predict_proba(X_test_flat)[:, 1]

auc_test   = roc_auc_score(y_test_np, test_probs)
ece_test   = compute_ece(y_test_np, test_probs)
pace2_test = compute_pace(y_test_np, test_probs, L=2)

print("\n=== XGB Test Metrics (Single-Leg + Parlay) ===")
print(f"AUC_test   : {auc_test:.4f}")
print(f"ECE_test   : {ece_test:.4f}")
print(f"PaCE2_test : {pace2_test:.4f}  (random 2-leg parlays)")


# =======================================================
# 6. SAVE MODEL + METRICS
# =======================================================
os.makedirs("models", exist_ok=True)
os.makedirs("metrics", exist_ok=True)

model_tag = f"xgb_dual_receiving_{STAT_COL.lower()}_line_{LINE_VALUE:.1f}_past{N_PAST_GAMES}"

model_path   = os.path.join("models",  model_tag + ".json")
metrics_path = os.path.join("metrics", model_tag + "_metrics.json")

xgb_model.save_model(model_path)
print(f"Saved XGB model to {model_path}")

metrics_payload = {
    "timestamp": datetime.now().isoformat(),
    "stat_col": STAT_COL,
    "line_value": LINE_VALUE,
    "n_past_games": N_PAST_GAMES,

    "xgb_cfg": {
        "n_estimators": xgb_cfg.n_estimators,
        "max_depth": xgb_cfg.max_depth,
        "learning_rate": xgb_cfg.learning_rate,
        "subsample": xgb_cfg.subsample,
        "colsample_bytree": xgb_cfg.colsample_bytree,
        "reg_lambda": xgb_cfg.reg_lambda,
        "reg_alpha": xgb_cfg.reg_alpha,
        "eval_metric": xgb_cfg.eval_metric,
    },

    "train_history": xgb_history,

    "train_metrics": {
        "auc": float(auc_train),
        "ece": float(ece_train),
        "pace2": float(pace2_train),
        "n_train": int(len(y_train_np)),
    },

    "test_metrics": {
        "auc": float(auc_test),
        "ece": float(ece_test),
        "pace2": float(pace2_test),
        "n_test": int(len(y_test_np)),
    },
}

with open(metrics_path, "w") as f:
    json.dump(metrics_payload, f, indent=2)

print(f"Saved XGB metrics to {metrics_path}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGB Train Metrics (Single-Leg + Parlay) ===
AUC_train   : 0.9147
ECE_train   : 0.0334
PaCE2_train : 0.0157  (random 2-leg parlays)

=== XGB Test Metrics (Single-Leg + Parlay) ===
AUC_test   : 0.8090
ECE_test   : 0.0094
PaCE2_test : 0.0170  (random 2-leg parlays)
Saved XGB model to models/xgb_dual_receiving_yds_line_80.0_past5.json
Saved XGB metrics to metrics/xgb_dual_receiving_yds_line_80.0_past5_metrics.json


In [None]:
player_name = "Amon-Ra St. Brown"

prob_xgb = predict_player_over_prob(
    model=xgb_model,
    df=test_df,
    player_name=player_name,
    stat_col=STAT_COL,       
    line_value=LINE_VALUE,    
    n_past_games=N_PAST_GAMES,
    model_type="xgboost",   
)

print(f"[XGBoost] Player: {player_name}")
print(f"Prop: {STAT_COL} over {LINE_VALUE}")
print(f"Predicted probability (model): {prob_xgb:.3f}")

[XGBoost] Player: Amon-Ra St. Brown
Prop: YDS over 80
Predicted probability (model): 0.409


## XGB TEST for multi leg probabilities

In [12]:
yard_type = "Receiving" 

train_df, test_df, full_df = load_data(yard_type)

In [13]:
parlay_legs = [
    {
        "player": "George Kittle",
        "stat_col": "YDS",
        "line_value": 55.5,
    },
    {
        "player": "Brandon Aiyuk",
        "stat_col": "YDS",
        "line_value": 60.5,
    },
]

In [14]:
parlay_model_choice = "XGBoost" 

parlay_prob, leg_probs = compute_parlay_prob(
    parlay_legs=parlay_legs,
    yard_type=yard_type,
    parlay_model_choice=parlay_model_choice,
    train_df=train_df,
    test_df=test_df,
    full_df=full_df,
)

print(f"\nParlay model family: {parlay_model_choice}")
for leg, p in leg_probs:
    print(
        f"Leg: {leg['player']} – {leg['stat_col']} > {leg['line_value']} "
        f"→ P(hit) = {p:.3f}"
    )

print(f"\nP(all legs hit) = {parlay_prob:.3f}")


Parlay model family: XGBoost
Leg: George Kittle – YDS > 55.5 → P(hit) = 0.090
Leg: Brandon Aiyuk – YDS > 60.5 → P(hit) = 0.497

P(all legs hit) = 0.045
