In [1]:
# 1. Imports & config

import os
import math
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [2]:

DATA_DIR = "./data/csv"

# Hyperparameters
SEQ_LEN = 20              # number of past games per team
BATCH_SIZE = 64
HIDDEN_SIZE = 64
NUM_LAYERS = 3
LR = 1e-4
EPOCHS = 40
ERA_START = pd.to_datetime("2010-10-01")
VAL_SPLIT_DATE = "2021-10-01"
TEST_SPLIT_DATE = "2022-10-01"
RANDOM_SEED = 42

torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


In [3]:
files = [
    "common_player_info.csv",
    "game_info.csv",
    "officials.csv",
    "team.csv",
    "draft_combine_stats.csv",
    "game_summary.csv",
    "other_stats.csv",
    "team_details.csv",
    "draft_history.csv",
    "inactive_players.csv",
    "play_by_play.csv",
    "team_history.csv",
    "game.csv",
    "line_score.csv",
    "player.csv",
    "team_info_common.csv",
]

for fname in files:
    path = os.path.join(DATA_DIR, fname)
    df = pd.read_csv(path)
    print(f"\n=== {fname} ===")
    print("shape:", df.shape)
    print("columns:", list(df.columns)[:15], "...")


=== common_player_info.csv ===
shape: (4171, 33)
columns: ['person_id', 'first_name', 'last_name', 'display_first_last', 'display_last_comma_first', 'display_fi_last', 'player_slug', 'birthdate', 'school', 'country', 'last_affiliation', 'height', 'weight', 'season_exp', 'jersey'] ...

=== game_info.csv ===
shape: (58053, 4)
columns: ['game_id', 'game_date', 'attendance', 'game_time'] ...

=== officials.csv ===
shape: (70971, 5)
columns: ['game_id', 'official_id', 'first_name', 'last_name', 'jersey_num'] ...

=== team.csv ===
shape: (30, 7)
columns: ['id', 'full_name', 'abbreviation', 'nickname', 'city', 'state', 'year_founded'] ...

=== draft_combine_stats.csv ===
shape: (1202, 47)
columns: ['season', 'player_id', 'first_name', 'last_name', 'player_name', 'position', 'height_wo_shoes', 'height_wo_shoes_ft_in', 'height_w_shoes', 'height_w_shoes_ft_in', 'weight', 'wingspan', 'wingspan_ft_in', 'standing_reach', 'standing_reach_ft_in'] ...

=== game_summary.csv ===
shape: (58110, 14)
colu

In [4]:
# Core tables for modeling
games = pd.read_csv(os.path.join(DATA_DIR, "game.csv"))
game_info = pd.read_csv(os.path.join(DATA_DIR, "game_info.csv"))
other_stats = pd.read_csv(os.path.join(DATA_DIR, "other_stats.csv"))

print("games:", games.shape)
print("game_info:", game_info.shape)
print("other_stats:", other_stats.shape)

games: (65698, 55)
game_info: (58053, 4)
other_stats: (28271, 26)


In [5]:
# Column config for our pipeline
GAME_ID_COL = "game_id"
GAME_DATE_COL = "game_date"
HOME_TEAM_COL = "team_id_home"
AWAY_TEAM_COL = "team_id_away"
PTS_HOME_COL = "pts_home"
PTS_AWAY_COL = "pts_away"

# Make sure game_date is datetime
games[GAME_DATE_COL] = pd.to_datetime(games[GAME_DATE_COL])
game_info["game_date"] = pd.to_datetime(game_info["game_date"])

# Keep only modern-era games
mask_games = games[GAME_DATE_COL] >= ERA_START
games = games.loc[mask_games].reset_index(drop=True)

# Match game_info to the same window
mask_info = game_info["game_date"] >= ERA_START
game_info = game_info.loc[mask_info].reset_index(drop=True)

In [6]:
# 1) Select home/away feature columns, EXCLUDING the team_id columns
home_feature_cols = [
    c for c in games.columns
    if c.endswith("_home") and c != HOME_TEAM_COL
]

away_feature_cols = [
    c for c in games.columns
    if c.endswith("_away") and c != AWAY_TEAM_COL
]

print("Num home_feature_cols:", len(home_feature_cols))
print("Num away_feature_cols:", len(away_feature_cols))

# 2) Home rows
home_df = games[[GAME_ID_COL, GAME_DATE_COL, HOME_TEAM_COL] + home_feature_cols].copy()
home_df = home_df.rename(columns={HOME_TEAM_COL: "team_id"})
home_df["is_home"] = 1

for col in home_feature_cols:
    base = col.replace("_home", "")
    home_df[base] = home_df[col]

home_df["y_points"] = home_df[PTS_HOME_COL]

# 3) Away rows
away_df = games[[GAME_ID_COL, GAME_DATE_COL, AWAY_TEAM_COL] + away_feature_cols].copy()
away_df = away_df.rename(columns={AWAY_TEAM_COL: "team_id"})
away_df["is_home"] = 0

for col in away_feature_cols:
    base = col.replace("_away", "")
    away_df[base] = away_df[col]

away_df["y_points"] = away_df[PTS_AWAY_COL]

# 4) Keep only unified columns
keep_cols = [GAME_ID_COL, GAME_DATE_COL, "team_id", "is_home", "y_points"]
base_feature_names = sorted(
    {c.replace("_home", "").replace("_away", "") for c in home_feature_cols + away_feature_cols}
)
keep_cols += base_feature_names

home_df = home_df[keep_cols].copy()
away_df = away_df[keep_cols].copy()

# 5) Combine into team_games
team_games = pd.concat([home_df, away_df], axis=0).reset_index(drop=True)
team_games = team_games.sort_values(["team_id", GAME_DATE_COL]).reset_index(drop=True)

print("team_games initial:", team_games.shape)
team_games.head()

Num home_feature_cols: 24
Num away_feature_cols: 24
team_games initial: (32560, 29)


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,pf,plus_minus,pts,reb,stl,team_abbreviation,team_name,tov,video_available,wl
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,20.0,-27,80.0,36.0,7.0,MTA,Tel Aviv Maccabi Electra,9.0,0,L
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,27.0,-17,94.0,43.0,4.0,MTA,Tel Aviv Maccabi Electra,21.0,0,L
2,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,30.0,-38,70.0,37.0,10.0,MAC,Haifa Maccabi Haifa,21.0,0,L
3,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,22.0,-8,100.0,38.0,9.0,MAC,Haifa Maccabi Haifa,14.0,0,L
4,11200056,2012-10-16,93,0,81.0,17.0,4.0,33.0,0.37,27.0,...,30.0,-33,81.0,45.0,9.0,MAC,Haifa Maccabi Haifa,26.0,0,L


In [7]:
# --- Tier 2: merge game_info (attendance + game_hour) ---

# Parse game_time to hour-of-day
game_info["game_hour"] = pd.to_datetime(
    game_info["game_time"],
    format="%I:%M %p",
    errors="coerce"
).dt.hour

# Keep only what we need
game_info_small = game_info[[GAME_ID_COL, "attendance", "game_hour"]].copy()

print("game_info_small sample:")
print(game_info_small.head())

# Merge into team_games
team_games = team_games.merge(
    game_info_small,
    on=GAME_ID_COL,
    how="left"
)

print("team_games after game_info:", team_games.shape)
team_games.head()


game_info_small sample:
    game_id  attendance  game_hour
0  21000003     18997.0        NaN
1  21000002     20603.0        NaN
2  21000001     18624.0        NaN
3  21000015     18428.0        NaN
4  21000010     15039.0        NaN
team_games after game_info: (32584, 31)


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,pts,reb,stl,team_abbreviation,team_name,tov,video_available,wl,attendance,game_hour
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,80.0,36.0,7.0,MTA,Tel Aviv Maccabi Electra,9.0,0,L,20562.0,
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,94.0,43.0,4.0,MTA,Tel Aviv Maccabi Electra,21.0,0,L,15915.0,
2,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,70.0,37.0,10.0,MAC,Haifa Maccabi Haifa,21.0,0,L,5174.0,
3,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,100.0,38.0,9.0,MAC,Haifa Maccabi Haifa,14.0,0,L,,
4,11200056,2012-10-16,93,0,81.0,17.0,4.0,33.0,0.37,27.0,...,81.0,45.0,9.0,MAC,Haifa Maccabi Haifa,26.0,0,L,11192.0,


In [8]:
# --- Tier 3: merge other_stats (advanced team stats) ---

print("other_stats sample:")
print(other_stats.head())

# Game-level columns that apply to the whole game
game_level_cols = []
for col in ["lead_changes", "times_tied"]:
    if col in other_stats.columns:
        game_level_cols.append(col)

# Home/away advanced stat columns (excluding id/label columns)
home_stat_cols = [
    c for c in other_stats.columns
    if c.endswith("_home")
    and not c.startswith(("team_id_", "team_abbreviation_", "team_city_"))
]

away_stat_cols = [
    c for c in other_stats.columns
    if c.endswith("_away")
    and not c.startswith(("team_id_", "team_abbreviation_", "team_city_"))
]

print("home_stat_cols:", home_stat_cols)
print("away_stat_cols:", away_stat_cols)
print("game_level_cols:", game_level_cols)

# 4.1 Home advanced stats â†’ unified format
home_adv = other_stats[["game_id", "team_id_home"] + game_level_cols + home_stat_cols].copy()
home_adv = home_adv.rename(columns={"team_id_home": "team_id"})

for col in home_stat_cols:
    base = col.replace("_home", "")
    home_adv[base] = home_adv[col]

home_keep_cols = ["game_id", "team_id"] + game_level_cols + [c.replace("_home", "") for c in home_stat_cols]
home_adv = home_adv[home_keep_cols]

# 4.2 Away advanced stats â†’ unified format
away_adv = other_stats[["game_id", "team_id_away"] + game_level_cols + away_stat_cols].copy()
away_adv = away_adv.rename(columns={"team_id_away": "team_id"})

for col in away_stat_cols:
    base = col.replace("_away", "")
    away_adv[base] = away_adv[col]

away_keep_cols = ["game_id", "team_id"] + game_level_cols + [c.replace("_away", "") for c in away_stat_cols]
away_adv = away_adv[away_keep_cols]

print("home_adv shape:", home_adv.shape)
print("away_adv shape:", away_adv.shape)

# 4.3 Combine advanced stats
adv_long = pd.concat([home_adv, away_adv], axis=0).reset_index(drop=True)
print("adv_long shape:", adv_long.shape)
print(adv_long.head())

# 4.4 Merge advanced stats into team_games
team_games = team_games.merge(
    adv_long,
    on=["game_id", "team_id"],
    how="left"
)

print("team_games after other_stats:", team_games.shape)
team_games.head()


other_stats sample:
    game_id  league_id  team_id_home team_abbreviation_home team_city_home  \
0  29600012          0    1610612756                    PHX        Phoenix   
1  29600005          0    1610612737                    ATL        Atlanta   
2  29600002          0    1610612739                    CLE      Cleveland   
3  29600007          0    1610612754                    IND        Indiana   
4  29600013          0    1610612746                    LAC    Los Angeles   

   pts_paint_home  pts_2nd_chance_home  pts_fb_home  largest_lead_home  \
0              44                   18            2                  1   
1              32                    9            6                  0   
2              36                   14            6                 20   
3              34                   11            4                 10   
4              40                   19            2                 12   

   lead_changes  ...  team_abbreviation_away  team_city_away  pts_

Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,lead_changes,times_tied,pts_paint,pts_2nd_chance,pts_fb,largest_lead,team_turnovers,total_turnovers,team_rebounds,pts_off_to
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,1.0,3.0,32.0,12.0,3.0,3.0,0.0,9.0,5.0,8.0
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,0.0,0.0,54.0,15.0,12.0,0.0,1.0,21.0,7.0,28.0
2,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,3.0,1.0,28.0,8.0,17.0,3.0,0.0,21.0,10.0,29.0
3,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,,,,,,,,,,
4,11200056,2012-10-16,93,0,81.0,17.0,4.0,33.0,0.37,27.0,...,2.0,2.0,30.0,19.0,4.0,3.0,0.0,26.0,7.0,35.0


In [9]:
# --- Compute SEQ_FEATURES and scale ---

# --- Schedule features: rest / B2B / 3-in-4 / 4-in-6 ---

# Ensure sorted by team + date
team_games = team_games.sort_values(["team_id", GAME_DATE_COL]).reset_index(drop=True)

grouped = team_games.groupby("team_id")

# Previous game dates
team_games["prev_date"]  = grouped[GAME_DATE_COL].shift(1)
team_games["prev3_date"] = grouped[GAME_DATE_COL].shift(3)
team_games["prev4_date"] = grouped[GAME_DATE_COL].shift(4)

# Days of rest since last game
team_games["days_rest"] = (team_games[GAME_DATE_COL] - team_games["prev_date"]).dt.days

# Schedule intensity flags
team_games["is_b2b"]  = (team_games["days_rest"] == 1).astype(int)

team_games["is_3in4"] = (
    (team_games[GAME_DATE_COL] - team_games["prev3_date"]).dt.days <= 4
).astype(int)

team_games["is_4in6"] = (
    (team_games[GAME_DATE_COL] - team_games["prev4_date"]).dt.days <= 6
).astype(int)



# --- Team ID â†” index mapping for embeddings ---

team_ids = sorted(team_games["team_id"].unique())
team_id_to_idx = {tid: i for i, tid in enumerate(team_ids)}
num_teams = len(team_ids)

# Optional: store per-row team index (not used in SEQ_FEATURES)
team_games["team_idx"] = team_games["team_id"].map(team_id_to_idx)



exclude_cols = {
    GAME_ID_COL,
    GAME_DATE_COL,
    "team_id",
    "y_points",
    "prev_date",
    "prev3_date",
    "prev4_date",
    "team_idx",   # NEW: don't treat this as numeric feature
}




numeric_cols = [
    c for c in team_games.columns
    if c not in exclude_cols and pd.api.types.is_numeric_dtype(team_games[c])
]

SEQ_FEATURES = numeric_cols
print("Number of sequence features:", len(SEQ_FEATURES))
print("First 30 SEQ_FEATURES:", SEQ_FEATURES[:30])

train_rows = team_games[team_games[GAME_DATE_COL] < VAL_SPLIT_DATE].copy()

scaler = StandardScaler()
scaler.fit(train_rows[SEQ_FEATURES].fillna(0.0))

team_games[SEQ_FEATURES] = scaler.transform(
    team_games[SEQ_FEATURES].fillna(0.0)
)

team_games.head()


Number of sequence features: 37
First 30 SEQ_FEATURES: ['is_home', 'ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga', 'fgm', 'ft_pct', 'fta', 'ftm', 'oreb', 'pf', 'plus_minus', 'pts', 'reb', 'stl', 'tov', 'video_available', 'attendance', 'game_hour', 'lead_changes', 'times_tied', 'pts_paint', 'pts_2nd_chance', 'pts_fb', 'largest_lead', 'team_turnovers']


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,team_rebounds,pts_off_to,prev_date,prev3_date,prev4_date,days_rest,is_b2b,is_3in4,is_4in6,team_idx
0,11400003,2014-10-05,41,-1.0,80.0,-0.693698,0.849701,-0.903462,-0.953262,0.554806,...,-0.501082,-0.738944,NaT,NaT,NaT,-0.173476,-0.485414,-0.14156,-0.217296,0
1,11400011,2014-10-07,41,-1.0,94.0,-1.447486,0.057144,-0.180799,-1.222245,-0.002937,...,-0.015788,1.702937,2014-10-05,NaT,NaT,-0.104295,-0.485414,-0.14156,-0.217296,0
2,11000002,2010-10-03,93,-1.0,70.0,-1.635933,-1.131691,-1.445459,-0.375448,-0.783777,...,0.712153,1.825031,NaT,NaT,NaT,-0.173476,-0.485414,-0.14156,-0.217296,1
3,11200029,2012-10-11,93,-1.0,100.0,0.06009,0.057144,-1.264794,0.272103,-0.56068,...,-1.714317,-1.715697,2010-10-03,NaT,NaT,25.388655,-0.485414,-0.14156,-0.217296,1
4,11200056,2012-10-16,93,-1.0,81.0,-1.070592,-0.339134,-0.000133,0.162517,0.108612,...,-0.015788,2.557596,2012-10-11,NaT,NaT,-0.000525,-0.485414,-0.14156,-0.217296,1


In [10]:
team_sequences = []
team_targets = []
team_meta = []  # (game_id, team_id, game_date)

for team_id, group in team_games.groupby("team_id"):
    group = group.sort_values(GAME_DATE_COL).reset_index(drop=True)

    feats = group[SEQ_FEATURES].values           # [num_games, F]
    targets = group["y_points"].values
    game_ids = group[GAME_ID_COL].values
    dates = group[GAME_DATE_COL].values

    # require SEQ_LEN previous games
    for i in range(SEQ_LEN, len(group)):
        seq = feats[i-SEQ_LEN:i]
        y = targets[i]
        gid = game_ids[i]
        date = dates[i]

        team_sequences.append(seq)
        team_targets.append(y)
        team_meta.append((gid, team_id, date))

team_sequences = np.stack(team_sequences)          # [N_team_games, T, F]
team_targets = np.array(team_targets, dtype=np.float32)

print("team_sequences:", team_sequences.shape)
print("team_targets:", team_targets.shape)


team_sequences: (31929, 20, 37)
team_targets: (31929,)


In [11]:
seq_index_by_game_team = {
    (gid, tid): idx
    for idx, (gid, tid, date) in enumerate(team_meta)
}

len(seq_index_by_game_team)


31871

In [12]:
games_full = games[[GAME_ID_COL, GAME_DATE_COL, HOME_TEAM_COL, AWAY_TEAM_COL, PTS_HOME_COL, PTS_AWAY_COL]].copy()

games_full = games_full.rename(columns={
    HOME_TEAM_COL: "home_team_id",
    AWAY_TEAM_COL: "away_team_id",
    PTS_HOME_COL: "y_home",
    PTS_AWAY_COL: "y_away"
})

print("games_full:", games_full.shape)
games_full.head()


X_home = []
X_away = []
Y = []
GAME_DATES = []

HOME_TEAM_IDX = []
AWAY_TEAM_IDX = []



games_full: (16280, 6)


In [13]:
X_home = []
X_away = []
Y = []
GAME_DATES = []

for _, row in games_full.iterrows():
    gid = row[GAME_ID_COL]
    home_id = row["home_team_id"]
    away_id = row["away_team_id"]
    date = row[GAME_DATE_COL]

    key_home = (gid, home_id)
    key_away = (gid, away_id)

    if key_home not in seq_index_by_game_team or key_away not in seq_index_by_game_team:
        continue  # skip early games

    idx_h = seq_index_by_game_team[key_home]
    idx_a = seq_index_by_game_team[key_away]

    X_home.append(team_sequences[idx_h])
    X_away.append(team_sequences[idx_a])
    
    # Build as margin instead of abs
    home = row["y_home"]
    away = row["y_away"]

    margin = home - away
    total  = home + away

    Y.append([margin, total])
    
    GAME_DATES.append(date)
    
    HOME_TEAM_IDX.append(team_id_to_idx[home_id])
    AWAY_TEAM_IDX.append(team_id_to_idx[away_id])

X_home = np.stack(X_home)
X_away = np.stack(X_away)
Y = np.array(Y, dtype=np.float32)
GAME_DATES = np.array(GAME_DATES)

HOME_TEAM_IDX = np.array(HOME_TEAM_IDX, dtype=np.int64)
AWAY_TEAM_IDX = np.array(AWAY_TEAM_IDX, dtype=np.int64)

print("Final dataset shapes:")
print("X_home:", X_home.shape)
print("X_away:", X_away.shape)
print("Y:", Y.shape)
print("home team idx", HOME_TEAM_IDX)
print("away team idx", AWAY_TEAM_IDX)


Final dataset shapes:
X_home: (15908, 20, 37)
X_away: (15908, 20, 37)
Y: (15908, 2)
home team idx [34 27 30 ... 27 52 52]
away team idx [31 35 36 ... 32 51 51]


In [14]:
VAL_SPLIT_DATE = pd.to_datetime(VAL_SPLIT_DATE)
TEST_SPLIT_DATE = pd.to_datetime(TEST_SPLIT_DATE)

dates = pd.to_datetime(GAME_DATES)

train_mask = dates < VAL_SPLIT_DATE
val_mask = (dates >= VAL_SPLIT_DATE) & (dates < TEST_SPLIT_DATE)
test_mask = dates >= TEST_SPLIT_DATE

def split(arr):
    return arr[train_mask], arr[val_mask], arr[test_mask]

X_home_train, X_home_val, X_home_test = split(X_home)
X_away_train, X_away_val, X_away_test = split(X_away)
Y_train, Y_val, Y_test = split(Y)

home_idx_train, home_idx_val, home_idx_test = split(HOME_TEAM_IDX)
away_idx_train, away_idx_val, away_idx_test = split(AWAY_TEAM_IDX)

print("Train:", len(Y_train), "Val:", len(Y_val), "Test:", len(Y_test))


# --- NEW: keep raw targets and create a scaler for [margin, total] ---
from sklearn.preprocessing import StandardScaler

train_mean_scores = Y_train.mean(axis=0)   # still raw here!
print("Train mean scores (margin, total):", train_mean_scores)

Y_train_raw = Y_train.copy()
Y_val_raw   = Y_val.copy()
Y_test_raw  = Y_test.copy()

y_scaler = StandardScaler()
y_scaler.fit(Y_train_raw)        # fit on train only

Y_train = y_scaler.transform(Y_train_raw)
Y_val   = y_scaler.transform(Y_val_raw)
Y_test  = y_scaler.transform(Y_test_raw)


Train: 13143 Val: 1385 Test: 1380
Train mean scores (margin, total): [  2.5303204 208.5638   ]


In [15]:
class GameSequenceDataset(Dataset):
    def __init__(self, x_home, x_away, y, home_idx, away_idx):
        self.x_home = torch.tensor(x_home, dtype=torch.float32)
        self.x_away = torch.tensor(x_away, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

        self.home_idx = torch.tensor(home_idx, dtype=torch.long)
        self.away_idx = torch.tensor(away_idx, dtype=torch.long)

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        return (
            self.x_home[idx],
            self.x_away[idx],
            self.y[idx],
            self.home_idx[idx],
            self.away_idx[idx],
        )


train_dataset = GameSequenceDataset(
    X_home_train, X_away_train, Y_train, home_idx_train, away_idx_train
)
val_dataset = GameSequenceDataset(
    X_home_val, X_away_val, Y_val, home_idx_val, away_idx_val
)
test_dataset = GameSequenceDataset(
    X_home_test, X_away_test, Y_test, home_idx_test, away_idx_test
)


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [16]:
class TeamSequenceEncoder(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1, dropout: float = 0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )

    def forward(self, x):
        # x: [B, T, F]
        output, (h_n, c_n) = self.lstm(x)
        # output: [B, T, 2H]
        return output
    
class ScorePredictorMLP(nn.Module):
    """
    Model A â€” LSTM Encoder + Mean Pooling + Team Embeddings + MLP.
    """
    def __init__(
        self,
        input_size: int,
        hidden_size: int = 128,
        num_layers: int = 1,
        num_teams: int = None,
        team_emb_dim: int = 16,
    ):
        super().__init__()

        self.embed_dim = hidden_size * 2  # BiLSTM output size
        self.team_emb_dim = team_emb_dim

        # Shared sequence encoder
        self.encoder = TeamSequenceEncoder(input_size, hidden_size, num_layers)

        # NEW: team embedding table
        self.team_embedding = nn.Embedding(num_teams, team_emb_dim)

        pair_input_dim = self.embed_dim * 2 + team_emb_dim * 2  # home/away seq + home/away team emb

        # MLP prediction head (margin, total)
        self.mlp = nn.Sequential(
            nn.Linear(pair_input_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 2),   # [margin, total]
        )

    def forward(self, x_home, x_away, home_team_idx, away_team_idx):
        """
        x_home, x_away: [B, T, F]
        home_team_idx, away_team_idx: [B] (LongTensor)
        """
        # Encode sequences
        h_home_seq = self.encoder(x_home)  # [B, T, 2H]
        h_away_seq = self.encoder(x_away)  # [B, T, 2H]

        # Temporal mean pooling
        home_vec = h_home_seq.mean(dim=1)  # [B, 2H]
        away_vec = h_away_seq.mean(dim=1)  # [B, 2H]

        # Team identity embeddings
        home_emb = self.team_embedding(home_team_idx)  # [B, D]
        away_emb = self.team_embedding(away_team_idx)  # [B, D]

        # Concatenate all
        pair_vec = torch.cat([home_vec, away_vec, home_emb, away_emb], dim=-1)
        y_pred = self.mlp(pair_vec)  # [B, 2]

        return y_pred



class ScorePredictorCrossAttention(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int = 128,
        num_layers: int = 1,
        num_heads: int = 4,
        num_teams: int = None,
        team_emb_dim: int = 16,
    ):
        super().__init__()
        self.embed_dim = hidden_size * 2  # BiLSTM
        self.team_emb_dim = team_emb_dim

        self.encoder = TeamSequenceEncoder(input_size, hidden_size, num_layers)

        self.cross_attn = nn.MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=num_heads,
            batch_first=True,
        )

        self.team_embedding = nn.Embedding(num_teams, team_emb_dim)

        pair_input_dim = self.embed_dim * 2 + team_emb_dim * 2

        self.mlp = nn.Sequential(
            nn.Linear(pair_input_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 2),  # [margin, total]
        )

    def forward(self, x_home, x_away, home_team_idx, away_team_idx):
        # Encode sequences
        h_home_seq = self.encoder(x_home)   # [B, T, 2H]
        h_away_seq = self.encoder(x_away)   # [B, T, 2H]

        # Home attends to away
        home_ctx, _ = self.cross_attn(
            query=h_home_seq,
            key=h_away_seq,
            value=h_away_seq,
        )

        # Away attends to home
        away_ctx, _ = self.cross_attn(
            query=h_away_seq,
            key=h_home_seq,
            value=h_home_seq,
        )

        # Pool over time
        home_vec = home_ctx.mean(dim=1)   # [B, 2H]
        away_vec = away_ctx.mean(dim=1)   # [B, 2H]

        # Team embeddings
        home_emb = self.team_embedding(home_team_idx)  # [B, D]
        away_emb = self.team_embedding(away_team_idx)  # [B, D]

        pair_vec = torch.cat([home_vec, away_vec, home_emb, away_emb], dim=-1)
        y_pred = self.mlp(pair_vec)
        return y_pred



In [17]:
input_size = len(SEQ_FEATURES)

model_a = ScorePredictorMLP(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_teams=num_teams,
    team_emb_dim=16,
).to(device)

model_b = ScorePredictorCrossAttention(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_heads=4,
    num_teams=num_teams,
    team_emb_dim=16,
).to(device)



print(model_a)
print(model_b)


ScorePredictorMLP(
  (encoder): TeamSequenceEncoder(
    (lstm): LSTM(37, 64, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (team_embedding): Embedding(53, 16)
  (mlp): Sequential(
    (0): Linear(in_features=288, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=2, bias=True)
  )
)
ScorePredictorCrossAttention(
  (encoder): TeamSequenceEncoder(
    (lstm): LSTM(37, 64, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (cross_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (team_embedding): Embedding(53, 16)
  (mlp): Sequential(
    (0): Linear(in_features=288, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=2, bias=True)
  )
)


In [18]:
def run_epoch(loader, train: bool = True, model=None):
    if model is None:
        raise RuntimeError("model not set")

    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    all_true = []
    all_pred = []

    for x_home, x_away, y, home_idx, away_idx in loader:
        x_home = x_home.to(device)
        x_away = x_away.to(device)
        y = y.to(device)  # scaled targets
        home_idx = home_idx.to(device)
        away_idx = away_idx.to(device)

        if train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(train):
            y_pred = model(x_home, x_away, home_idx, away_idx)
            loss = criterion(y_pred, y)

            if train:
                loss.backward()
                optimizer.step()

        total_loss += loss.item() * y.size(0)
        all_true.append(y.detach().cpu().numpy())
        all_pred.append(y_pred.detach().cpu().numpy())

    all_true = np.concatenate(all_true, axis=0)  # scaled
    all_pred = np.concatenate(all_pred, axis=0)  # scaled

    # --- NEW: convert back to real [margin, total] for metrics ---
    all_true_unscaled = y_scaler.inverse_transform(all_true)
    all_pred_unscaled = y_scaler.inverse_transform(all_pred)

    mae = mean_absolute_error(all_true_unscaled, all_pred_unscaled)
    rmse = math.sqrt(mean_squared_error(all_true_unscaled, all_pred_unscaled))
    avg_loss = total_loss / len(loader.dataset)

    return avg_loss, mae, rmse


In [19]:
criterion = nn.SmoothL1Loss()

optimizer = torch.optim.AdamW(
    model_a.parameters(),
    lr=LR,
    weight_decay=1e-4
)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',      # lower RMSE is better
    factor=0.5,      # reduce LR by half
    patience=2,      # wait 2 epochs before dropping LR
)

best_val_rmse = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae, train_rmse = run_epoch(
        train_loader, train=True, model=model_a
    )
    val_loss, val_mae, val_rmse = run_epoch(
        val_loader, train=False, model=model_a
    )

    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss {train_loss:.3f}, MAE {train_mae:.3f}, RMSE {train_rmse:.3f} | "
        f"Val Loss {val_loss:.3f}, MAE {val_mae:.3f}, RMSE {val_rmse:.3f}"
    )

    # ðŸ”¥ IMPORTANT â†’ Notify the scheduler
    scheduler.step(val_rmse)  # or val_loss if you prefer loss

    # ðŸ”¥ Standard early stopping capture
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_state = model_a.state_dict()

Epoch 01 | Train Loss 0.390, MAE 13.687, RMSE 17.712 | Val Loss 0.416, MAE 13.976, RMSE 17.772
Epoch 02 | Train Loss 0.355, MAE 12.792, RMSE 16.492 | Val Loss 0.395, MAE 13.510, RMSE 17.296
Epoch 03 | Train Loss 0.338, MAE 12.404, RMSE 16.107 | Val Loss 0.388, MAE 13.321, RMSE 17.087
Epoch 04 | Train Loss 0.333, MAE 12.307, RMSE 15.952 | Val Loss 0.385, MAE 13.280, RMSE 17.026
Epoch 05 | Train Loss 0.332, MAE 12.281, RMSE 15.914 | Val Loss 0.384, MAE 13.256, RMSE 16.985
Epoch 06 | Train Loss 0.330, MAE 12.229, RMSE 15.861 | Val Loss 0.383, MAE 13.212, RMSE 16.930
Epoch 07 | Train Loss 0.328, MAE 12.195, RMSE 15.804 | Val Loss 0.382, MAE 13.190, RMSE 16.887
Epoch 08 | Train Loss 0.328, MAE 12.172, RMSE 15.778 | Val Loss 0.383, MAE 13.206, RMSE 16.921
Epoch 09 | Train Loss 0.325, MAE 12.117, RMSE 15.711 | Val Loss 0.384, MAE 13.238, RMSE 16.963
Epoch 10 | Train Loss 0.325, MAE 12.107, RMSE 15.686 | Val Loss 0.387, MAE 13.350, RMSE 17.092
Epoch 11 | Train Loss 0.323, MAE 12.056, RMSE 15.6

In [20]:
test_loss, test_mae, test_rmse = run_epoch(test_loader, train=False, model=model_a)
print(f"Test Loss {test_loss:.3f}, MAE {test_mae:.3f}, RMSE {test_rmse:.3f}")

Test Loss 0.353, MAE 12.668, RMSE 16.422


In [21]:
# TRAIN MODEL B
criterion = nn.SmoothL1Loss()

optimizer = torch.optim.AdamW(
    model_b.parameters(),
    lr=LR,
    weight_decay=1e-4
)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',      # lower RMSE is better
    factor=0.5,      # reduce LR by half
    patience=2,      # wait 2 epochs before dropping LR
)

best_val_rmse = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae, train_rmse = run_epoch(
        train_loader, train=True, model=model_b
    )
    val_loss, val_mae, val_rmse = run_epoch(
        val_loader, train=False, model=model_b
    )

    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss {train_loss:.3f}, MAE {train_mae:.3f}, RMSE {train_rmse:.3f} | "
        f"Val Loss {val_loss:.3f}, MAE {val_mae:.3f}, RMSE {val_rmse:.3f}"
    )

    # ðŸ”¥ IMPORTANT â†’ Notify the scheduler
    scheduler.step(val_rmse)  # or val_loss if you prefer loss

    # ðŸ”¥ Standard early stopping capture
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_state = model_b.state_dict()

Epoch 01 | Train Loss 0.382, MAE 13.493, RMSE 17.481 | Val Loss 0.406, MAE 13.734, RMSE 17.549
Epoch 02 | Train Loss 0.355, MAE 12.786, RMSE 16.517 | Val Loss 0.402, MAE 13.643, RMSE 17.441
Epoch 03 | Train Loss 0.350, MAE 12.695, RMSE 16.424 | Val Loss 0.401, MAE 13.646, RMSE 17.439
Epoch 04 | Train Loss 0.338, MAE 12.430, RMSE 16.113 | Val Loss 0.385, MAE 13.279, RMSE 17.010
Epoch 05 | Train Loss 0.331, MAE 12.265, RMSE 15.915 | Val Loss 0.384, MAE 13.237, RMSE 16.937
Epoch 06 | Train Loss 0.329, MAE 12.218, RMSE 15.822 | Val Loss 0.382, MAE 13.195, RMSE 16.871
Epoch 07 | Train Loss 0.328, MAE 12.180, RMSE 15.764 | Val Loss 0.382, MAE 13.191, RMSE 16.871
Epoch 08 | Train Loss 0.327, MAE 12.168, RMSE 15.744 | Val Loss 0.384, MAE 13.245, RMSE 16.947
Epoch 09 | Train Loss 0.326, MAE 12.144, RMSE 15.708 | Val Loss 0.381, MAE 13.173, RMSE 16.842
Epoch 10 | Train Loss 0.326, MAE 12.130, RMSE 15.706 | Val Loss 0.381, MAE 13.169, RMSE 16.853
Epoch 11 | Train Loss 0.324, MAE 12.088, RMSE 15.6

In [22]:
test_loss, test_mae, test_rmse = run_epoch(test_loader, train=False, model=model_b)
print(f"Test Loss {test_loss:.3f}, MAE {test_mae:.3f}, RMSE {test_rmse:.3f}")


Test Loss 0.351, MAE 12.663, RMSE 16.391


In [23]:
# Using your train split


def evaluate_constant_baseline(Y_true, const_pred):
    const = np.tile(const_pred, (Y_true.shape[0], 1))
    mae = mean_absolute_error(Y_true, const)
    rmse = math.sqrt(mean_squared_error(Y_true, const))
    return mae, rmse

baseline_mae, baseline_rmse = evaluate_constant_baseline(Y_test_raw, train_mean_scores)
print(f"Constant baseline | MAE {baseline_mae:.3f}, RMSE {baseline_rmse:.3f}")



Constant baseline | MAE 16.971, RMSE 22.351


In [24]:
def get_predictions(model, loader):
    model.eval()
    all_true = []
    all_pred = []
    with torch.no_grad():
        for x_home, x_away, y, home_idx, away_idx in loader:
            x_home = x_home.to(device)
            x_away = x_away.to(device)
            y = y.to(device)  # scaled
            home_idx = home_idx.to(device)
            away_idx = away_idx.to(device)

            y_pred = model(x_home, x_away, home_idx, away_idx)
            all_true.append(y.cpu().numpy())
            all_pred.append(y_pred.cpu().numpy())

    all_true = np.concatenate(all_true, axis=0)  # scaled
    all_pred = np.concatenate(all_pred, axis=0)  # scaled

    # --- NEW: unscale before returning ---
    all_true_unscaled = y_scaler.inverse_transform(all_true)
    all_pred_unscaled = y_scaler.inverse_transform(all_pred)

    return all_true_unscaled, all_pred_unscaled


Y_true_test, Y_pred_a = get_predictions(model_a, test_loader)
_, Y_pred_b = get_predictions(model_b, test_loader)

# True margin/total
true_margin = Y_true_test[:, 0]
true_total  = Y_true_test[:, 1]

# Reconstruct true scores
true_home = (true_total + true_margin) / 2
true_away = (true_total - true_margin) / 2

# Model A margin/total
pred_margin_a = Y_pred_a[:, 0]
pred_total_a  = Y_pred_a[:, 1]
pred_home_a   = (pred_total_a + pred_margin_a) / 2
pred_away_a   = (pred_total_a - pred_margin_a) / 2

# Model B margin/total
pred_margin_b = Y_pred_b[:, 0]
pred_total_b  = Y_pred_b[:, 1]
pred_home_b   = (pred_total_b + pred_margin_b) / 2
pred_away_b   = (pred_total_b - pred_margin_b) / 2


def winner_accuracy(y_true, y_pred):
    true_margin = y_true[:, 0]
    pred_margin = y_pred[:, 0]
    return ((true_margin > 0) == (pred_margin > 0)).mean()

def margin_accuracy(y_true, y_pred):
    true_margin = y_true[:, 0]
    pred_margin = y_pred[:, 0]
    return (np.abs(true_margin - pred_margin) < 5).mean()

def totals_accuracy(y_true, y_pred):
    true_total = y_true[:, 1]
    pred_total = y_pred[:, 1]
    return (np.abs(true_total - pred_total) < 5).mean()


acc_a = winner_accuracy(Y_true_test, Y_pred_a)
acc_b = winner_accuracy(Y_true_test, Y_pred_b)
print(f"Model A winner accuracy: {acc_a:.3%}")
print(f"Model B winner accuracy: {acc_b:.3%}")

margin_a = margin_accuracy(Y_true_test, Y_pred_a)
margin_b = margin_accuracy(Y_true_test, Y_pred_b)
print(f"Model A margin accuracy (within 5 points): {margin_a:.3%}")
print(f"Model B margin accuracy (within 5 points): {margin_b:.3%}")

total_a = totals_accuracy(Y_true_test, Y_pred_a)
total_b = totals_accuracy(Y_true_test, Y_pred_b)
print(f"Model A totals accuracy (within 5 points): {total_a:.3%}")
print(f"Model B totals accuracy (within 5 points): {total_b:.3%}")


Model A winner accuracy: 61.594%
Model B winner accuracy: 61.522%
Model A margin accuracy (within 5 points): 30.145%
Model B margin accuracy (within 5 points): 28.986%
Model A totals accuracy (within 5 points): 23.043%
Model B totals accuracy (within 5 points): 22.246%


In [25]:
df_betting = pd.read_csv(f'{DATA_DIR}/nba_2008-2025.csv')
df_betting['game_date'] = pd.to_datetime(df_betting['date'])
df_betting = df_betting[df_betting['game_date'] >= ERA_START].reset_index(drop=True)

team_df = pd.read_csv(os.path.join(DATA_DIR, 'team.csv'))

In [26]:
team_map = dict(zip(team_df['id'], team_df['nickname']))

test_indices = np.where(test_mask)[0]
test_game_indices = []

# Rebuild to track which games_full indices match test_mask
idx = 0
for i, row in games_full.iterrows():
    gid = row[GAME_ID_COL]
    home_id = row["home_team_id"]
    away_id = row["away_team_id"]
    date = row[GAME_DATE_COL]

    key_home = (gid, home_id)
    key_away = (gid, away_id)

    if key_home not in seq_index_by_game_team or key_away not in seq_index_by_game_team:
        continue
    
    # This game is in our dataset
    if pd.to_datetime(date) >= TEST_SPLIT_DATE:
        test_game_indices.append(i)
    
    idx += 1

# Build test_predictions_df with correct games_full rows
test_predictions_df = pd.DataFrame({
    'game_date': games_full.iloc[test_game_indices][GAME_DATE_COL].values,
    'home_team': [team_map.get(games_full.iloc[i]['home_team_id'], 'UNK') 
                  for i in test_game_indices],
    'away_team': [team_map.get(games_full.iloc[i]['away_team_id'], 'UNK') 
                  for i in test_game_indices],
    'y_home': true_home,
    'y_away': true_away,
    'pred_home_a': pred_home_a,
    'pred_away_a': pred_away_a,
    'pred_home_b': pred_home_b,
    'pred_away_b': pred_away_b,
}).reset_index(drop=True)


In [27]:
abbreviation_mapping = {
    'atl': 'Hawks',
    'bos': 'Celtics',
    'bkn': 'Nets',
    'cha': 'Hornets',
    'chi': 'Bulls',
    'cle': 'Cavaliers',
    'dal': 'Mavericks',
    'den': 'Nuggets',
    'det': 'Pistons',
    'gs': 'Warriors',
    'hou': 'Rockets',
    'ind': 'Pacers',
    'lac': 'Clippers',
    'lal': 'Lakers',
    'mem': 'Grizzlies',
    'mia': 'Heat',
    'mil': 'Bucks',
    'min': 'Timberwolves',
    'no': 'Pelicans',
    'ny': 'Knicks',
    'okc': 'Thunder',
    'orl': 'Magic',
    'phi': '76ers',
    'phx': 'Suns',
    'por': 'Trail Blazers',
    'sac': 'Kings',
    'sa': 'Spurs',
    'tor': 'Raptors',
    'utah': 'Jazz',
    'wsh': 'Wizards'
}

df_betting['away_team'] = df_betting['away'].map(abbreviation_mapping)
df_betting['home_team'] = df_betting['home'].map(abbreviation_mapping)
# Rename date to game_date to match test_predictions_df
df_betting['game_date'] = pd.to_datetime(df_betting['date'])
# Convert the spread to negative if the away team is favored
df_betting['spread'] = df_betting.apply(
    lambda row: -row['spread'] if 'away' == row['whos_favored'] else row['spread'],
    axis=1
)

# Print modified columns for verification
print(df_betting[['game_date', 'whos_favored', 'spread', 'home_team', 'away_team']])

       game_date whos_favored  spread      home_team away_team
0     2010-10-26         away    -1.0        Celtics      Heat
1     2010-10-26         home     7.0  Trail Blazers      Suns
2     2010-10-26         home     6.5         Lakers   Rockets
3     2010-10-27         home     4.0           Nets   Pistons
4     2010-10-27         away    -4.5      Cavaliers   Celtics
...          ...          ...     ...            ...       ...
19170 2025-06-11         away    -4.5         Pacers   Thunder
19171 2025-06-13         away    -6.5         Pacers   Thunder
19172 2025-06-16         home     8.5        Thunder    Pacers
19173 2025-06-19         away    -5.5         Pacers   Thunder
19174 2025-06-22         home     6.5        Thunder    Pacers

[19175 rows x 5 columns]


In [28]:
# Merge betting data with test predictions on date, home_team, away_team
merged_df = pd.merge(
    test_predictions_df,
    df_betting,
    on=['game_date', 'home_team', 'away_team'],
    how='inner'
)
print("Merged betting data shape:", merged_df.shape)

Merged betting data shape: (1314, 36)


In [29]:
def predict_betting_results(df):
    """
    The spread will always be from the home team's perspective, if it is negative the away team is favored.
    The total is the combined score of both teams.

    Returns:
        spread_record_a: wins-losses-pushes for model A spread bets
        total_record_a: wins-losses-pushes for model A total bets
        spread_record_b: wins-losses-pushes for model B spread bets
        total_record_b: wins-losses-pushes for model B total bets
    """
    for game in df:
        # Model A predictions
        pred_home_a = game['pred_home_a']
        pred_away_a = game['pred_away_a']
        pred_margin_a = pred_home_a - pred_away_a
        pred_total_a = pred_home_a + pred_away_a

        # Model B predictions
        pred_home_b = game['pred_home_b']
        pred_away_b = game['pred_away_b']
        pred_margin_b = pred_home_b - pred_away_b
        pred_total_b = pred_home_b + pred_away_b

        # Actual results
        actual_home = game['y_home']
        actual_away = game['y_away']
        actual_margin = actual_home - actual_away
        actual_total = actual_home + actual_away

        spread = game['spread']
        total_line = game['total']

        # Spread bet results for Model A
        if (spread) > 0: # Home team Favoured
            if (actual_margin - spread) > 0: # If home team covers
                if (pred_margin_a - spread) > 0: # Predicted home team covers
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            elif (actual_margin - spread) < 0: # If home team fails to cover
                if (pred_margin_a - spread) < 0: # Predicted home team fails to cover
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            else: # Push
                game['spread_result_a'] = 'P'
        elif (spread) < 0: # Away team Favoured
            if (actual_margin - spread) < 0: # If away team covers
                if (pred_margin_a - spread) < 0: # Predicted away team covers
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            elif (actual_margin - spread) > 0: # If away team fails to cover
                if (pred_margin_a - spread) > 0: # Predicted away team fails to cover
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            else: # Push
                game['spread_result_a'] = 'P'
        else: # Spread is 0, no favorite
            if (actual_margin and pred_margin_a) > 0: # Home team wins, predicted home team wins
                game['spread_result_a'] = 'W'
            elif (actual_margin and pred_margin_a) < 0: # Away team wins, predicted away team wins
                game['spread_result_a'] = 'W'
            else: # One team wins, predicted the other team wins
                game['spread_result_a'] = 'L'


         # Spread bet results for Model B
        if (spread) > 0: # Home team Favoured
            if (actual_margin - spread) > 0: # If home team covers
                if (pred_margin_b - spread) > 0: # Predicted home team covers
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            elif (actual_margin - spread) < 0: # If home team fails to cover
                if (pred_margin_b - spread) < 0: # Predicted home team fails to cover
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            else: # Push
                game['spread_result_b'] = 'P'
        elif (spread) < 0: # Away team Favoured
            if (actual_margin - spread) < 0: # If away team covers
                if (pred_margin_b - spread) < 0: # Predicted away team covers
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            elif (actual_margin - spread) > 0: # If away team fails to cover
                if (pred_margin_b - spread) > 0: # Predicted away team fails to cover
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            else: # Push
                game['spread_result_b'] = 'P'
        else: # Spread is 0, no favorite
            if (actual_margin and pred_margin_b) > 0: # Home team wins, predicted home team wins
                game['spread_result_b'] = 'W'
            elif (actual_margin and pred_margin_b) < 0: # Away team wins, predicted away team wins
                game['spread_result_b'] = 'W'
            else: # One team wins, predicted the other team wins
                game['spread_result_b'] = 'L'

        # Total bet results for Model A
        if (actual_total > total_line): # Over pays
            if (pred_total_a > total_line):
                game['total_result_a'] = 'W'
            else:
                game['total_result_a'] = 'L'
        elif (actual_total < total_line): # Under Pays
            if (pred_total_a < total_line):
                game['total_result_a'] = 'W'
            else:
                game['total_result_a'] = 'L'
        else: # Push = Exactly on the total line, bet refunded
            game['total_result_a'] = 'P'

        # Total bet results for Model B
        if (actual_total > total_line): # Over pays
            if (pred_total_b > total_line):
                game['total_result_b'] = 'W'
            else:
                game['total_result_b'] = 'L'
        elif (actual_total < total_line): # Under Pays
            if (pred_total_b < total_line):
                game['total_result_b'] = 'W'
            else:
                game['total_result_b'] = 'L'
        else: # Push = Exactly on the total line, bet refunded
            game['total_result_b'] = 'P'
            
    # Calculate records
    spread_record_a = {'W': 0, 'L': 0, 'P': 0}
    total_record_a = {'W': 0, 'L': 0, 'P': 0}
    spread_record_b = {'W': 0, 'L': 0, 'P': 0}
    total_record_b = {'W': 0, 'L': 0, 'P': 0}
    for game in df:
        spread_record_a[game['spread_result_a']] += 1
        total_record_a[game['total_result_a']] += 1
        spread_record_b[game['spread_result_b']] += 1
        total_record_b[game['total_result_b']] += 1
    return spread_record_a, total_record_a, spread_record_b, total_record_b

spread_record_a, total_record_a, spread_record_b, total_record_b = predict_betting_results(merged_df.to_dict('records'))
print("Model A Spread Record (W-L-P):", spread_record_a)
print("Model A Total Record (W-L-P):", total_record_a)
print("Model B Spread Record (W-L-P):", spread_record_b)
print("Model B Total Record (W-L-P):", total_record_b)
    

Model A Spread Record (W-L-P): {'W': 648, 'L': 653, 'P': 13}
Model A Total Record (W-L-P): {'W': 697, 'L': 608, 'P': 9}
Model B Spread Record (W-L-P): {'W': 664, 'L': 637, 'P': 13}
Model B Total Record (W-L-P): {'W': 677, 'L': 628, 'P': 9}


In [30]:
# create a summary betting DF
betting_summary = pd.DataFrame({
    'Model': ['A', 'A', 'B', 'B'],
    'Bet Type': ['Spread', 'Total', 'Spread', 'Total'],
    'Wins': [spread_record_a['W'], total_record_a['W'], spread_record_b['W'], total_record_b['W']],
    'Losses': [spread_record_a['L'], total_record_a['L'], spread_record_b['L'], total_record_b['L']],
    'Pushes': [spread_record_a['P'], total_record_a['P'], spread_record_b['P'], total_record_b['P']],
    'Win Ratio': [
        spread_record_a['W'] / (spread_record_a['W'] + spread_record_a['L']),
        total_record_a['W'] / (total_record_a['W'] + total_record_a['L']),
        spread_record_b['W'] / (spread_record_b['W'] + spread_record_b['L']),
        total_record_b['W'] / (total_record_b['W'] + total_record_b ['L']),
    ]
})
print(betting_summary)

  Model Bet Type  Wins  Losses  Pushes  Win Ratio
0     A   Spread   648     653      13   0.498078
1     A    Total   697     608       9   0.534100
2     B   Spread   664     637      13   0.510377
3     B    Total   677     628       9   0.518774
