In [1]:
# 1. Imports & config

import os
import math
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [2]:

DATA_DIR = "./data/csv"

# Hyperparameters
SEQ_LEN = 10              # number of past games per team
BATCH_SIZE = 64
HIDDEN_SIZE = 128
NUM_LAYERS = 1
LR = 1e-3
EPOCHS = 10
ERA_START = pd.to_datetime("2007-10-01")
VAL_SPLIT_DATE = "2021-10-01"
TEST_SPLIT_DATE = "2022-10-01"
RANDOM_SEED = 42

torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


In [3]:
files = [
    "common_player_info.csv",
    "game_info.csv",
    "officials.csv",
    "team.csv",
    "draft_combine_stats.csv",
    "game_summary.csv",
    "other_stats.csv",
    "team_details.csv",
    "draft_history.csv",
    "inactive_players.csv",
    "play_by_play.csv",
    "team_history.csv",
    "game.csv",
    "line_score.csv",
    "player.csv",
    "team_info_common.csv",
]

for fname in files:
    path = os.path.join(DATA_DIR, fname)
    df = pd.read_csv(path)
    print(f"\n=== {fname} ===")
    print("shape:", df.shape)
    print("columns:", list(df.columns)[:15], "...")


=== common_player_info.csv ===
shape: (4171, 33)
columns: ['person_id', 'first_name', 'last_name', 'display_first_last', 'display_last_comma_first', 'display_fi_last', 'player_slug', 'birthdate', 'school', 'country', 'last_affiliation', 'height', 'weight', 'season_exp', 'jersey'] ...

=== game_info.csv ===
shape: (58053, 4)
columns: ['game_id', 'game_date', 'attendance', 'game_time'] ...

=== officials.csv ===
shape: (70971, 5)
columns: ['game_id', 'official_id', 'first_name', 'last_name', 'jersey_num'] ...

=== team.csv ===
shape: (30, 7)
columns: ['id', 'full_name', 'abbreviation', 'nickname', 'city', 'state', 'year_founded'] ...

=== draft_combine_stats.csv ===
shape: (1202, 47)
columns: ['season', 'player_id', 'first_name', 'last_name', 'player_name', 'position', 'height_wo_shoes', 'height_wo_shoes_ft_in', 'height_w_shoes', 'height_w_shoes_ft_in', 'weight', 'wingspan', 'wingspan_ft_in', 'standing_reach', 'standing_reach_ft_in'] ...

=== game_summary.csv ===
shape: (58110, 14)
colu

In [4]:
# Core tables for modeling
games = pd.read_csv(os.path.join(DATA_DIR, "game.csv"))
game_info = pd.read_csv(os.path.join(DATA_DIR, "game_info.csv"))
other_stats = pd.read_csv(os.path.join(DATA_DIR, "other_stats.csv"))

print("games:", games.shape)
print("game_info:", game_info.shape)
print("other_stats:", other_stats.shape)

games: (65698, 55)
game_info: (58053, 4)
other_stats: (28271, 26)


In [5]:
# Column config for our pipeline
GAME_ID_COL = "game_id"
GAME_DATE_COL = "game_date"
HOME_TEAM_COL = "team_id_home"
AWAY_TEAM_COL = "team_id_away"
PTS_HOME_COL = "pts_home"
PTS_AWAY_COL = "pts_away"

# Make sure game_date is datetime
games[GAME_DATE_COL] = pd.to_datetime(games[GAME_DATE_COL])
game_info["game_date"] = pd.to_datetime(game_info["game_date"])

# Keep only modern-era games
mask_games = games[GAME_DATE_COL] >= ERA_START
games = games.loc[mask_games].reset_index(drop=True)

# Match game_info to the same window
mask_info = game_info["game_date"] >= ERA_START
game_info = game_info.loc[mask_info].reset_index(drop=True)

In [6]:
# 1) Select home/away feature columns, EXCLUDING the team_id columns
home_feature_cols = [
    c for c in games.columns
    if c.endswith("_home") and c != HOME_TEAM_COL
]

away_feature_cols = [
    c for c in games.columns
    if c.endswith("_away") and c != AWAY_TEAM_COL
]

print("Num home_feature_cols:", len(home_feature_cols))
print("Num away_feature_cols:", len(away_feature_cols))

# 2) Home rows
home_df = games[[GAME_ID_COL, GAME_DATE_COL, HOME_TEAM_COL] + home_feature_cols].copy()
home_df = home_df.rename(columns={HOME_TEAM_COL: "team_id"})
home_df["is_home"] = 1

for col in home_feature_cols:
    base = col.replace("_home", "")
    home_df[base] = home_df[col]

home_df["y_points"] = home_df[PTS_HOME_COL]

# 3) Away rows
away_df = games[[GAME_ID_COL, GAME_DATE_COL, AWAY_TEAM_COL] + away_feature_cols].copy()
away_df = away_df.rename(columns={AWAY_TEAM_COL: "team_id"})
away_df["is_home"] = 0

for col in away_feature_cols:
    base = col.replace("_away", "")
    away_df[base] = away_df[col]

away_df["y_points"] = away_df[PTS_AWAY_COL]

# 4) Keep only unified columns
keep_cols = [GAME_ID_COL, GAME_DATE_COL, "team_id", "is_home", "y_points"]
base_feature_names = sorted(
    {c.replace("_home", "").replace("_away", "") for c in home_feature_cols + away_feature_cols}
)
keep_cols += base_feature_names

home_df = home_df[keep_cols].copy()
away_df = away_df[keep_cols].copy()

# 5) Combine into team_games
team_games = pd.concat([home_df, away_df], axis=0).reset_index(drop=True)
team_games = team_games.sort_values(["team_id", GAME_DATE_COL]).reset_index(drop=True)

print("team_games initial:", team_games.shape)
team_games.head()

Num home_feature_cols: 24
Num away_feature_cols: 24
team_games initial: (41126, 29)


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,pf,plus_minus,pts,reb,stl,team_abbreviation,team_name,tov,video_available,wl
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,20.0,-27,80.0,36.0,7.0,MTA,Tel Aviv Maccabi Electra,9.0,0,L
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,27.0,-17,94.0,43.0,4.0,MTA,Tel Aviv Maccabi Electra,21.0,0,L
2,10700067,2007-10-18,45,1,92.0,15.0,4.0,22.0,0.419,31.0,...,19.0,-24,92.0,32.0,2.0,CHN,China Team China,17.0,0,L
3,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,30.0,-38,70.0,37.0,10.0,MAC,Haifa Maccabi Haifa,21.0,0,L
4,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,22.0,-8,100.0,38.0,9.0,MAC,Haifa Maccabi Haifa,14.0,0,L


In [7]:
# --- Tier 2: merge game_info (attendance + game_hour) ---

# Parse game_time to hour-of-day
game_info["game_hour"] = pd.to_datetime(
    game_info["game_time"],
    format="%I:%M %p",
    errors="coerce"
).dt.hour

# Keep only what we need
game_info_small = game_info[[GAME_ID_COL, "attendance", "game_hour"]].copy()

print("game_info_small sample:")
print(game_info_small.head())

# Merge into team_games
team_games = team_games.merge(
    game_info_small,
    on=GAME_ID_COL,
    how="left"
)

print("team_games after game_info:", team_games.shape)
team_games.head()


game_info_small sample:
    game_id  attendance  game_hour
0  20700001     18797.0        NaN
1  20700002     18997.0        NaN
2  20700003     19832.0        NaN
3  20700006     16212.0        NaN
4  20700009     17538.0        NaN
team_games after game_info: (41158, 31)


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,pts,reb,stl,team_abbreviation,team_name,tov,video_available,wl,attendance,game_hour
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,80.0,36.0,7.0,MTA,Tel Aviv Maccabi Electra,9.0,0,L,20562.0,
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,94.0,43.0,4.0,MTA,Tel Aviv Maccabi Electra,21.0,0,L,15915.0,
2,10700067,2007-10-18,45,1,92.0,15.0,4.0,22.0,0.419,31.0,...,92.0,32.0,2.0,CHN,China Team China,17.0,0,L,14125.0,
3,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,70.0,37.0,10.0,MAC,Haifa Maccabi Haifa,21.0,0,L,5174.0,
4,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,100.0,38.0,9.0,MAC,Haifa Maccabi Haifa,14.0,0,L,,


In [8]:
# --- Tier 3: merge other_stats (advanced team stats) ---

print("other_stats sample:")
print(other_stats.head())

# Game-level columns that apply to the whole game
game_level_cols = []
for col in ["lead_changes", "times_tied"]:
    if col in other_stats.columns:
        game_level_cols.append(col)

# Home/away advanced stat columns (excluding id/label columns)
home_stat_cols = [
    c for c in other_stats.columns
    if c.endswith("_home")
    and not c.startswith(("team_id_", "team_abbreviation_", "team_city_"))
]

away_stat_cols = [
    c for c in other_stats.columns
    if c.endswith("_away")
    and not c.startswith(("team_id_", "team_abbreviation_", "team_city_"))
]

print("home_stat_cols:", home_stat_cols)
print("away_stat_cols:", away_stat_cols)
print("game_level_cols:", game_level_cols)

# 4.1 Home advanced stats → unified format
home_adv = other_stats[["game_id", "team_id_home"] + game_level_cols + home_stat_cols].copy()
home_adv = home_adv.rename(columns={"team_id_home": "team_id"})

for col in home_stat_cols:
    base = col.replace("_home", "")
    home_adv[base] = home_adv[col]

home_keep_cols = ["game_id", "team_id"] + game_level_cols + [c.replace("_home", "") for c in home_stat_cols]
home_adv = home_adv[home_keep_cols]

# 4.2 Away advanced stats → unified format
away_adv = other_stats[["game_id", "team_id_away"] + game_level_cols + away_stat_cols].copy()
away_adv = away_adv.rename(columns={"team_id_away": "team_id"})

for col in away_stat_cols:
    base = col.replace("_away", "")
    away_adv[base] = away_adv[col]

away_keep_cols = ["game_id", "team_id"] + game_level_cols + [c.replace("_away", "") for c in away_stat_cols]
away_adv = away_adv[away_keep_cols]

print("home_adv shape:", home_adv.shape)
print("away_adv shape:", away_adv.shape)

# 4.3 Combine advanced stats
adv_long = pd.concat([home_adv, away_adv], axis=0).reset_index(drop=True)
print("adv_long shape:", adv_long.shape)
print(adv_long.head())

# 4.4 Merge advanced stats into team_games
team_games = team_games.merge(
    adv_long,
    on=["game_id", "team_id"],
    how="left"
)

print("team_games after other_stats:", team_games.shape)
team_games.head()


other_stats sample:
    game_id  league_id  team_id_home team_abbreviation_home team_city_home  \
0  29600012          0    1610612756                    PHX        Phoenix   
1  29600005          0    1610612737                    ATL        Atlanta   
2  29600002          0    1610612739                    CLE      Cleveland   
3  29600007          0    1610612754                    IND        Indiana   
4  29600013          0    1610612746                    LAC    Los Angeles   

   pts_paint_home  pts_2nd_chance_home  pts_fb_home  largest_lead_home  \
0              44                   18            2                  1   
1              32                    9            6                  0   
2              36                   14            6                 20   
3              34                   11            4                 10   
4              40                   19            2                 12   

   lead_changes  ...  team_abbreviation_away  team_city_away  pts_

Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,lead_changes,times_tied,pts_paint,pts_2nd_chance,pts_fb,largest_lead,team_turnovers,total_turnovers,team_rebounds,pts_off_to
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,1.0,3.0,32.0,12.0,3.0,3.0,0.0,9.0,5.0,8.0
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,0.0,0.0,54.0,15.0,12.0,0.0,1.0,21.0,7.0,28.0
2,10700067,2007-10-18,45,1,92.0,15.0,4.0,22.0,0.419,31.0,...,,,,,,,,,,
3,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,3.0,1.0,28.0,8.0,17.0,3.0,0.0,21.0,10.0,29.0
4,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,,,,,,,,,,


In [9]:
# --- Compute SEQ_FEATURES and scale ---

exclude_cols = {GAME_ID_COL, GAME_DATE_COL, "team_id", "y_points"}
numeric_cols = [
    c for c in team_games.columns
    if c not in exclude_cols and pd.api.types.is_numeric_dtype(team_games[c])
]

SEQ_FEATURES = numeric_cols
print("Number of sequence features:", len(SEQ_FEATURES))
print("First 30 SEQ_FEATURES:", SEQ_FEATURES[:30])

train_rows = team_games[team_games[GAME_DATE_COL] < VAL_SPLIT_DATE].copy()

scaler = StandardScaler()
scaler.fit(train_rows[SEQ_FEATURES].fillna(0.0))

team_games[SEQ_FEATURES] = scaler.transform(
    team_games[SEQ_FEATURES].fillna(0.0)
)

team_games.head()


Number of sequence features: 33
First 30 SEQ_FEATURES: ['is_home', 'ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga', 'fgm', 'ft_pct', 'fta', 'ftm', 'oreb', 'pf', 'plus_minus', 'pts', 'reb', 'stl', 'tov', 'video_available', 'attendance', 'game_hour', 'lead_changes', 'times_tied', 'pts_paint', 'pts_2nd_chance', 'pts_fb', 'largest_lead', 'team_turnovers']


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,lead_changes,times_tied,pts_paint,pts_2nd_chance,pts_fb,largest_lead,team_turnovers,total_turnovers,team_rebounds,pts_off_to
0,11400003,2014-10-05,41,-1.0,80.0,-0.624866,0.854613,-0.797113,-0.909313,0.765658,...,-0.771294,-0.290805,-0.206005,0.17079,-0.981192,-0.743055,-0.666574,-0.454101,-0.456147,-0.676603
1,11400011,2014-10-07,41,-1.0,94.0,-1.377658,0.061927,-0.075189,-1.164628,0.209982,...,-0.957313,-1.007046,0.969903,0.605478,0.12115,-1.057336,0.645311,1.390861,0.021448,1.726232
2,10700067,2007-10-18,45,1.0,92.0,-1.377658,-0.334417,-1.879998,0.613124,0.765658,...,-0.957313,-1.007046,-1.916416,-1.567964,-1.348639,-1.057336,-0.666574,-1.837822,-1.650134,-1.637738
3,11000002,2010-10-03,93,-1.0,70.0,-1.565856,-1.127103,-1.338556,-0.360857,-0.567963,...,-0.399256,-0.768299,-0.419806,-0.408795,0.733562,-0.743055,-0.666574,1.390861,0.73784,1.846374
4,11200029,2012-10-11,93,-1.0,100.0,0.127926,0.061927,-1.158075,0.253791,-0.345693,...,-0.957313,-1.007046,-1.916416,-1.567964,-1.348639,-1.057336,-0.666574,-1.837822,-1.650134,-1.637738


In [10]:
team_sequences = []
team_targets = []
team_meta = []  # (game_id, team_id, game_date)

for team_id, group in team_games.groupby("team_id"):
    group = group.sort_values(GAME_DATE_COL).reset_index(drop=True)

    feats = group[SEQ_FEATURES].values           # [num_games, F]
    targets = group["y_points"].values
    game_ids = group[GAME_ID_COL].values
    dates = group[GAME_DATE_COL].values

    # require SEQ_LEN previous games
    for i in range(SEQ_LEN, len(group)):
        seq = feats[i-SEQ_LEN:i]
        y = targets[i]
        gid = game_ids[i]
        date = dates[i]

        team_sequences.append(seq)
        team_targets.append(y)
        team_meta.append((gid, team_id, date))

team_sequences = np.stack(team_sequences)          # [N_team_games, T, F]
team_targets = np.array(team_targets, dtype=np.float32)

print("team_sequences:", team_sequences.shape)
print("team_targets:", team_targets.shape)


team_sequences: (40814, 10, 33)
team_targets: (40814,)


In [11]:
seq_index_by_game_team = {
    (gid, tid): idx
    for idx, (gid, tid, date) in enumerate(team_meta)
}

len(seq_index_by_game_team)


40718

In [12]:
games_full = games[[GAME_ID_COL, GAME_DATE_COL, HOME_TEAM_COL, AWAY_TEAM_COL, PTS_HOME_COL, PTS_AWAY_COL]].copy()

games_full = games_full.rename(columns={
    HOME_TEAM_COL: "home_team_id",
    AWAY_TEAM_COL: "away_team_id",
    PTS_HOME_COL: "y_home",
    PTS_AWAY_COL: "y_away"
})

print("games_full:", games_full.shape)
games_full.head()


games_full: (20563, 6)


Unnamed: 0,game_id,game_date,home_team_id,away_team_id,y_home,y_away
0,20700001,2007-10-30,1610612759,1610612757,106.0,97.0
1,20700002,2007-10-30,1610612747,1610612745,93.0,95.0
2,20700003,2007-10-30,1610612744,1610612762,96.0,117.0
3,20700006,2007-10-31,1610612754,1610612764,119.0,110.0
4,20700009,2007-10-31,1610612763,1610612759,101.0,104.0


In [13]:
X_home = []
X_away = []
Y = []
GAME_DATES = []

for _, row in games_full.iterrows():
    gid = row[GAME_ID_COL]
    home_id = row["home_team_id"]
    away_id = row["away_team_id"]
    date = row[GAME_DATE_COL]

    key_home = (gid, home_id)
    key_away = (gid, away_id)

    if key_home not in seq_index_by_game_team or key_away not in seq_index_by_game_team:
        continue  # skip early games

    idx_h = seq_index_by_game_team[key_home]
    idx_a = seq_index_by_game_team[key_away]

    X_home.append(team_sequences[idx_h])
    X_away.append(team_sequences[idx_a])
    Y.append([row["y_home"], row["y_away"]])
    GAME_DATES.append(date)

X_home = np.stack(X_home)
X_away = np.stack(X_away)
Y = np.array(Y, dtype=np.float32)
GAME_DATES = np.array(GAME_DATES)

print("Final dataset shapes:")
print("X_home:", X_home.shape)
print("X_away:", X_away.shape)
print("Y:", Y.shape)


Final dataset shapes:
X_home: (20326, 10, 33)
X_away: (20326, 10, 33)
Y: (20326, 2)


In [14]:
VAL_SPLIT_DATE = pd.to_datetime(VAL_SPLIT_DATE)
TEST_SPLIT_DATE = pd.to_datetime(TEST_SPLIT_DATE)

dates = pd.to_datetime(GAME_DATES)

train_mask = dates < VAL_SPLIT_DATE
val_mask = (dates >= VAL_SPLIT_DATE) & (dates < TEST_SPLIT_DATE)
test_mask = dates >= TEST_SPLIT_DATE

def split(arr):
    return arr[train_mask], arr[val_mask], arr[test_mask]

X_home_train, X_home_val, X_home_test = split(X_home)
X_away_train, X_away_val, X_away_test = split(X_away)
Y_train, Y_val, Y_test = split(Y)

print("Train:", len(Y_train), "Val:", len(Y_val), "Test:", len(Y_test))



Train: 17561 Val: 1385 Test: 1380


In [15]:
class GameSequenceDataset(Dataset):
    def __init__(self, x_home, x_away, y):
        self.x_home = torch.tensor(x_home, dtype=torch.float32)
        self.x_away = torch.tensor(x_away, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        return self.x_home[idx], self.x_away[idx], self.y[idx]

train_dataset = GameSequenceDataset(X_home_train, X_away_train, Y_train)
val_dataset = GameSequenceDataset(X_home_val, X_away_val, Y_val)
test_dataset = GameSequenceDataset(X_home_test, X_away_test, Y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [16]:
class TeamSequenceEncoder(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x):
        # x: [B, T, F]
        output, (h_n, c_n) = self.lstm(x)
        # output: [B, T, 2H]
        return output
    
class ScorePredictorMLP(nn.Module):
    """
    Model A — LSTM Encoder + Mean Pooling + MLP.
    No cross-attention between teams.
    """
    def __init__(
        self,
        input_size: int,
        hidden_size: int = 128,
        num_layers: int = 1
    ):
        super().__init__()

        self.embed_dim = hidden_size * 2  # BiLSTM output size

        ## Shared sequence encoder
        self.encoder = TeamSequenceEncoder(input_size, hidden_size, num_layers)

        ## MLP prediction head
        self.mlp = nn.Sequential(
            nn.Linear(self.embed_dim * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2)       # predict [home_score, away_score]
        )

    def forward(self, x_home, x_away):
        """
        x_home, x_away: [B, T, F]
        """
        # Encode sequences
        h_home_seq = self.encoder(x_home)  # [B, T, 2H]
        h_away_seq = self.encoder(x_away)  # [B, T, 2H]

        # Temporal mean pooling
        home_vec = h_home_seq.mean(dim=1)  # [B, 2H]
        away_vec = h_away_seq.mean(dim=1)  # [B, 2H]

        # Concatenate and predict
        pair_vec = torch.cat([home_vec, away_vec], dim=-1)  # [B, 4H]
        y_pred = self.mlp(pair_vec)                         # [B, 2]

        return y_pred



class ScorePredictorCrossAttention(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int = 128,
        num_layers: int = 1,
        num_heads: int = 4,
    ):
        super().__init__()
        self.embed_dim = hidden_size * 2  # BiLSTM

        self.encoder = TeamSequenceEncoder(input_size, hidden_size, num_layers)

        self.cross_attn = nn.MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=num_heads,
            batch_first=True,
        )

        self.mlp = nn.Sequential(
            nn.Linear(self.embed_dim * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2),  # predict [y_home, y_away]
        )

    def forward(self, x_home, x_away):
        # Encode sequences
        h_home_seq = self.encoder(x_home)   # [B, T, 2H]
        h_away_seq = self.encoder(x_away)   # [B, T, 2H]

        # Home attends to away
        home_ctx, _ = self.cross_attn(
            query=h_home_seq,
            key=h_away_seq,
            value=h_away_seq,
        )

        # Away attends to home
        away_ctx, _ = self.cross_attn(
            query=h_away_seq,
            key=h_home_seq,
            value=h_home_seq,
        )

        # Pool over time
        home_vec = home_ctx.mean(dim=1)   # [B, 2H]
        away_vec = away_ctx.mean(dim=1)   # [B, 2H]

        pair_vec = torch.cat([home_vec, away_vec], dim=-1)  # [B, 4H]
        y_pred = self.mlp(pair_vec)                         # [B, 2]
        return y_pred


In [17]:
input_size = len(SEQ_FEATURES)

model_a = ScorePredictorMLP(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
).to(device)

model_b = ScorePredictorCrossAttention(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_heads=4,
).to(device)


print(model_a)
print(model_b)


ScorePredictorMLP(
  (encoder): TeamSequenceEncoder(
    (lstm): LSTM(33, 128, batch_first=True, bidirectional=True)
  )
  (mlp): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)
ScorePredictorCrossAttention(
  (encoder): TeamSequenceEncoder(
    (lstm): LSTM(33, 128, batch_first=True, bidirectional=True)
  )
  (cross_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
  )
  (mlp): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)


In [18]:
def run_epoch(loader, train: bool = True, model = None):
    if model == None:
        raise "model not set exception"
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    all_true = []
    all_pred = []

    for x_home, x_away, y in loader:
        x_home = x_home.to(device)
        x_away = x_away.to(device)
        y = y.to(device)

        if train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(train):
            y_pred = model(x_home, x_away)
            loss = criterion(y_pred, y)

            if train:
                loss.backward()
                optimizer.step()

        total_loss += loss.item() * y.size(0)
        all_true.append(y.detach().cpu().numpy())
        all_pred.append(y_pred.detach().cpu().numpy())

    all_true = np.concatenate(all_true, axis=0)
    all_pred = np.concatenate(all_pred, axis=0)

    mae = mean_absolute_error(all_true, all_pred)
    rmse = math.sqrt(mean_squared_error(all_true, all_pred))
    avg_loss = total_loss / len(loader.dataset)

    return avg_loss, mae, rmse


In [19]:
# TRAIN MODEL A

criterion = nn.MSELoss()
optimizer = Adam(model_a.parameters(), lr=LR)

best_val_rmse = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae, train_rmse = run_epoch(train_loader, train=True, model=model_a)
    val_loss, val_mae, val_rmse = run_epoch(val_loader, train=False, model=model_a)

    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss {train_loss:.3f}, MAE {train_mae:.3f}, RMSE {train_rmse:.3f} | "
        f"Val Loss {val_loss:.3f}, MAE {val_mae:.3f}, RMSE {val_rmse:.3f}"
    )

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_state = model_a.state_dict()

model_a.load_state_dict(best_state)


Epoch 01 | Train Loss 1331.748, MAE 21.399, RMSE 36.493 | Val Loss 155.181, MAE 9.910, RMSE 12.457
Epoch 02 | Train Loss 137.242, MAE 9.277, RMSE 11.715 | Val Loss 153.081, MAE 9.840, RMSE 12.373
Epoch 03 | Train Loss 135.847, MAE 9.234, RMSE 11.655 | Val Loss 152.389, MAE 9.818, RMSE 12.345
Epoch 04 | Train Loss 135.002, MAE 9.208, RMSE 11.619 | Val Loss 152.649, MAE 9.832, RMSE 12.355
Epoch 05 | Train Loss 134.312, MAE 9.181, RMSE 11.589 | Val Loss 149.851, MAE 9.735, RMSE 12.241
Epoch 06 | Train Loss 133.342, MAE 9.169, RMSE 11.547 | Val Loss 153.683, MAE 9.853, RMSE 12.397
Epoch 07 | Train Loss 132.024, MAE 9.121, RMSE 11.490 | Val Loss 153.456, MAE 9.867, RMSE 12.388
Epoch 08 | Train Loss 131.309, MAE 9.097, RMSE 11.459 | Val Loss 149.366, MAE 9.713, RMSE 12.222
Epoch 09 | Train Loss 130.908, MAE 9.085, RMSE 11.441 | Val Loss 149.509, MAE 9.719, RMSE 12.227
Epoch 10 | Train Loss 129.336, MAE 9.033, RMSE 11.373 | Val Loss 147.509, MAE 9.642, RMSE 12.145


<All keys matched successfully>

In [20]:
test_loss, test_mae, test_rmse = run_epoch(test_loader, train=False, model=model_a)
print(f"Test Loss {test_loss:.3f}, MAE {test_mae:.3f}, RMSE {test_rmse:.3f}")

Test Loss 138.168, MAE 9.309, RMSE 11.754


In [21]:
# TRAIN MODEL B

criterion = nn.MSELoss()
optimizer = Adam(model_b.parameters(), lr=LR)

best_val_rmse = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae, train_rmse = run_epoch(train_loader, train=True, model=model_b)
    val_loss, val_mae, val_rmse = run_epoch(val_loader, train=False, model=model_b)

    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss {train_loss:.3f}, MAE {train_mae:.3f}, RMSE {train_rmse:.3f} | "
        f"Val Loss {val_loss:.3f}, MAE {val_mae:.3f}, RMSE {val_rmse:.3f}"
    )

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_state = model_b.state_dict()

if best_state is not None:
    model_b.load_state_dict(best_state)
    print(f"Loaded best model_b (val RMSE = {best_val_rmse:.3f})")


Epoch 01 | Train Loss 791.347, MAE 16.180, RMSE 28.131 | Val Loss 150.990, MAE 9.753, RMSE 12.288
Epoch 02 | Train Loss 137.274, MAE 9.325, RMSE 11.716 | Val Loss 163.879, MAE 10.167, RMSE 12.802
Epoch 03 | Train Loss 137.099, MAE 9.304, RMSE 11.709 | Val Loss 164.051, MAE 10.214, RMSE 12.808
Epoch 04 | Train Loss 136.054, MAE 9.279, RMSE 11.664 | Val Loss 152.127, MAE 9.807, RMSE 12.334
Epoch 05 | Train Loss 139.417, MAE 9.389, RMSE 11.807 | Val Loss 156.066, MAE 9.936, RMSE 12.493
Epoch 06 | Train Loss 137.600, MAE 9.334, RMSE 11.730 | Val Loss 159.442, MAE 10.057, RMSE 12.627
Epoch 07 | Train Loss 131.540, MAE 9.109, RMSE 11.469 | Val Loss 185.147, MAE 10.902, RMSE 13.607
Epoch 08 | Train Loss 129.785, MAE 9.042, RMSE 11.392 | Val Loss 151.675, MAE 9.768, RMSE 12.316
Epoch 09 | Train Loss 128.263, MAE 9.007, RMSE 11.325 | Val Loss 143.504, MAE 9.479, RMSE 11.979
Epoch 10 | Train Loss 130.125, MAE 9.054, RMSE 11.407 | Val Loss 169.598, MAE 10.368, RMSE 13.023
Loaded best model_b (val

In [22]:
test_loss, test_mae, test_rmse = run_epoch(test_loader, train=False, model=model_b)
print(f"Test Loss {test_loss:.3f}, MAE {test_mae:.3f}, RMSE {test_rmse:.3f}")


Test Loss 165.408, MAE 10.196, RMSE 12.861


In [23]:
# Using your train split
train_mean_scores = Y_train.mean(axis=0)   # [mean_home, mean_away]
print("Train mean scores (home, away):", train_mean_scores)

def evaluate_constant_baseline(Y_true, const_pred):
    const = np.tile(const_pred, (Y_true.shape[0], 1))
    mae = mean_absolute_error(Y_true, const)
    rmse = math.sqrt(mean_squared_error(Y_true, const))
    return mae, rmse

baseline_mae, baseline_rmse = evaluate_constant_baseline(Y_test, train_mean_scores)
print(f"Constant baseline | MAE {baseline_mae:.3f}, RMSE {baseline_rmse:.3f}")


Train mean scores (home, away): [104.517685 101.78714 ]
Constant baseline | MAE 13.357, RMSE 16.540


In [24]:
def get_predictions(model, loader):
    model.eval()
    all_true = []
    all_pred = []
    with torch.no_grad():
        for x_home, x_away, y in loader:
            x_home = x_home.to(device)
            x_away = x_away.to(device)
            y = y.to(device)

            y_pred = model(x_home, x_away)
            all_true.append(y.cpu().numpy())
            all_pred.append(y_pred.cpu().numpy())
    return np.concatenate(all_true, axis=0), np.concatenate(all_pred, axis=0)

# True/pred for model A
Y_true_test, Y_pred_a = get_predictions(model_a, test_loader)
_, Y_pred_b = get_predictions(model_b, test_loader)

def winner_accuracy(y_true, y_pred):
    true_margin = y_true[:, 0] - y_true[:, 1]   # home - away
    pred_margin = y_pred[:, 0] - y_pred[:, 1]

    true_winner = (true_margin > 0).astype(int)
    pred_winner = (pred_margin > 0).astype(int)

    acc = (true_winner == pred_winner).mean()
    return acc

def margin_accuracy(y_true, y_pred):
    true_margin = y_true[:, 0] - y_true[:, 1]   # home - away
    pred_margin = y_pred[:, 0] - y_pred[:, 1]
    
    margin_diff = np.abs(true_margin - pred_margin)
    correct_margin = (margin_diff < 5).mean()  # within 5 points
    return correct_margin

def totals_accuracy(y_true, y_pred):
    true_total = y_true[:, 0] + y_true[:, 1]
    pred_total = y_pred[:, 0] + y_pred[:, 1]

    total_diff = np.abs(true_total - pred_total)
    correct_total = (total_diff < 5).mean()  # within 5 points
    return correct_total

acc_a = winner_accuracy(Y_true_test, Y_pred_a)
acc_b = winner_accuracy(Y_true_test, Y_pred_b)
print(f"Model A winner accuracy: {acc_a:.3%}")
print(f"Model B winner accuracy: {acc_b:.3%}")

margin_a = margin_accuracy(Y_true_test, Y_pred_a)
margin_b = margin_accuracy(Y_true_test, Y_pred_b)
print(f"Model A margin accuracy (within 5 points): {margin_a:.3%}")
print(f"Model B margin accuracy (within 5 points): {margin_b:.3%}")

total_a = totals_accuracy(Y_true_test, Y_pred_a)
total_b = totals_accuracy(Y_true_test, Y_pred_b)
print(f"Model A totals accuracy (within 5 points): {total_a:.3%}")
print(f"Model B totals accuracy (within 5 points): {total_b:.3%}")




Model A winner accuracy: 59.783%
Model B winner accuracy: 57.391%
Model A margin accuracy (within 5 points): 25.507%
Model B margin accuracy (within 5 points): 27.464%
Model A totals accuracy (within 5 points): 23.406%
Model B totals accuracy (within 5 points): 18.333%


In [25]:
df_betting = pd.read_csv(f'{DATA_DIR}/nba_2008-2025.csv')
df_betting['game_date'] = pd.to_datetime(df_betting['date'])
df_betting = df_betting[df_betting['game_date'] >= ERA_START].reset_index(drop=True)

team_df = pd.read_csv(os.path.join(DATA_DIR, 'team.csv'))

In [26]:
team_map = dict(zip(team_df['id'], team_df['nickname']))

test_indices = np.where(test_mask)[0]
test_game_indices = []

# Rebuild to track which games_full indices match test_mask
idx = 0
for i, row in games_full.iterrows():
    gid = row[GAME_ID_COL]
    home_id = row["home_team_id"]
    away_id = row["away_team_id"]
    date = row[GAME_DATE_COL]

    key_home = (gid, home_id)
    key_away = (gid, away_id)

    if key_home not in seq_index_by_game_team or key_away not in seq_index_by_game_team:
        continue
    
    # This game is in our dataset
    if pd.to_datetime(date) >= TEST_SPLIT_DATE:
        test_game_indices.append(i)
    
    idx += 1

# Build test_predictions_df with correct games_full rows
test_predictions_df = pd.DataFrame({
    'game_date': games_full.iloc[test_game_indices][GAME_DATE_COL].values,
    'home_team': [team_map.get(games_full.iloc[i]['home_team_id'], 'UNK') 
                  for i in test_game_indices],
    'away_team': [team_map.get(games_full.iloc[i]['away_team_id'], 'UNK') 
                  for i in test_game_indices],
    'y_home': Y_true_test[:, 0],
    'y_away': Y_true_test[:, 1],
    'pred_home_a': Y_pred_a[:, 0],
    'pred_away_a': Y_pred_a[:, 1],
    'pred_home_b': Y_pred_b[:, 0],
    'pred_away_b': Y_pred_b[:, 1],
}).reset_index(drop=True)

In [27]:
abbreviation_mapping = {
    'atl': 'Hawks',
    'bos': 'Celtics',
    'bkn': 'Nets',
    'cha': 'Hornets',
    'chi': 'Bulls',
    'cle': 'Cavaliers',
    'dal': 'Mavericks',
    'den': 'Nuggets',
    'det': 'Pistons',
    'gs': 'Warriors',
    'hou': 'Rockets',
    'ind': 'Pacers',
    'lac': 'Clippers',
    'lal': 'Lakers',
    'mem': 'Grizzlies',
    'mia': 'Heat',
    'mil': 'Bucks',
    'min': 'Timberwolves',
    'no': 'Pelicans',
    'ny': 'Knicks',
    'okc': 'Thunder',
    'orl': 'Magic',
    'phi': '76ers',
    'phx': 'Suns',
    'por': 'Trail Blazers',
    'sac': 'Kings',
    'sa': 'Spurs',
    'tor': 'Raptors',
    'utah': 'Jazz',
    'wsh': 'Wizards'
}

df_betting['away_team'] = df_betting['away'].map(abbreviation_mapping)
df_betting['home_team'] = df_betting['home'].map(abbreviation_mapping)
# Rename date to game_date to match test_predictions_df
df_betting['game_date'] = pd.to_datetime(df_betting['date'])
# Convert the spread to negative if the away team is favored
df_betting['spread'] = df_betting.apply(
    lambda row: -row['spread'] if 'away' == row['whos_favored'] else row['spread'],
    axis=1
)

# Print modified columns for verification
print(df_betting[['game_date', 'whos_favored', 'spread', 'home_team', 'away_team']])

       game_date whos_favored  spread home_team      away_team
0     2007-10-30         home    13.0     Spurs  Trail Blazers
1     2007-10-30         home     1.0  Warriors           Jazz
2     2007-10-30         away    -5.0    Lakers        Rockets
3     2007-10-31         home     6.5   Raptors          76ers
4     2007-10-31         away    -1.5    Pacers        Wizards
...          ...          ...     ...       ...            ...
23113 2025-06-11         away    -4.5    Pacers        Thunder
23114 2025-06-13         away    -6.5    Pacers        Thunder
23115 2025-06-16         home     8.5   Thunder         Pacers
23116 2025-06-19         away    -5.5    Pacers        Thunder
23117 2025-06-22         home     6.5   Thunder         Pacers

[23118 rows x 5 columns]


In [28]:
# Merge betting data with test predictions on date, home_team, away_team
merged_df = pd.merge(
    test_predictions_df,
    df_betting,
    on=['game_date', 'home_team', 'away_team'],
    how='inner'
)
print("Merged betting data shape:", merged_df.shape)

Merged betting data shape: (1314, 36)


In [134]:
def predict_betting_results(df):
    """
    The spread will always be from the home team's perspective, if it is negative the away team is favored.
    The total is the combined score of both teams.

    Returns:
        spread_record_a: wins-losses-pushes for model A spread bets
        total_record_a: wins-losses-pushes for model A total bets
        spread_record_b: wins-losses-pushes for model B spread bets
        total_record_b: wins-losses-pushes for model B total bets
    """
    for game in df:
        # Model A predictions
        pred_home_a = game['pred_home_a']
        pred_away_a = game['pred_away_a']
        pred_margin_a = pred_home_a - pred_away_a
        pred_total_a = pred_home_a + pred_away_a

        # Model B predictions
        pred_home_b = game['pred_home_b']
        pred_away_b = game['pred_away_b']
        pred_margin_b = pred_home_b - pred_away_b
        pred_total_b = pred_home_b + pred_away_b

        # Actual results
        actual_home = game['y_home']
        actual_away = game['y_away']
        actual_margin = actual_home - actual_away
        actual_total = actual_home + actual_away

        spread = game['spread']
        total_line = game['total']

        # Spread bet results for Model A
        if (spread) > 0: # Home team Favoured
            if (actual_margin - spread) > 0: # If home team covers
                if (pred_margin_a - spread) > 0: # Predicted home team covers
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            elif (actual_margin - spread) < 0: # If home team fails to cover
                if (pred_margin_a - spread) < 0: # Predicted home team fails to cover
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            else: # Push
                game['spread_result_a'] = 'P'
        elif (spread) < 0: # Away team Favoured
            if (actual_margin - spread) < 0: # If away team covers
                if (pred_margin_a - spread) < 0: # Predicted away team covers
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            elif (actual_margin - spread) > 0: # If away team fails to cover
                if (pred_margin_a - spread) > 0: # Predicted away team fails to cover
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            else: # Push
                game['spread_result_a'] = 'P'
        else: # Spread is 0, no favorite
            if (actual_margin and pred_margin_a) > 0: # Home team wins, predicted home team wins
                game['spread_result_a'] = 'W'
            elif (actual_margin and pred_margin_a) < 0: # Away team wins, predicted away team wins
                game['spread_result_a'] = 'W'
            else: # One team wins, predicted the other team wins
                game['spread_result_a'] = 'L'


         # Spread bet results for Model B
        if (spread) > 0: # Home team Favoured
            if (actual_margin - spread) > 0: # If home team covers
                if (pred_margin_b - spread) > 0: # Predicted home team covers
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            elif (actual_margin - spread) < 0: # If home team fails to cover
                if (pred_margin_b - spread) < 0: # Predicted home team fails to cover
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            else: # Push
                game['spread_result_b'] = 'P'
        elif (spread) < 0: # Away team Favoured
            if (actual_margin - spread) < 0: # If away team covers
                if (pred_margin_b - spread) < 0: # Predicted away team covers
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            elif (actual_margin - spread) > 0: # If away team fails to cover
                if (pred_margin_b - spread) > 0: # Predicted away team fails to cover
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            else: # Push
                game['spread_result_b'] = 'P'
        else: # Spread is 0, no favorite
            if (actual_margin and pred_margin_b) > 0: # Home team wins, predicted home team wins
                game['spread_result_b'] = 'W'
            elif (actual_margin and pred_margin_b) < 0: # Away team wins, predicted away team wins
                game['spread_result_b'] = 'W'
            else: # One team wins, predicted the other team wins
                game['spread_result_b'] = 'L'

        # Total bet results for Model A
        if (actual_total > total_line): # Over pays
            if (pred_total_a > total_line):
                game['total_result_a'] = 'W'
            else:
                game['total_result_a'] = 'L'
        elif (actual_total < total_line): # Under Pays
            if (pred_total_a < total_line):
                game['total_result_a'] = 'W'
            else:
                game['total_result_a'] = 'L'
        else: # Push = Exactly on the total line, bet refunded
            game['total_result_a'] = 'P'

        # Total bet results for Model B
        if (actual_total > total_line): # Over pays
            if (pred_total_b > total_line):
                game['total_result_b'] = 'W'
            else:
                game['total_result_b'] = 'L'
        elif (actual_total < total_line): # Under Pays
            if (pred_total_b < total_line):
                game['total_result_b'] = 'W'
            else:
                game['total_result_b'] = 'L'
        else: # Push = Exactly on the total line, bet refunded
            game['total_result_b'] = 'P'
            
    # Calculate records
    spread_record_a = {'W': 0, 'L': 0, 'P': 0}
    total_record_a = {'W': 0, 'L': 0, 'P': 0}
    spread_record_b = {'W': 0, 'L': 0, 'P': 0}
    total_record_b = {'W': 0, 'L': 0, 'P': 0}
    for game in df:
        spread_record_a[game['spread_result_a']] += 1
        total_record_a[game['total_result_a']] += 1
        spread_record_b[game['spread_result_b']] += 1
        total_record_b[game['total_result_b']] += 1
    return spread_record_a, total_record_a, spread_record_b, total_record_b

spread_record_a, total_record_a, spread_record_b, total_record_b = predict_betting_results(merged_df.to_dict('records'))
print("Model A Spread Record (W-L-P):", spread_record_a)
print("Model A Total Record (W-L-P):", total_record_a)
print("Model B Spread Record (W-L-P):", spread_record_b)
print("Model B Total Record (W-L-P):", total_record_b)
    

Model A Spread Record (W-L-P): {'W': 668, 'L': 633, 'P': 13}
Model A Total Record (W-L-P): {'W': 654, 'L': 651, 'P': 9}
Model B Spread Record (W-L-P): {'W': 668, 'L': 633, 'P': 13}
Model B Total Record (W-L-P): {'W': 674, 'L': 631, 'P': 9}


In [136]:
# create a summary betting DF
betting_summary = pd.DataFrame({
    'Model': ['A', 'A', 'B', 'B'],
    'Bet Type': ['Spread', 'Total', 'Spread', 'Total'],
    'Wins': [spread_record_a['W'], total_record_a['W'], spread_record_b['W'], total_record_b['W']],
    'Losses': [spread_record_a['L'], total_record_a['L'], spread_record_b['L'], total_record_b['L']],
    'Pushes': [spread_record_a['P'], total_record_a['P'], spread_record_b['P'], total_record_b['P']],
    'Win Ratio': [
        spread_record_a['W'] / (spread_record_a['W'] + spread_record_a['L']),
        total_record_a['W'] / (total_record_a['W'] + total_record_a['L']),
        spread_record_b['W'] / (spread_record_b['W'] + spread_record_b['L']),
        total_record_b['W'] / (total_record_b['W'] + total_record_b ['L']),
    ]
})
print(betting_summary)

  Model Bet Type  Wins  Losses  Pushes  Win Ratio
0     A   Spread   668     633      13   0.513451
1     A    Total   654     651       9   0.501149
2     B   Spread   668     633      13   0.513451
3     B    Total   674     631       9   0.516475
