In [1]:
# 1. Imports & config

import os
import math
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [2]:

DATA_DIR = "./data/csv"

# Hyperparameters
SEQ_LEN = 15              # number of past games per team
BATCH_SIZE = 64
HIDDEN_SIZE = 64
NUM_LAYERS = 2
LR = 1e-4
EPOCHS = 20
ERA_START = pd.to_datetime("2010-10-01")
VAL_SPLIT_DATE = "2021-10-01"
TEST_SPLIT_DATE = "2022-10-01"
RANDOM_SEED = 42

torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


In [3]:
files = [
    "common_player_info.csv",
    "game_info.csv",
    "officials.csv",
    "team.csv",
    "draft_combine_stats.csv",
    "game_summary.csv",
    "other_stats.csv",
    "team_details.csv",
    "draft_history.csv",
    "inactive_players.csv",
    "play_by_play.csv",
    "team_history.csv",
    "game.csv",
    "line_score.csv",
    "player.csv",
    "team_info_common.csv",
]

for fname in files:
    path = os.path.join(DATA_DIR, fname)
    df = pd.read_csv(path)
    print(f"\n=== {fname} ===")
    print("shape:", df.shape)
    print("columns:", list(df.columns)[:15], "...")


=== common_player_info.csv ===
shape: (4171, 33)
columns: ['person_id', 'first_name', 'last_name', 'display_first_last', 'display_last_comma_first', 'display_fi_last', 'player_slug', 'birthdate', 'school', 'country', 'last_affiliation', 'height', 'weight', 'season_exp', 'jersey'] ...

=== game_info.csv ===
shape: (58053, 4)
columns: ['game_id', 'game_date', 'attendance', 'game_time'] ...

=== officials.csv ===
shape: (70971, 5)
columns: ['game_id', 'official_id', 'first_name', 'last_name', 'jersey_num'] ...

=== team.csv ===
shape: (30, 7)
columns: ['id', 'full_name', 'abbreviation', 'nickname', 'city', 'state', 'year_founded'] ...

=== draft_combine_stats.csv ===
shape: (1202, 47)
columns: ['season', 'player_id', 'first_name', 'last_name', 'player_name', 'position', 'height_wo_shoes', 'height_wo_shoes_ft_in', 'height_w_shoes', 'height_w_shoes_ft_in', 'weight', 'wingspan', 'wingspan_ft_in', 'standing_reach', 'standing_reach_ft_in'] ...

=== game_summary.csv ===
shape: (58110, 14)
colu

In [4]:
# Core tables for modeling
games = pd.read_csv(os.path.join(DATA_DIR, "game.csv"))
game_info = pd.read_csv(os.path.join(DATA_DIR, "game_info.csv"))
other_stats = pd.read_csv(os.path.join(DATA_DIR, "other_stats.csv"))

print("games:", games.shape)
print("game_info:", game_info.shape)
print("other_stats:", other_stats.shape)

games: (65698, 55)
game_info: (58053, 4)
other_stats: (28271, 26)


In [5]:
# --- Player-level tables ---
player_df = pd.read_csv(os.path.join(DATA_DIR, "player.csv"))
common_player_info = pd.read_csv(os.path.join(DATA_DIR, "common_player_info.csv"))
play_by_play = pd.read_csv(os.path.join(DATA_DIR, "play_by_play.csv"))

print("player_df:", player_df.shape)
print("common_player_info:", common_player_info.shape)
print("play_by_play:", play_by_play.shape)

# Player ID mapping (reserve 0 for padding / unknown)
player_ids = sorted(player_df["id"].unique())
player_id_to_idx = {pid: i + 1 for i, pid in enumerate(player_ids)}
num_players = len(player_ids) + 1  # +1 for padding index 0

print("num_players (including padding):", num_players)


player_df: (4831, 5)
common_player_info: (4171, 33)
play_by_play: (13592899, 34)
num_players (including padding): 4832


In [6]:
# Column config for our pipeline
GAME_ID_COL = "game_id"
GAME_DATE_COL = "game_date"
HOME_TEAM_COL = "team_id_home"
AWAY_TEAM_COL = "team_id_away"
PTS_HOME_COL = "pts_home"
PTS_AWAY_COL = "pts_away"

# Make sure game_date is datetime
games[GAME_DATE_COL] = pd.to_datetime(games[GAME_DATE_COL])
game_info["game_date"] = pd.to_datetime(game_info["game_date"])

# Keep only modern-era games
mask_games = games[GAME_DATE_COL] >= ERA_START
games = games.loc[mask_games].reset_index(drop=True)

# Match game_info to the same window
mask_info = game_info["game_date"] >= ERA_START
game_info = game_info.loc[mask_info].reset_index(drop=True)

In [7]:
# --- Restrict PBP to games in our modeling window ---
valid_game_ids = set(games[GAME_ID_COL].unique())
play_by_play = play_by_play[play_by_play["game_id"].isin(valid_game_ids)].copy()

# Attach home/away team IDs to each pbp row
games_for_merge = games[[GAME_ID_COL, HOME_TEAM_COL, AWAY_TEAM_COL]].drop_duplicates()
play_by_play = play_by_play.merge(games_for_merge, on=GAME_ID_COL, how="left")


In [8]:
import numpy as np

# Infer team_id for each event where we have a player1_id
def infer_team(row):
    # home side event if homedescription is non-null
    if pd.notna(row.get("homedescription")) and row["homedescription"] != "":
        return row[HOME_TEAM_COL]
    # visitor side event if visitordescription is non-null
    if pd.notna(row.get("visitordescription")) and row["visitordescription"] != "":
        return row[AWAY_TEAM_COL]
    return np.nan

play_by_play["team_id_event"] = play_by_play.apply(infer_team, axis=1)

# Keep only rows where we can assign team + player
pbp_players = play_by_play.dropna(subset=["team_id_event", "player1_id"]).copy()
pbp_players["team_id_event"] = pbp_players["team_id_event"].astype(int)

# Count events per (game, team, player) as a crude "usage" proxy
pbp_players["event_count"] = 1
player_games = (
    pbp_players
    .groupby(["game_id", "team_id_event", "player1_id"], as_index=False)
    .agg(event_count=("event_count", "sum"))
    .rename(columns={"team_id_event": "team_id", "player1_id": "player_id"})
)

print("player_games (per game/team/player):", player_games.shape)
print(player_games.head())


player_games (per game/team/player): (428039, 4)
    game_id  team_id  player_id  event_count
0  11300001    12321        965            1
1  11300001    12321       2555            1
2  11300001    12321      12321           16
3  11300001    12321      42531           17
4  11300001    12321      42534            4


In [9]:
P = 10  # max players per side we keep (you can tweak this)

roster_by_game_team = {}

for (gid, tid), group in player_games.groupby(["game_id", "team_id"]):
    # sort players by event_count desc (proxy for minutes/importance)
    group = group.sort_values("event_count", ascending=False)
    player_ids_this = group["player_id"].astype(int).tolist()
    
    # truncate / pad to length P
    player_ids_this = player_ids_this[:P]
    while len(player_ids_this) < P:
        player_ids_this.append(0)  # 0 = padding / unknown player
    
    roster_by_game_team[(gid, tid)] = player_ids_this

print("Num (game, team) rosters:", len(roster_by_game_team))


Num (game, team) rosters: 29030


In [10]:
# 1) Select home/away feature columns, EXCLUDING the team_id columns
home_feature_cols = [
    c for c in games.columns
    if c.endswith("_home") and c != HOME_TEAM_COL
]

away_feature_cols = [
    c for c in games.columns
    if c.endswith("_away") and c != AWAY_TEAM_COL
]

print("Num home_feature_cols:", len(home_feature_cols))
print("Num away_feature_cols:", len(away_feature_cols))

# 2) Home rows
home_df = games[[GAME_ID_COL, GAME_DATE_COL, HOME_TEAM_COL] + home_feature_cols].copy()
home_df = home_df.rename(columns={HOME_TEAM_COL: "team_id"})
home_df["is_home"] = 1

for col in home_feature_cols:
    base = col.replace("_home", "")
    home_df[base] = home_df[col]

home_df["y_points"] = home_df[PTS_HOME_COL]

# 3) Away rows
away_df = games[[GAME_ID_COL, GAME_DATE_COL, AWAY_TEAM_COL] + away_feature_cols].copy()
away_df = away_df.rename(columns={AWAY_TEAM_COL: "team_id"})
away_df["is_home"] = 0

for col in away_feature_cols:
    base = col.replace("_away", "")
    away_df[base] = away_df[col]

away_df["y_points"] = away_df[PTS_AWAY_COL]

# 4) Keep only unified columns
keep_cols = [GAME_ID_COL, GAME_DATE_COL, "team_id", "is_home", "y_points"]
base_feature_names = sorted(
    {c.replace("_home", "").replace("_away", "") for c in home_feature_cols + away_feature_cols}
)
keep_cols += base_feature_names

home_df = home_df[keep_cols].copy()
away_df = away_df[keep_cols].copy()

# 5) Combine into team_games
team_games = pd.concat([home_df, away_df], axis=0).reset_index(drop=True)
team_games = team_games.sort_values(["team_id", GAME_DATE_COL]).reset_index(drop=True)

print("team_games initial:", team_games.shape)
team_games.head()

Num home_feature_cols: 24
Num away_feature_cols: 24
team_games initial: (32560, 29)


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,pf,plus_minus,pts,reb,stl,team_abbreviation,team_name,tov,video_available,wl
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,20.0,-27,80.0,36.0,7.0,MTA,Tel Aviv Maccabi Electra,9.0,0,L
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,27.0,-17,94.0,43.0,4.0,MTA,Tel Aviv Maccabi Electra,21.0,0,L
2,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,30.0,-38,70.0,37.0,10.0,MAC,Haifa Maccabi Haifa,21.0,0,L
3,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,22.0,-8,100.0,38.0,9.0,MAC,Haifa Maccabi Haifa,14.0,0,L
4,11200056,2012-10-16,93,0,81.0,17.0,4.0,33.0,0.37,27.0,...,30.0,-33,81.0,45.0,9.0,MAC,Haifa Maccabi Haifa,26.0,0,L


In [11]:
# --- Tier 2: merge game_info (attendance + game_hour) ---

# Parse game_time to hour-of-day
game_info["game_hour"] = pd.to_datetime(
    game_info["game_time"],
    format="%I:%M %p",
    errors="coerce"
).dt.hour

# Keep only what we need
game_info_small = game_info[[GAME_ID_COL, "attendance", "game_hour"]].copy()

print("game_info_small sample:")
print(game_info_small.head())

# Merge into team_games
team_games = team_games.merge(
    game_info_small,
    on=GAME_ID_COL,
    how="left"
)

print("team_games after game_info:", team_games.shape)
team_games.head()


game_info_small sample:
    game_id  attendance  game_hour
0  21000003     18997.0        NaN
1  21000002     20603.0        NaN
2  21000001     18624.0        NaN
3  21000015     18428.0        NaN
4  21000010     15039.0        NaN
team_games after game_info: (32584, 31)


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,pts,reb,stl,team_abbreviation,team_name,tov,video_available,wl,attendance,game_hour
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,80.0,36.0,7.0,MTA,Tel Aviv Maccabi Electra,9.0,0,L,20562.0,
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,94.0,43.0,4.0,MTA,Tel Aviv Maccabi Electra,21.0,0,L,15915.0,
2,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,70.0,37.0,10.0,MAC,Haifa Maccabi Haifa,21.0,0,L,5174.0,
3,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,100.0,38.0,9.0,MAC,Haifa Maccabi Haifa,14.0,0,L,,
4,11200056,2012-10-16,93,0,81.0,17.0,4.0,33.0,0.37,27.0,...,81.0,45.0,9.0,MAC,Haifa Maccabi Haifa,26.0,0,L,11192.0,


In [12]:
# --- Tier 3: merge other_stats (advanced team stats) ---

print("other_stats sample:")
print(other_stats.head())

# Game-level columns that apply to the whole game
game_level_cols = []
for col in ["lead_changes", "times_tied"]:
    if col in other_stats.columns:
        game_level_cols.append(col)

# Home/away advanced stat columns (excluding id/label columns)
home_stat_cols = [
    c for c in other_stats.columns
    if c.endswith("_home")
    and not c.startswith(("team_id_", "team_abbreviation_", "team_city_"))
]

away_stat_cols = [
    c for c in other_stats.columns
    if c.endswith("_away")
    and not c.startswith(("team_id_", "team_abbreviation_", "team_city_"))
]

print("home_stat_cols:", home_stat_cols)
print("away_stat_cols:", away_stat_cols)
print("game_level_cols:", game_level_cols)

# 4.1 Home advanced stats â†’ unified format
home_adv = other_stats[["game_id", "team_id_home"] + game_level_cols + home_stat_cols].copy()
home_adv = home_adv.rename(columns={"team_id_home": "team_id"})

for col in home_stat_cols:
    base = col.replace("_home", "")
    home_adv[base] = home_adv[col]

home_keep_cols = ["game_id", "team_id"] + game_level_cols + [c.replace("_home", "") for c in home_stat_cols]
home_adv = home_adv[home_keep_cols]

# 4.2 Away advanced stats â†’ unified format
away_adv = other_stats[["game_id", "team_id_away"] + game_level_cols + away_stat_cols].copy()
away_adv = away_adv.rename(columns={"team_id_away": "team_id"})

for col in away_stat_cols:
    base = col.replace("_away", "")
    away_adv[base] = away_adv[col]

away_keep_cols = ["game_id", "team_id"] + game_level_cols + [c.replace("_away", "") for c in away_stat_cols]
away_adv = away_adv[away_keep_cols]

print("home_adv shape:", home_adv.shape)
print("away_adv shape:", away_adv.shape)

# 4.3 Combine advanced stats
adv_long = pd.concat([home_adv, away_adv], axis=0).reset_index(drop=True)
print("adv_long shape:", adv_long.shape)
print(adv_long.head())

# 4.4 Merge advanced stats into team_games
team_games = team_games.merge(
    adv_long,
    on=["game_id", "team_id"],
    how="left"
)

print("team_games after other_stats:", team_games.shape)
team_games.head()


other_stats sample:
    game_id  league_id  team_id_home team_abbreviation_home team_city_home  \
0  29600012          0    1610612756                    PHX        Phoenix   
1  29600005          0    1610612737                    ATL        Atlanta   
2  29600002          0    1610612739                    CLE      Cleveland   
3  29600007          0    1610612754                    IND        Indiana   
4  29600013          0    1610612746                    LAC    Los Angeles   

   pts_paint_home  pts_2nd_chance_home  pts_fb_home  largest_lead_home  \
0              44                   18            2                  1   
1              32                    9            6                  0   
2              36                   14            6                 20   
3              34                   11            4                 10   
4              40                   19            2                 12   

   lead_changes  ...  team_abbreviation_away  team_city_away  pts_

Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,lead_changes,times_tied,pts_paint,pts_2nd_chance,pts_fb,largest_lead,team_turnovers,total_turnovers,team_rebounds,pts_off_to
0,11400003,2014-10-05,41,0,80.0,19.0,7.0,28.0,0.258,31.0,...,1.0,3.0,32.0,12.0,3.0,3.0,0.0,9.0,5.0,8.0
1,11400011,2014-10-07,41,0,94.0,15.0,5.0,32.0,0.231,26.0,...,0.0,0.0,54.0,15.0,12.0,0.0,1.0,21.0,7.0,28.0
2,11000002,2010-10-03,93,0,70.0,14.0,2.0,25.0,0.316,19.0,...,3.0,1.0,28.0,8.0,17.0,3.0,0.0,21.0,10.0,29.0
3,11200029,2012-10-11,93,0,100.0,23.0,5.0,26.0,0.381,21.0,...,,,,,,,,,,
4,11200056,2012-10-16,93,0,81.0,17.0,4.0,33.0,0.37,27.0,...,2.0,2.0,30.0,19.0,4.0,3.0,0.0,26.0,7.0,35.0


In [13]:
# --- Compute SEQ_FEATURES and scale ---

# --- Schedule features: rest / B2B / 3-in-4 / 4-in-6 ---

# Ensure sorted by team + date
team_games = team_games.sort_values(["team_id", GAME_DATE_COL]).reset_index(drop=True)

grouped = team_games.groupby("team_id")

# Previous game dates
team_games["prev_date"]  = grouped[GAME_DATE_COL].shift(1)
team_games["prev3_date"] = grouped[GAME_DATE_COL].shift(3)
team_games["prev4_date"] = grouped[GAME_DATE_COL].shift(4)

# Days of rest since last game
team_games["days_rest"] = (team_games[GAME_DATE_COL] - team_games["prev_date"]).dt.days

# Schedule intensity flags
team_games["is_b2b"]  = (team_games["days_rest"] == 1).astype(int)

team_games["is_3in4"] = (
    (team_games[GAME_DATE_COL] - team_games["prev3_date"]).dt.days <= 4
).astype(int)

team_games["is_4in6"] = (
    (team_games[GAME_DATE_COL] - team_games["prev4_date"]).dt.days <= 6
).astype(int)



# --- Team ID â†” index mapping for embeddings ---

team_ids = sorted(team_games["team_id"].unique())
team_id_to_idx = {tid: i for i, tid in enumerate(team_ids)}
num_teams = len(team_ids)

# Optional: store per-row team index (not used in SEQ_FEATURES)
team_games["team_idx"] = team_games["team_id"].map(team_id_to_idx)



exclude_cols = {
    GAME_ID_COL,
    GAME_DATE_COL,
    "team_id",
    "y_points",
    "prev_date",
    "prev3_date",
    "prev4_date",
    "team_idx",
    
    
    
    # non signals (i think)
    "video_available",
    "attedance"
}




numeric_cols = [
    c for c in team_games.columns
    if c not in exclude_cols and pd.api.types.is_numeric_dtype(team_games[c])
]

SEQ_FEATURES = numeric_cols
print("Number of sequence features:", len(SEQ_FEATURES))
print("First 30 SEQ_FEATURES:", SEQ_FEATURES[:30])

train_rows = team_games[team_games[GAME_DATE_COL] < VAL_SPLIT_DATE].copy()

scaler = StandardScaler()
scaler.fit(train_rows[SEQ_FEATURES].fillna(0.0))

team_games[SEQ_FEATURES] = scaler.transform(
    team_games[SEQ_FEATURES].fillna(0.0)
)

team_games.head()


Number of sequence features: 36
First 30 SEQ_FEATURES: ['is_home', 'ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga', 'fgm', 'ft_pct', 'fta', 'ftm', 'oreb', 'pf', 'plus_minus', 'pts', 'reb', 'stl', 'tov', 'attendance', 'game_hour', 'lead_changes', 'times_tied', 'pts_paint', 'pts_2nd_chance', 'pts_fb', 'largest_lead', 'team_turnovers', 'total_turnovers']


Unnamed: 0,game_id,game_date,team_id,is_home,y_points,ast,blk,dreb,fg3_pct,fg3a,...,team_rebounds,pts_off_to,prev_date,prev3_date,prev4_date,days_rest,is_b2b,is_3in4,is_4in6,team_idx
0,11400003,2014-10-05,41,-1.0,80.0,-0.693698,0.849701,-0.903462,-0.953262,0.554806,...,-0.501082,-0.738944,NaT,NaT,NaT,-0.173476,-0.485414,-0.14156,-0.217296,0
1,11400011,2014-10-07,41,-1.0,94.0,-1.447486,0.057144,-0.180799,-1.222245,-0.002937,...,-0.015788,1.702937,2014-10-05,NaT,NaT,-0.104295,-0.485414,-0.14156,-0.217296,0
2,11000002,2010-10-03,93,-1.0,70.0,-1.635933,-1.131691,-1.445459,-0.375448,-0.783777,...,0.712153,1.825031,NaT,NaT,NaT,-0.173476,-0.485414,-0.14156,-0.217296,1
3,11200029,2012-10-11,93,-1.0,100.0,0.06009,0.057144,-1.264794,0.272103,-0.56068,...,-1.714317,-1.715697,2010-10-03,NaT,NaT,25.388655,-0.485414,-0.14156,-0.217296,1
4,11200056,2012-10-16,93,-1.0,81.0,-1.070592,-0.339134,-0.000133,0.162517,0.108612,...,-0.015788,2.557596,2012-10-11,NaT,NaT,-0.000525,-0.485414,-0.14156,-0.217296,1


In [14]:
team_sequences = []
team_targets = []
team_meta = []  # (game_id, team_id, game_date)

for team_id, group in team_games.groupby("team_id"):
    group = group.sort_values(GAME_DATE_COL).reset_index(drop=True)

    feats = group[SEQ_FEATURES].values           # [num_games, F]
    targets = group["y_points"].values
    game_ids = group[GAME_ID_COL].values
    dates = group[GAME_DATE_COL].values

    # require SEQ_LEN previous games
    for i in range(SEQ_LEN, len(group)):
        seq = feats[i-SEQ_LEN:i]
        y = targets[i]
        gid = game_ids[i]
        date = dates[i]

        team_sequences.append(seq)
        team_targets.append(y)
        team_meta.append((gid, team_id, date))

team_sequences = np.stack(team_sequences)          # [N_team_games, T, F]
team_targets = np.array(team_targets, dtype=np.float32)

print("team_sequences:", team_sequences.shape)
print("team_targets:", team_targets.shape)


team_sequences: (32090, 15, 36)
team_targets: (32090,)


In [15]:
seq_index_by_game_team = {
    (gid, tid): idx
    for idx, (gid, tid, date) in enumerate(team_meta)
}

len(seq_index_by_game_team)


32026

In [16]:
games_full = games[[GAME_ID_COL, GAME_DATE_COL, HOME_TEAM_COL, AWAY_TEAM_COL, PTS_HOME_COL, PTS_AWAY_COL]].copy()

games_full = games_full.rename(columns={
    HOME_TEAM_COL: "home_team_id",
    AWAY_TEAM_COL: "away_team_id",
    PTS_HOME_COL: "y_home",
    PTS_AWAY_COL: "y_away"
})

print("games_full:", games_full.shape)
games_full.head()

games_full: (16280, 6)


Unnamed: 0,game_id,game_date,home_team_id,away_team_id,y_home,y_away
0,21000003,2010-10-26,1610612747,1610612745,112.0,110.0
1,21000002,2010-10-26,1610612757,1610612756,106.0,92.0
2,21000001,2010-10-26,1610612738,1610612748,88.0,80.0
3,21000015,2010-10-27,1610612744,1610612745,132.0,128.0
4,21000010,2010-10-27,1610612740,1610612749,95.0,91.0


In [17]:
X_home = []
X_away = []
Y = []
GAME_DATES = []

HOME_TEAM_IDX = []
AWAY_TEAM_IDX = []

HOME_PLAYER_IDX_LIST = []
AWAY_PLAYER_IDX_LIST = []

for _, row in games_full.iterrows():
    gid = row[GAME_ID_COL]
    home_id = row["home_team_id"]
    away_id = row["away_team_id"]
    date = row[GAME_DATE_COL]

    key_home = (gid, home_id)
    key_away = (gid, away_id)

    if key_home not in seq_index_by_game_team or key_away not in seq_index_by_game_team:
        continue  # skip early games (not enough history)

    idx_h = seq_index_by_game_team[key_home]
    idx_a = seq_index_by_game_team[key_away]

    X_home.append(team_sequences[idx_h])
    X_away.append(team_sequences[idx_a])

    # Targets: margin & total
    home = row["y_home"]
    away = row["y_away"]
    margin = home - away
    total  = home + away
    Y.append([margin, total])

    GAME_DATES.append(date)
    HOME_TEAM_IDX.append(team_id_to_idx[home_id])
    AWAY_TEAM_IDX.append(team_id_to_idx[away_id])

    # --- NEW: home/away player rosters as embedding indices ---
    home_roster_raw = roster_by_game_team.get(key_home, [0] * P)
    away_roster_raw = roster_by_game_team.get(key_away, [0] * P)

    # map raw player IDs -> embedding indices (0 reserved for padding)
    home_player_idx = [player_id_to_idx.get(pid, 0) for pid in home_roster_raw]
    away_player_idx = [player_id_to_idx.get(pid, 0) for pid in away_roster_raw]

    HOME_PLAYER_IDX_LIST.append(home_player_idx)
    AWAY_PLAYER_IDX_LIST.append(away_player_idx)

X_home = np.stack(X_home)
X_away = np.stack(X_away)
Y = np.array(Y, dtype=np.float32)
GAME_DATES = np.array(GAME_DATES)

HOME_TEAM_IDX = np.array(HOME_TEAM_IDX, dtype=np.int64)
AWAY_TEAM_IDX = np.array(AWAY_TEAM_IDX, dtype=np.int64)

HOME_PLAYER_IDX = np.array(HOME_PLAYER_IDX_LIST, dtype=np.int64)  # [N_games, P]
AWAY_PLAYER_IDX = np.array(AWAY_PLAYER_IDX_LIST, dtype=np.int64)

print("Final dataset shapes:")
print("X_home:", X_home.shape)
print("X_away:", X_away.shape)
print("Y:", Y.shape)
print("HOME_PLAYER_IDX:", HOME_PLAYER_IDX.shape)
print("AWAY_PLAYER_IDX:", AWAY_PLAYER_IDX.shape)


Final dataset shapes:
X_home: (15985, 15, 36)
X_away: (15985, 15, 36)
Y: (15985, 2)
HOME_PLAYER_IDX: (15985, 10)
AWAY_PLAYER_IDX: (15985, 10)


In [18]:
VAL_SPLIT_DATE = pd.to_datetime(VAL_SPLIT_DATE)
TEST_SPLIT_DATE = pd.to_datetime(TEST_SPLIT_DATE)

dates = pd.to_datetime(GAME_DATES)

train_mask = dates < VAL_SPLIT_DATE
val_mask = (dates >= VAL_SPLIT_DATE) & (dates < TEST_SPLIT_DATE)
test_mask = dates >= TEST_SPLIT_DATE

def split(arr):
    return arr[train_mask], arr[val_mask], arr[test_mask]

X_home_train, X_home_val, X_home_test = split(X_home)
X_away_train, X_away_val, X_away_test = split(X_away)
Y_train, Y_val, Y_test = split(Y)

home_idx_train, home_idx_val, home_idx_test = split(HOME_TEAM_IDX)
away_idx_train, away_idx_val, away_idx_test = split(AWAY_TEAM_IDX)

home_player_train, home_player_val, home_player_test = split(HOME_PLAYER_IDX)
away_player_train, away_player_val, away_player_test = split(AWAY_PLAYER_IDX)


print("Train:", len(Y_train), "Val:", len(Y_val), "Test:", len(Y_test))


# --- NEW: keep raw targets and create a scaler for [margin, total] ---
from sklearn.preprocessing import StandardScaler

train_mean_scores = Y_train.mean(axis=0)   # still raw here!
print("Train mean scores (margin, total):", train_mean_scores)

Y_train_raw = Y_train.copy()
Y_val_raw   = Y_val.copy()
Y_test_raw  = Y_test.copy()

y_scaler = StandardScaler()
y_scaler.fit(Y_train_raw)        # fit on train only

Y_train = y_scaler.transform(Y_train_raw)
Y_val   = y_scaler.transform(Y_val_raw)
Y_test  = y_scaler.transform(Y_test_raw)


Train: 13220 Val: 1385 Test: 1380
Train mean scores (margin, total): [  2.5335855 208.52057  ]


In [19]:
class GameSequenceDataset(Dataset):
    def __init__(self, x_home, x_away, y, home_idx, away_idx, home_players, away_players):
        self.x_home = torch.tensor(x_home, dtype=torch.float32)
        self.x_away = torch.tensor(x_away, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

        self.home_idx = torch.tensor(home_idx, dtype=torch.long)
        self.away_idx = torch.tensor(away_idx, dtype=torch.long)

        self.home_players = torch.tensor(home_players, dtype=torch.long)  # [N, P]
        self.away_players = torch.tensor(away_players, dtype=torch.long)

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        return (
            self.x_home[idx],
            self.x_away[idx],
            self.y[idx],
            self.home_idx[idx],
            self.away_idx[idx],
            self.home_players[idx],
            self.away_players[idx],
        )

train_dataset = GameSequenceDataset(
    X_home_train, X_away_train, Y_train,
    home_idx_train, away_idx_train,
    home_player_train, away_player_train,
)
val_dataset = GameSequenceDataset(
    X_home_val, X_away_val, Y_val,
    home_idx_val, away_idx_val,
    home_player_val, away_player_val,
)
test_dataset = GameSequenceDataset(
    X_home_test, X_away_test, Y_test,
    home_idx_test, away_idx_test,
    home_player_test, away_player_test,
)


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [20]:
sample = train_dataset[0]

x_home, x_away, y, home_idx, away_idx, home_players, away_players = sample

print("x_home shape:", x_home.shape)    # expected [SEQ_LEN, num_team_seq_features]
print("x_away shape:", x_away.shape)
print("y:", y)                          # expected shape [2] (margin, total)
print("home_idx:", home_idx)            # int
print("away_idx:", away_idx)            # int
print("home_players shape:", home_players.shape)  # expected [P]
print("away_players shape:", away_players.shape)


x_home shape: torch.Size([15, 36])
x_away shape: torch.Size([15, 36])
y: tensor([ 0.1791, -0.6802])
home_idx: tensor(31)
away_idx: tensor(34)
home_players shape: torch.Size([10])
away_players shape: torch.Size([10])


In [21]:
batch = next(iter(train_loader))

(
    bx_home, bx_away, by,
    bhome_idx, baway_idx,
    bhome_players, baway_players
) = batch

print("bx_home batch shape:", bx_home.shape)        # [B, 20, 37]
print("bx_away batch shape:", bx_away.shape)
print("by batch shape:", by.shape)                  # [B, 2]
print("bhome_players batch shape:", bhome_players.shape)  # [B, P]
print("baway_players batch shape:", baway_players.shape)


bx_home batch shape: torch.Size([64, 15, 36])
bx_away batch shape: torch.Size([64, 15, 36])
by batch shape: torch.Size([64, 2])
bhome_players batch shape: torch.Size([64, 10])
baway_players batch shape: torch.Size([64, 10])


In [22]:
print("max home_idx:", home_idx_train.max())
print("max away_idx:", away_idx_train.max())
print("num_teams:", num_teams)


max home_idx: 52
max away_idx: 52
num_teams: 53


In [23]:
print("max home_players:", home_player_train.max())
print("max away_players:", away_player_train.max())
print("num_players:", num_players)


max home_players: 4653
max away_players: 4653
num_players: 4832


In [24]:
unique_vals = np.unique(home_player_train[:1000])
print("Home roster unique values (first 1000 games):", unique_vals[:20])


Home roster unique values (first 1000 games): [  0 158 237 252 267 289 292 306 311 417 420 425 428 434 442 443 444 445
 516 558]


In [25]:
print("Train latest date:", GAME_DATES[train_mask].max())
print("Val earliest date:", GAME_DATES[val_mask].min())
print("Test earliest date:", GAME_DATES[test_mask].min())


Train latest date: 2021-07-20 00:00:00
Val earliest date: 2021-10-03 00:00:00
Test earliest date: 2022-10-01 00:00:00


In [26]:
idx = np.random.randint(len(train_dataset))
sample = train_dataset[idx]

_, _, _, home_idx, away_idx, home_players, away_players = sample

print("Team index:", home_idx.item(), away_idx.item())
print("Home players:", home_players[:10])
print("Away players:", away_players[:10])


Team index: 33 44
Home players: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Away players: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [27]:
gid, home_id, away_id = games_full.iloc[idx][['game_id','home_team_id','away_team_id']]
print("Original:", gid, home_id, away_id)
print("Mapped home idx:", home_idx.item(), "â†’", team_ids[home_idx.item()])


Original: 21600227 1610612739 1610612742
Mapped home idx: 33 â†’ 1610612749


In [28]:
assert not np.isnan(X_home).any()
assert not np.isinf(X_home).any()
assert not np.isnan(X_away).any()
assert not np.isinf(X_away).any()
assert not np.isnan(Y).any()
assert not np.isinf(Y).any()


In [29]:
for i in range(5):
    seq_h = X_home_train[i]
    seq_a = X_away_train[i]
    t = Y_train[i]
    print(i, seq_h.shape, seq_a.shape, t)


0 (15, 36) (15, 36) [ 0.1791383  -0.68023914]
1 (15, 36) (15, 36) [ 1.4138665 -1.3376623]
2 (15, 36) (15, 36) [-0.76506555 -1.1623495 ]
3 (15, 36) (15, 36) [-0.54717237 -0.50492626]
4 (15, 36) (15, 36) [-0.54717237  1.3358588 ]


In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TeamSequenceEncoder(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1, dropout: float = 0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )

    def forward(self, x):
        # x: [B, T, F]
        output, (h_n, c_n) = self.lstm(x)
        # output: [B, T, 2H]
        return output


import torch
import torch.nn as nn
import torch.nn.functional as F

class ScorePredictorGNN(nn.Module):
    def __init__(
        self,
        input_size: int,          # seq feature dim (36 in your printout)
        hidden_size: int = 64,    # BiLSTM hidden
        num_layers: int = 3,
        num_teams: int = None,
        num_players: int = None,
        team_emb_dim: int = 16,
        player_emb_dim: int = 64,   # bump player dim
        gnn_hidden_dim: int = 128,  # graph node dim
        gnn_steps: int = 2,         # message passing steps
    ):
        super().__init__()
        self.gnn_steps = gnn_steps

        # --- Time encoder over team sequence (same as before) ---
        self.encoder = TeamSequenceEncoder(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=0.2,
        )
        seq_dim = hidden_size * 2  # BiLSTM

        # --- Embeddings ---
        self.team_embedding = nn.Embedding(num_teams, team_emb_dim)

        # IMPORTANT: padding_idx=0 so id 0 is ignored / kept zero
        self.player_embedding = nn.Embedding(
            num_players,
            player_emb_dim,
            padding_idx=0,
        )

        # --- Project into graph space ---
        # team node gets [seq_vec, team_id_emb] â†’ gnn_hidden_dim
        self.team_in = nn.Linear(seq_dim + team_emb_dim, gnn_hidden_dim)

        # player node gets just player_emb â†’ gnn_hidden_dim
        self.player_in = nn.Linear(player_emb_dim, gnn_hidden_dim)

        # --- Message-passing MLPs ---
        # Team update sees [team_node, mean(player_nodes)]
        self.team_update = nn.Linear(2 * gnn_hidden_dim, gnn_hidden_dim)

        # Player update sees [player_node, team_node_broadcast]
        self.player_update = nn.Linear(2 * gnn_hidden_dim, gnn_hidden_dim)

        # --- Final prediction head ---
        # Weâ€™ll use both teamsâ€™ final team_node and pooled player_node
        pair_input_dim = 4 * gnn_hidden_dim  # home_team, away_team, home_players, away_players

        self.mlp = nn.Sequential(
            nn.Linear(pair_input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 2),  # [margin, total]
        )
        
        # new
        self.dropout = nn.Dropout(0.3)

    def forward(
        self,
        x_home, x_away,           # [B, T, F]
        home_team_idx, away_team_idx,  # [B]
        home_players, away_players,    # [B, P] int64 with 0 as padding
    ):
        B = x_home.size(0)
        device = x_home.device

        # --- Encode team sequences ---
        h_home_seq = self.encoder(x_home)   # [B, T, 2H]
        h_away_seq = self.encoder(x_away)   # [B, T, 2H]

        home_seq_vec = h_home_seq.mean(dim=1)  # [B, 2H]
        away_seq_vec = h_away_seq.mean(dim=1)

        # --- Team ID embeddings ---
        home_team_emb = self.team_embedding(home_team_idx)  # [B, D_t]
        away_team_emb = self.team_embedding(away_team_idx)

        # --- Initial team nodes in graph space ---
        home_team_node = self.team_in(torch.cat([home_seq_vec, home_team_emb], dim=-1))
        away_team_node = self.team_in(torch.cat([away_seq_vec, away_team_emb], dim=-1))

        # --- Player embeddings / nodes ---
        # home_players: [B, P] -> [B, P, D_p]
        home_player_emb = self.player_embedding(home_players)  # padding_idx=0 â†’ zeros where 0
        away_player_emb = self.player_embedding(away_players)

        home_player_node = self.player_in(home_player_emb)   # [B, P, G]
        away_player_node = self.player_in(away_player_emb)

        # --- Masks for real players (id != 0) ---
        home_mask = (home_players != 0).unsqueeze(-1).float()  # [B, P, 1]
        away_mask = (away_players != 0).unsqueeze(-1).float()

        # Ensure padded players stay zero
        home_player_node = home_player_node * home_mask
        away_player_node = away_player_node * away_mask

        # --- Message passing ---
        for _ in range(self.gnn_steps):
            # Players â†’ Team: masked mean
            home_count = home_mask.sum(dim=1).clamp(min=1.0)  # [B, 1]
            away_count = away_mask.sum(dim=1).clamp(min=1.0)

            home_players_mean = (home_player_node * home_mask).sum(dim=1) / home_count  # [B, G]
            away_players_mean = (away_player_node * away_mask).sum(dim=1) / away_count

            # Team update (residual)
            home_team_msg = torch.cat([home_team_node, home_players_mean], dim=-1)  # [B, 2G]
            away_team_msg = torch.cat([away_team_node, away_players_mean], dim=-1)

            home_team_delta = F.relu(self.team_update(home_team_msg))
            away_team_delta = F.relu(self.team_update(away_team_msg))

            home_team_node = home_team_node + home_team_delta
            away_team_node = away_team_node + away_team_delta

            # Team â†’ Players: broadcast team node to each player
            home_team_broadcast = home_team_node.unsqueeze(1).expand_as(home_player_node)  # [B, P, G]
            away_team_broadcast = away_team_node.unsqueeze(1).expand_as(away_player_node)

            home_player_msg = torch.cat([home_player_node, home_team_broadcast], dim=-1)  # [B, P, 2G]
            away_player_msg = torch.cat([away_player_node, away_team_broadcast], dim=-1)

            home_player_delta = F.relu(self.player_update(home_player_msg))
            away_player_delta = F.relu(self.player_update(away_player_msg))

            # Residual + mask
            home_player_node = (home_player_node + home_player_delta) * home_mask
            away_player_node = (away_player_node + away_player_delta) * away_mask

        # --- Final pooling of players ---
        home_players_final = (home_player_node * home_mask).sum(dim=1) / home_count  # [B, G]
        away_players_final = (away_player_node * away_mask).sum(dim=1) / away_count

        # --- Final pairwise representation ---
        pair_vec = torch.cat([
            home_team_node,
            away_team_node,
            home_players_final,
            away_players_final,
        ], dim=-1)  # [B, 4G]

        # new extra dropout
        pair_vec = self.dropout(pair_vec)
        y_pred = self.mlp(pair_vec)  # [B, 2]
        return y_pred


class ScorePredictorCrossAttention(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int = 128,
        num_layers: int = 1,
        num_heads: int = 4,
        num_teams: int = None,
        team_emb_dim: int = 16,
    ):
        super().__init__()
        self.embed_dim = hidden_size * 2  # BiLSTM
        self.team_emb_dim = team_emb_dim

        self.encoder = TeamSequenceEncoder(input_size, hidden_size, num_layers)

        self.cross_attn = nn.MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=num_heads,
            batch_first=True,
        )

        self.team_embedding = nn.Embedding(num_teams, team_emb_dim)

        pair_input_dim = self.embed_dim * 2 + team_emb_dim * 2

        self.mlp = nn.Sequential(
            nn.Linear(pair_input_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 2),  # [margin, total]
        )

    def forward(self, x_home, x_away, home_team_idx, away_team_idx):
        # Encode sequences
        h_home_seq = self.encoder(x_home)   # [B, T, 2H]
        h_away_seq = self.encoder(x_away)   # [B, T, 2H]

        # Home attends to away
        home_ctx, _ = self.cross_attn(
            query=h_home_seq,
            key=h_away_seq,
            value=h_away_seq,
        )

        # Away attends to home
        away_ctx, _ = self.cross_attn(
            query=h_away_seq,
            key=h_home_seq,
            value=h_home_seq,
        )

        # Pool over time
        home_vec = home_ctx.mean(dim=1)   # [B, 2H]
        away_vec = away_ctx.mean(dim=1)   # [B, 2H]

        # Team embeddings
        home_emb = self.team_embedding(home_team_idx)  # [B, D]
        away_emb = self.team_embedding(away_team_idx)  # [B, D]

        pair_vec = torch.cat([home_vec, away_vec, home_emb, away_emb], dim=-1)
        y_pred = self.mlp(pair_vec)
        return y_pred



In [31]:
def run_epoch(loader, train: bool = True, model=None, use_players: bool = False):
    if model is None:
        raise RuntimeError("model not set")

    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    all_true = []
    all_pred = []

    for batch in loader:
        #
        # ---- UNPACK BASED ON FLAG ----
        #
        if use_players:
            (
                x_home, x_away, y,
                home_idx, away_idx,
                home_players, away_players,
            ) = batch

            x_home        = x_home.to(device)
            x_away        = x_away.to(device)
            y             = y.to(device)
            home_idx      = home_idx.to(device)
            away_idx      = away_idx.to(device)
            home_players  = home_players.to(device)
            away_players  = away_players.to(device)
        else:
            #
            # batch may be length 5 or 7 depending on DataLoader
            #
            if len(batch) == 5:
                x_home, x_away, y, home_idx, away_idx = batch
            else:
                # ignore extra fields if they exist
                x_home, x_away, y, home_idx, away_idx, *_ = batch

            x_home   = x_home.to(device)
            x_away   = x_away.to(device)
            y        = y.to(device)
            home_idx = home_idx.to(device)
            away_idx = away_idx.to(device)

        if train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(train):
            if use_players:
                # ðŸ§  GNN-style model
                y_pred = model(
                    x_home, x_away,
                    home_idx, away_idx,
                    home_players, away_players,
                )
            else:
                # ðŸ§  non-player baseline model
                y_pred = model(
                    x_home, x_away,
                    home_idx, away_idx,
                )

            loss = criterion(y_pred, y)

            if train:
                loss.backward()
                optimizer.step()

        total_loss += loss.item() * y.size(0)
        all_true.append(y.detach().cpu().numpy())
        all_pred.append(y_pred.detach().cpu().numpy())

    all_true = np.concatenate(all_true, axis=0)
    all_pred = np.concatenate(all_pred, axis=0)

    # Unscale BEFORE metrics
    all_true_unscaled = y_scaler.inverse_transform(all_true)
    all_pred_unscaled = y_scaler.inverse_transform(all_pred)

    mae = mean_absolute_error(all_true_unscaled, all_pred_unscaled)
    rmse = math.sqrt(mean_squared_error(all_true_unscaled, all_pred_unscaled))
    avg_loss = total_loss / len(loader.dataset)

    return avg_loss, mae, rmse


In [32]:
input_size = len(SEQ_FEATURES)

P = HOME_PLAYER_IDX.shape[1]   # 10 in your current setup

model_a = ScorePredictorGNN(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_teams=num_teams,
    num_players=num_players,
    team_emb_dim=16,
    player_emb_dim=32,
    gnn_hidden_dim=64,
    gnn_steps=2,
).to(device)

model_b = ScorePredictorCrossAttention(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_heads=4,
    num_teams=num_teams,
    team_emb_dim=16,
).to(device)



print(model_a)
print(model_b)


ScorePredictorGNN(
  (encoder): TeamSequenceEncoder(
    (lstm): LSTM(36, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (team_embedding): Embedding(53, 16)
  (player_embedding): Embedding(4832, 32, padding_idx=0)
  (team_in): Linear(in_features=144, out_features=64, bias=True)
  (player_in): Linear(in_features=32, out_features=64, bias=True)
  (team_update): Linear(in_features=128, out_features=64, bias=True)
  (player_update): Linear(in_features=128, out_features=64, bias=True)
  (mlp): Sequential(
    (0): Linear(in_features=256, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=2, bias=True)
  )
  (dropout): Dropout(p=0.3, inplace=False)
)
ScorePredictorCrossAttention(
  (encoder): TeamSequenceEncoder(
    (lstm): LSTM(36, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (cross_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(

In [33]:
criterion = nn.SmoothL1Loss()

optimizer = torch.optim.AdamW(
    model_a.parameters(),
    lr=3e-4,
    weight_decay=1e-4
)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',      # lower RMSE is better
    factor=0.5,      # reduce LR by half
    patience=2,      # wait 2 epochs before dropping LR
)

best_val_rmse = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae, train_rmse = run_epoch(
        train_loader, train=True, model=model_a, use_players=True
    )
    val_loss, val_mae, val_rmse = run_epoch(
        val_loader, train=False, model=model_a, use_players=True
    )

    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss {train_loss:.3f}, MAE {train_mae:.3f}, RMSE {train_rmse:.3f} | "
        f"Val Loss {val_loss:.3f}, MAE {val_mae:.3f}, RMSE {val_rmse:.3f}"
    )

    # ðŸ”¥ IMPORTANT â†’ Notify the scheduler
    scheduler.step(val_rmse)  # or val_loss if you prefer loss

    # ðŸ”¥ Standard early stopping capture
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_state = model_a.state_dict()

Epoch 01 | Train Loss 0.368, MAE 13.137, RMSE 17.024 | Val Loss 0.392, MAE 13.441, RMSE 17.153
Epoch 02 | Train Loss 0.335, MAE 12.342, RMSE 16.017 | Val Loss 0.384, MAE 13.258, RMSE 16.930
Epoch 03 | Train Loss 0.330, MAE 12.247, RMSE 15.851 | Val Loss 0.384, MAE 13.301, RMSE 16.968
Epoch 04 | Train Loss 0.326, MAE 12.150, RMSE 15.732 | Val Loss 0.383, MAE 13.240, RMSE 16.881
Epoch 05 | Train Loss 0.324, MAE 12.092, RMSE 15.656 | Val Loss 0.384, MAE 13.290, RMSE 16.963
Epoch 06 | Train Loss 0.321, MAE 12.029, RMSE 15.588 | Val Loss 0.383, MAE 13.267, RMSE 16.934
Epoch 07 | Train Loss 0.320, MAE 12.011, RMSE 15.552 | Val Loss 0.386, MAE 13.350, RMSE 17.025
Epoch 08 | Train Loss 0.316, MAE 11.903, RMSE 15.425 | Val Loss 0.384, MAE 13.296, RMSE 16.959
Epoch 09 | Train Loss 0.314, MAE 11.879, RMSE 15.411 | Val Loss 0.384, MAE 13.291, RMSE 16.944
Epoch 10 | Train Loss 0.314, MAE 11.865, RMSE 15.399 | Val Loss 0.384, MAE 13.303, RMSE 16.955
Epoch 11 | Train Loss 0.311, MAE 11.799, RMSE 15.3

In [34]:
model_a.load_state_dict(best_state)
test_loss, test_mae, test_rmse = run_epoch(test_loader, train=False, model=model_a, use_players=True)
print(f"Test Loss {test_loss:.3f}, MAE {test_mae:.3f}, RMSE {test_rmse:.3f}")

Test Loss 0.360, MAE 12.869, RMSE 16.644


In [35]:
# TRAIN MODEL B
criterion = nn.SmoothL1Loss()

optimizer = torch.optim.AdamW(
    model_b.parameters(),
    lr=LR,
    weight_decay=1e-4
)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',      # lower RMSE is better
    factor=0.5,      # reduce LR by half
    patience=2,      # wait 2 epochs before dropping LR
)

best_val_rmse = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae, train_rmse = run_epoch(
        train_loader, train=True, model=model_b, use_players=False
    )
    val_loss, val_mae, val_rmse = run_epoch(
        val_loader, train=False, model=model_b, use_players=False
    )

    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss {train_loss:.3f}, MAE {train_mae:.3f}, RMSE {train_rmse:.3f} | "
        f"Val Loss {val_loss:.3f}, MAE {val_mae:.3f}, RMSE {val_rmse:.3f}"
    )

    # ðŸ”¥ IMPORTANT â†’ Notify the scheduler
    scheduler.step(val_rmse)  # or val_loss if you prefer loss

    # ðŸ”¥ Standard early stopping capture
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_state = model_b.state_dict()

Epoch 01 | Train Loss 0.380, MAE 13.461, RMSE 17.379 | Val Loss 0.405, MAE 13.668, RMSE 17.421
Epoch 02 | Train Loss 0.351, MAE 12.705, RMSE 16.356 | Val Loss 0.392, MAE 13.407, RMSE 17.136
Epoch 03 | Train Loss 0.335, MAE 12.358, RMSE 15.990 | Val Loss 0.384, MAE 13.226, RMSE 16.911
Epoch 04 | Train Loss 0.332, MAE 12.274, RMSE 15.869 | Val Loss 0.386, MAE 13.298, RMSE 17.022
Epoch 05 | Train Loss 0.330, MAE 12.242, RMSE 15.830 | Val Loss 0.382, MAE 13.201, RMSE 16.874
Epoch 06 | Train Loss 0.328, MAE 12.200, RMSE 15.778 | Val Loss 0.382, MAE 13.184, RMSE 16.849
Epoch 07 | Train Loss 0.328, MAE 12.171, RMSE 15.762 | Val Loss 0.384, MAE 13.234, RMSE 16.923
Epoch 08 | Train Loss 0.327, MAE 12.154, RMSE 15.726 | Val Loss 0.382, MAE 13.197, RMSE 16.871
Epoch 09 | Train Loss 0.326, MAE 12.137, RMSE 15.699 | Val Loss 0.382, MAE 13.202, RMSE 16.889
Epoch 10 | Train Loss 0.324, MAE 12.091, RMSE 15.648 | Val Loss 0.383, MAE 13.209, RMSE 16.890
Epoch 11 | Train Loss 0.324, MAE 12.095, RMSE 15.6

In [36]:
model_b.load_state_dict(best_state)
test_loss, test_mae, test_rmse = run_epoch(test_loader, train=False, model=model_b, use_players=False)
print(f"Test Loss {test_loss:.3f}, MAE {test_mae:.3f}, RMSE {test_rmse:.3f}")


Test Loss 0.351, MAE 12.626, RMSE 16.375


In [37]:
# Using your train split


def evaluate_constant_baseline(Y_true, const_pred):
    const = np.tile(const_pred, (Y_true.shape[0], 1))
    mae = mean_absolute_error(Y_true, const)
    rmse = math.sqrt(mean_squared_error(Y_true, const))
    return mae, rmse

baseline_mae, baseline_rmse = evaluate_constant_baseline(Y_test_raw, train_mean_scores)
print(f"Constant baseline | MAE {baseline_mae:.3f}, RMSE {baseline_rmse:.3f}")



Constant baseline | MAE 16.986, RMSE 22.370


In [38]:
def get_predictions(model, loader, use_players: bool):
    model.eval()
    all_true = []
    all_pred = []

    with torch.no_grad():
        for batch in loader:
            if use_players:
                # New dataset: 7-tuple
                (
                    x_home, x_away, y,
                    home_idx, away_idx,
                    home_players, away_players,
                ) = batch

                x_home = x_home.to(device)
                x_away = x_away.to(device)
                y = y.to(device)
                home_idx = home_idx.to(device)
                away_idx = away_idx.to(device)
                home_players = home_players.to(device)
                away_players = away_players.to(device)

                # âœ… GNN / player-aware model
                y_pred = model(
                    x_home, x_away,
                    home_idx, away_idx,
                    home_players, away_players,
                )

            else:
                # Old models (5-input), but handle both 5- and 7-tuples gracefully
                if len(batch) == 5:
                    x_home, x_away, y, home_idx, away_idx = batch
                elif len(batch) >= 7:
                    x_home, x_away, y, home_idx, away_idx, *_ = batch
                else:
                    raise ValueError(f"Unexpected batch length: {len(batch)}")

                x_home = x_home.to(device)
                x_away = x_away.to(device)
                y = y.to(device)
                home_idx = home_idx.to(device)
                away_idx = away_idx.to(device)

                # âœ… old MLP / cross-attn model
                y_pred = model(x_home, x_away, home_idx, away_idx)

            all_true.append(y.cpu().numpy())
            all_pred.append(y_pred.cpu().numpy())

    all_true = np.concatenate(all_true, axis=0)  # scaled
    all_pred = np.concatenate(all_pred, axis=0)  # scaled

    # Unscale before returning
    all_true_unscaled = y_scaler.inverse_transform(all_true)
    all_pred_unscaled = y_scaler.inverse_transform(all_pred)

    return all_true_unscaled, all_pred_unscaled


Y_true_test, Y_pred_a = get_predictions(model_a, test_loader, use_players=True)
_, Y_pred_b = get_predictions(model_b, test_loader, use_players=False)


# True margin/total
true_margin = Y_true_test[:, 0]
true_total  = Y_true_test[:, 1]

# Reconstruct true scores
true_home = (true_total + true_margin) / 2
true_away = (true_total - true_margin) / 2

# Model A margin/total
pred_margin_a = Y_pred_a[:, 0]
pred_total_a  = Y_pred_a[:, 1]
pred_home_a   = (pred_total_a + pred_margin_a) / 2
pred_away_a   = (pred_total_a - pred_margin_a) / 2

# Model B margin/total
pred_margin_b = Y_pred_b[:, 0]
pred_total_b  = Y_pred_b[:, 1]
pred_home_b   = (pred_total_b + pred_margin_b) / 2
pred_away_b   = (pred_total_b - pred_margin_b) / 2


def winner_accuracy(y_true, y_pred):
    true_margin = y_true[:, 0]
    pred_margin = y_pred[:, 0]
    return ((true_margin > 0) == (pred_margin > 0)).mean()

def margin_accuracy(y_true, y_pred):
    true_margin = y_true[:, 0]
    pred_margin = y_pred[:, 0]
    return (np.abs(true_margin - pred_margin) < 5).mean()

def totals_accuracy(y_true, y_pred):
    true_total = y_true[:, 1]
    pred_total = y_pred[:, 1]
    return (np.abs(true_total - pred_total) < 5).mean()


acc_a = winner_accuracy(Y_true_test, Y_pred_a)
acc_b = winner_accuracy(Y_true_test, Y_pred_b)
print(f"Model A winner accuracy: {acc_a:.3%}")
print(f"Model B winner accuracy: {acc_b:.3%}")

margin_a = margin_accuracy(Y_true_test, Y_pred_a)
margin_b = margin_accuracy(Y_true_test, Y_pred_b)
print(f"Model A margin accuracy (within 5 points): {margin_a:.3%}")
print(f"Model B margin accuracy (within 5 points): {margin_b:.3%}")

total_a = totals_accuracy(Y_true_test, Y_pred_a)
total_b = totals_accuracy(Y_true_test, Y_pred_b)
print(f"Model A totals accuracy (within 5 points): {total_a:.3%}")
print(f"Model B totals accuracy (within 5 points): {total_b:.3%}")


Model A winner accuracy: 61.377%
Model B winner accuracy: 60.942%
Model A margin accuracy (within 5 points): 29.203%
Model B margin accuracy (within 5 points): 29.855%
Model A totals accuracy (within 5 points): 20.870%
Model B totals accuracy (within 5 points): 22.971%


In [39]:
df_betting = pd.read_csv(f'{DATA_DIR}/nba_2008-2025.csv')
df_betting['game_date'] = pd.to_datetime(df_betting['date'])
df_betting = df_betting[df_betting['game_date'] >= ERA_START].reset_index(drop=True)

team_df = pd.read_csv(os.path.join(DATA_DIR, 'team.csv'))

In [40]:
team_map = dict(zip(team_df['id'], team_df['nickname']))

test_indices = np.where(test_mask)[0]
test_game_indices = []

# Rebuild to track which games_full indices match test_mask
idx = 0
for i, row in games_full.iterrows():
    gid = row[GAME_ID_COL]
    home_id = row["home_team_id"]
    away_id = row["away_team_id"]
    date = row[GAME_DATE_COL]

    key_home = (gid, home_id)
    key_away = (gid, away_id)

    if key_home not in seq_index_by_game_team or key_away not in seq_index_by_game_team:
        continue
    
    # This game is in our dataset
    if pd.to_datetime(date) >= TEST_SPLIT_DATE:
        test_game_indices.append(i)
    
    idx += 1

# Build test_predictions_df with correct games_full rows
test_predictions_df = pd.DataFrame({
    'game_date': games_full.iloc[test_game_indices][GAME_DATE_COL].values,
    'home_team': [team_map.get(games_full.iloc[i]['home_team_id'], 'UNK') 
                  for i in test_game_indices],
    'away_team': [team_map.get(games_full.iloc[i]['away_team_id'], 'UNK') 
                  for i in test_game_indices],
    'y_home': true_home,
    'y_away': true_away,
    'pred_home_a': pred_home_a,
    'pred_away_a': pred_away_a,
    'pred_home_b': pred_home_b,
    'pred_away_b': pred_away_b,
}).reset_index(drop=True)


In [41]:
abbreviation_mapping = {
    'atl': 'Hawks',
    'bos': 'Celtics',
    'bkn': 'Nets',
    'cha': 'Hornets',
    'chi': 'Bulls',
    'cle': 'Cavaliers',
    'dal': 'Mavericks',
    'den': 'Nuggets',
    'det': 'Pistons',
    'gs': 'Warriors',
    'hou': 'Rockets',
    'ind': 'Pacers',
    'lac': 'Clippers',
    'lal': 'Lakers',
    'mem': 'Grizzlies',
    'mia': 'Heat',
    'mil': 'Bucks',
    'min': 'Timberwolves',
    'no': 'Pelicans',
    'ny': 'Knicks',
    'okc': 'Thunder',
    'orl': 'Magic',
    'phi': '76ers',
    'phx': 'Suns',
    'por': 'Trail Blazers',
    'sac': 'Kings',
    'sa': 'Spurs',
    'tor': 'Raptors',
    'utah': 'Jazz',
    'wsh': 'Wizards'
}

df_betting['away_team'] = df_betting['away'].map(abbreviation_mapping)
df_betting['home_team'] = df_betting['home'].map(abbreviation_mapping)
# Rename date to game_date to match test_predictions_df
df_betting['game_date'] = pd.to_datetime(df_betting['date'])
# Convert the spread to negative if the away team is favored
df_betting['spread'] = df_betting.apply(
    lambda row: -row['spread'] if 'away' == row['whos_favored'] else row['spread'],
    axis=1
)

# Print modified columns for verification
print(df_betting[['game_date', 'whos_favored', 'spread', 'home_team', 'away_team']])

       game_date whos_favored  spread      home_team away_team
0     2010-10-26         away    -1.0        Celtics      Heat
1     2010-10-26         home     7.0  Trail Blazers      Suns
2     2010-10-26         home     6.5         Lakers   Rockets
3     2010-10-27         home     4.0           Nets   Pistons
4     2010-10-27         away    -4.5      Cavaliers   Celtics
...          ...          ...     ...            ...       ...
19170 2025-06-11         away    -4.5         Pacers   Thunder
19171 2025-06-13         away    -6.5         Pacers   Thunder
19172 2025-06-16         home     8.5        Thunder    Pacers
19173 2025-06-19         away    -5.5         Pacers   Thunder
19174 2025-06-22         home     6.5        Thunder    Pacers

[19175 rows x 5 columns]


In [42]:
# Merge betting data with test predictions on date, home_team, away_team
merged_df = pd.merge(
    test_predictions_df,
    df_betting,
    on=['game_date', 'home_team', 'away_team'],
    how='inner'
)
print("Merged betting data shape:", merged_df.shape)

Merged betting data shape: (1314, 36)


In [48]:
def predict_betting_results(df):
    """
    The spread will always be from the home team's perspective, if it is negative the away team is favored.
    The total is the combined score of both teams.

    Returns:
        spread_record_a: wins-losses-pushes for model A spread bets
        total_record_a: wins-losses-pushes for model A total bets
        spread_record_b: wins-losses-pushes for model B spread bets
        total_record_b: wins-losses-pushes for model B total bets
    """
    for game in df:
        # Model A predictions
        pred_home_a = game['pred_home_a']
        pred_away_a = game['pred_away_a']
        pred_margin_a = pred_home_a - pred_away_a
        pred_total_a = pred_home_a + pred_away_a

        # Model B predictions
        pred_home_b = game['pred_home_b']
        pred_away_b = game['pred_away_b']
        pred_margin_b = pred_home_b - pred_away_b
        pred_total_b = pred_home_b + pred_away_b

        # Actual results
        actual_home = game['y_home']
        actual_away = game['y_away']
        actual_margin = actual_home - actual_away
        actual_total = actual_home + actual_away

        spread = game['spread']
        total_line = game['total']

        # Spread bet results for Model A
        if (spread) > 0: # Home team Favoured
            if (actual_margin - spread) > 0: # If home team covers
                if (pred_margin_a - spread) > 0: # Predicted home team covers
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            elif (actual_margin - spread) < 0: # If home team fails to cover
                if (pred_margin_a - spread) < 0: # Predicted home team fails to cover
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            else: # Push
                game['spread_result_a'] = 'P'
        elif (spread) < 0: # Away team Favoured
            if (actual_margin - spread) < 0: # If away team covers
                if (pred_margin_a - spread) < 0: # Predicted away team covers
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            elif (actual_margin - spread) > 0: # If away team fails to cover
                if (pred_margin_a - spread) > 0: # Predicted away team fails to cover
                    game['spread_result_a'] = 'W'
                else:
                    game['spread_result_a'] = 'L'
            else: # Push
                game['spread_result_a'] = 'P'
        else: # Spread is 0, no favorite
            if actual_margin > 0 and pred_margin_a > 0: # Home team wins, predicted home team wins
                game['spread_result_a'] = 'W'
            elif actual_margin < 0 and pred_margin_a < 0: # Away team wins, predicted away team wins
                game['spread_result_a'] = 'W'
            else: # One team wins, predicted the other team wins
                game['spread_result_a'] = 'L'


         # Spread bet results for Model B
        if (spread) > 0: # Home team Favoured
            if (actual_margin - spread) > 0: # If home team covers
                if (pred_margin_b - spread) > 0: # Predicted home team covers
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            elif (actual_margin - spread) < 0: # If home team fails to cover
                if (pred_margin_b - spread) < 0: # Predicted home team fails to cover
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            else: # Push
                game['spread_result_b'] = 'P'
        elif (spread) < 0: # Away team Favoured
            if (actual_margin - spread) < 0: # If away team covers
                if (pred_margin_b - spread) < 0: # Predicted away team covers
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            elif (actual_margin - spread) > 0: # If away team fails to cover
                if (pred_margin_b - spread) > 0: # Predicted away team fails to cover
                    game['spread_result_b'] = 'W'
                else:
                    game['spread_result_b'] = 'L'
            else: # Push
                game['spread_result_b'] = 'P'
        else: # Spread is 0, no favorite
            if actual_margin > 0 and pred_margin_a > 0: # Home team wins, predicted home team wins
                game['spread_result_b'] = 'W'
            elif actual_margin < 0 and pred_margin_a < 0: # Away team wins, predicted away team wins
                game['spread_result_b'] = 'W'
            else: # One team wins, predicted the other team wins
                game['spread_result_b'] = 'L'

        # Total bet results for Model A
        if (actual_total > total_line): # Over pays
            if (pred_total_a > total_line):
                game['total_result_a'] = 'W'
            else:
                game['total_result_a'] = 'L'
        elif (actual_total < total_line): # Under Pays
            if (pred_total_a < total_line):
                game['total_result_a'] = 'W'
            else:
                game['total_result_a'] = 'L'
        else: # Push = Exactly on the total line, bet refunded
            game['total_result_a'] = 'P'

        # Total bet results for Model B
        if (actual_total > total_line): # Over pays
            if (pred_total_b > total_line):
                game['total_result_b'] = 'W'
            else:
                game['total_result_b'] = 'L'
        elif (actual_total < total_line): # Under Pays
            if (pred_total_b < total_line):
                game['total_result_b'] = 'W'
            else:
                game['total_result_b'] = 'L'
        else: # Push = Exactly on the total line, bet refunded
            game['total_result_b'] = 'P'
            
    # Calculate records
    spread_record_a = {'W': 0, 'L': 0, 'P': 0}
    total_record_a = {'W': 0, 'L': 0, 'P': 0}
    spread_record_b = {'W': 0, 'L': 0, 'P': 0}
    total_record_b = {'W': 0, 'L': 0, 'P': 0}
    for game in df:
        spread_record_a[game['spread_result_a']] += 1
        total_record_a[game['total_result_a']] += 1
        spread_record_b[game['spread_result_b']] += 1
        total_record_b[game['total_result_b']] += 1
    return spread_record_a, total_record_a, spread_record_b, total_record_b

spread_record_a, total_record_a, spread_record_b, total_record_b = predict_betting_results(merged_df.to_dict('records'))
print("Model A Spread Record (W-L-P):", spread_record_a)
print("Model A Total Record (W-L-P):", total_record_a)
print("Model B Spread Record (W-L-P):", spread_record_b)
print("Model B Total Record (W-L-P):", total_record_b)
    

Model A Spread Record (W-L-P): {'W': 664, 'L': 637, 'P': 13}
Model A Total Record (W-L-P): {'W': 670, 'L': 635, 'P': 9}
Model B Spread Record (W-L-P): {'W': 659, 'L': 642, 'P': 13}
Model B Total Record (W-L-P): {'W': 673, 'L': 632, 'P': 9}


In [49]:
# create a summary betting DF
betting_summary = pd.DataFrame({
    'Model': ['A', 'A', 'B', 'B'],
    'Bet Type': ['Spread', 'Total', 'Spread', 'Total'],
    'Wins': [spread_record_a['W'], total_record_a['W'], spread_record_b['W'], total_record_b['W']],
    'Losses': [spread_record_a['L'], total_record_a['L'], spread_record_b['L'], total_record_b['L']],
    'Pushes': [spread_record_a['P'], total_record_a['P'], spread_record_b['P'], total_record_b['P']],
    'Win Ratio': [
        spread_record_a['W'] / (spread_record_a['W'] + spread_record_a['L']),
        total_record_a['W'] / (total_record_a['W'] + total_record_a['L']),
        spread_record_b['W'] / (spread_record_b['W'] + spread_record_b['L']),
        total_record_b['W'] / (total_record_b['W'] + total_record_b ['L']),
    ]
})
print(betting_summary)

  Model Bet Type  Wins  Losses  Pushes  Win Ratio
0     A   Spread   664     637      13   0.510377
1     A    Total   670     635       9   0.513410
2     B   Spread   659     642      13   0.506533
3     B    Total   673     632       9   0.515709
