## 1. Load Imports and Processed Data

In [1]:
import sys
from pathlib import Path
import pandas as pd

ROOT = Path("..").resolve()
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.paths import PROCESSED_DIR

games = pd.read_parquet(PROCESSED_DIR / "processed_games.parquet")
games.head()

Unnamed: 0,date,start_et,away_team,home_team,attend,arena,season,source_file,home_win,home_win_pct_10,...,home_season_win_pct,home_recent_win_pct_20g,home_days_rest,home_last_pd,away_win_pct_10,away_avg_pd_10,away_season_win_pct,away_recent_win_pct_20g,away_days_rest,away_last_pd
0,2015-10-29,7:00p,Memphis Grizzlies,Indiana Pacers,18165.0,Bankers Life Fieldhouse,2015-16_NBA,oct.xls,0,0.0,...,0.0,0.0,1.0,-1.0,0.0,-1.0,0.0,0.0,1.0,-1.0
1,2015-10-29,8:00p,Atlanta Hawks,New York Knicks,19812.0,Madison Square Garden (IV),2015-16_NBA,oct.xls,0,1.0,...,1.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,2.0,-1.0
2,2015-10-29,10:30p,Dallas Mavericks,Los Angeles Clippers,19218.0,STAPLES Center,2015-16_NBA,oct.xls,1,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,2015-10-30,10:30p,Portland Trail Blazers,Phoenix Suns,18055.0,Talking Stick Resort Arena,2015-16_NBA,oct.xls,1,0.0,...,0.0,0.0,2.0,-1.0,1.0,1.0,1.0,1.0,2.0,1.0
4,2015-10-30,10:00p,Los Angeles Lakers,Sacramento Kings,17391.0,Sleep Train Arena,2015-16_NBA,oct.xls,1,0.0,...,0.0,0.0,2.0,-1.0,0.0,-1.0,0.0,0.0,2.0,-1.0


## 2. Add New Features

### Rebuild Team-Games Table (per team, per game)

In [2]:


home_df = games[["date", "home_team", "away_team", "home_win"]].copy()
home_df["team"] = home_df["home_team"]
home_df["opponents"] = home_df["away_team"]
home_df["is_home"] = 1
home_df["win"] = home_df["home_win"]

away_df = games[["date", "away_team", "home_team", "home_win"]].copy()
away_df["team"] = away_df["away_team"]
away_df["opponent"] = away_df["home_team"]
away_df["is_home"] = 0
away_df["win"] = 1 - away_df["home_win"]

team_games = pd.concat([home_df, away_df], ignore_index=True)
team_games = team_games.sort_values(["team", "date"]).reset_index(drop=True)

team_games.head()

Unnamed: 0,date,home_team,away_team,home_win,team,opponents,is_home,win,opponent
0,2015-10-29,New York Knicks,Atlanta Hawks,0,Atlanta Hawks,,0,1,New York Knicks
1,2015-10-30,Atlanta Hawks,Charlotte Hornets,1,Atlanta Hawks,Charlotte Hornets,1,1,
2,2015-11-01,Charlotte Hornets,Atlanta Hawks,0,Atlanta Hawks,,0,1,Charlotte Hornets
3,2015-11-03,Miami Heat,Atlanta Hawks,0,Atlanta Hawks,,0,1,Miami Heat
4,2015-11-04,Atlanta Hawks,Brooklyn Nets,1,Atlanta Hawks,Brooklyn Nets,1,1,


### Compute Rest Days and Back-to-Back Flag

In [3]:
# Previous game date per team
team_games["prev_date"] = team_games.groupby("team")["date"].shift(1)

# Days of rest since last game
team_games["days_rest"] = (team_games["date"] - team_games["prev_date"]).dt.days
team_games["days_rest"] = team_games["days_rest"].fillna(7)  # assume 7 for first game

# Back-to-back indicator: played yesterday
team_games["is_b2b"] = (team_games["days_rest"] == 1).astype(int)

team_games[["team", "date", "prev_date", "days_rest", "is_b2b"]].head()

Unnamed: 0,team,date,prev_date,days_rest,is_b2b
0,Atlanta Hawks,2015-10-29,NaT,7.0,0
1,Atlanta Hawks,2015-10-30,2015-10-29,1.0,1
2,Atlanta Hawks,2015-11-01,2015-10-30,2.0,0
3,Atlanta Hawks,2015-11-03,2015-11-01,2.0,0
4,Atlanta Hawks,2015-11-04,2015-11-03,1.0,1


### Merge Home/Away B2B Flags Back Into `games`

In [4]:
# Home B2B
home_b2b = (
    team_games[team_games["is_home"] == 1][["date", "team", "is_b2b"]]
    .rename(columns={"is_b2b": "home_b2b"})
)

games = games.merge(
    home_b2b,
    left_on=["date", "home_team"],
    right_on=["date", "team"],
    how="left",
)

# Away B2B
away_b2b = (
    team_games[team_games["is_home"] == 0][["date", "team", "is_b2b"]]
    .rename(columns={"is_b2b": "away_b2b"})
)

games = games.merge(
    away_b2b,
    left_on=["date", "away_team"],
    right_on=["date", "team"],
    how="left",
)

# Clean helper columns
games = games.drop(columns=["team_x", "team_y"], errors="ignore")

games[["date", "home_team", "away_team", "home_b2b", "away_b2b"]].head()

Unnamed: 0,date,home_team,away_team,home_b2b,away_b2b
0,2015-10-29,Indiana Pacers,Memphis Grizzlies,0,0
1,2015-10-29,New York Knicks,Atlanta Hawks,0,0
2,2015-10-29,Los Angeles Clippers,Dallas Mavericks,0,0
3,2015-10-30,Phoenix Suns,Portland Trail Blazers,0,0
4,2015-10-30,Sacramento Kings,Los Angeles Lakers,0,0


### Add B2B Features to List

In [5]:
feature_cols = [
    # Rolling performance 
    "home_win_pct_10", "away_win_pct_10",
    "home_avg_pd_10", "away_avg_pd_10",

    # Seasonal cumulative strength 
    "home_season_win_pct", "away_season_win_pct",

    # Recent from last 20 games
    "home_recent_win_pct_20g", "away_recent_win_pct_20g",

    # Last game point differential 
    "home_last_pd", "away_last_pd",

    # Back-to-back flags
    "home_b2b", "away_b2b"
]

### Rebuild X, y, and Train/Test Split

In [6]:
games_model = games.dropna(subset=feature_cols)

X = games_model[feature_cols]
y = games_model["home_win"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y
)

### Retrain Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

log_reg_b2b = LogisticRegression(max_iter=1000)
log_reg_b2b.fit(X_train, y_train)

preds_lr_b2b = log_reg_b2b.predict(X_test)
proba_lr_b2b = log_reg_b2b.predict_proba(X_test)[:,1]

acc_lr_b2b = accuracy_score(y_test, preds_lr_b2b)
auc_lr_b2b = roc_auc_score(y_test, proba_lr_b2b)

acc_lr_b2b, auc_lr_b2b

(0.631624674196351, 0.667127448263119)

#### **Logistic Regression + B2B Results**

After adding back-to-back (B2B) fatigue indicators for both home and away teams, model performance improved:

- **Accuracy:** 0.632  
- **ROC AUC:** 0.667  

**Summary:**  
B2B status adds meaningful predictive signal. The updated logistic regression model now achieves the highest AUC so far and sets a new benchmark for all future models.
****

## Random Forest with B2B

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf_b2b = RandomForestClassifier(
    n_estimators=300,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)

rf_b2b.fit(X_train, y_train)

preds_rf_b2b = rf_b2b.predict(X_test)
proba_rf_b2b = rf_b2b.predict_proba(X_test)[:,1]

acc_rf_b2b = accuracy_score(y_test, preds_rf_b2b)
auc_rf_b2b = roc_auc_score(y_test, proba_rf_b2b)

acc_rf_b2b, auc_rf_b2b

(0.6177237185056472, 0.6517341863759547)

## XGBoost with B2B

In [9]:
from xgboost import XGBClassifier

xgb_b2b = XGBClassifier(
    n_estimators=600,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb_b2b.fit(X_train, y_train)

preds_xgb_b2b = xgb_b2b.predict(X_test)
proba_xgb_b2b = xgb_b2b.predict_proba(X_test)[:,1]

acc_xgb_b2b = accuracy_score(y_test, preds_xgb_b2b)
auc_xgb_b2b = roc_auc_score(y_test, proba_xgb_b2b)

acc_xgb_b2b, auc_xgb_b2b

(0.6277150304083405, 0.6510482107661986)

## Results Table 

In [10]:
results_b2b = pd.DataFrame([
    {"model": "log_reg_b2b", "acc": acc_lr_b2b, "auc": auc_lr_b2b},
    {"model": "rf_b2b", "acc": acc_rf_b2b, "auc": auc_rf_b2b},
    {"model": "xgb_b2b", "acc": acc_xgb_b2b, "auc": auc_xgb_b2b},
])

results_b2b.sort_values("auc", ascending=False)

Unnamed: 0,model,acc,auc
0,log_reg_b2b,0.631625,0.667127
1,rf_b2b,0.617724,0.651734
2,xgb_b2b,0.627715,0.651048


### **Model Comparison with B2B Features**

| Model | Accuracy | AUC |
|-------|----------|------|
| Logistic Regression (B2B) | **0.632** | **0.667** |
| Random Forest (B2B) | 0.618 | 0.652 |
| XGBoost (B2B) | 0.628 | 0.651 |

**Summary:**  
Logistic Regression continues to be the top-performing model, even after adding back-to-back fatigue features. The linear model still captures the signal most effectively, while tree-based models show no improvement over earlier versions. This confirms that well-designed rolling and contextual features remain the strongest predictors in the current pipeline.