In [1]:
import pandas as pd
df = pd.read_csv(
'/Users/mavinjames/Projects/Basketball_Modeling/NCAAB_Sports_Reference_Scraper/data/merged_dataset.csv')

In [2]:
cols = [
    "ft_pct","opp_ft_pct","fg_pct","fg3a","fg3_pct","fg","fga","fg2a","fg3","ft","fta",
    "efg_pct","fg2_pct","orb","drb","opp_fg3","opp_fg_pct","blk","opp_fg","pf","tov",
    "ast","stl","trb","team_game_result","opp_efg_pct","opp_ft","opp_fta","opp_orb",
    "opp_drb","opp_trb","opp_ast","opp_stl","opp_blk","opp_tov","opp_fg3a","opp_fg3_pct",
    "opp_fg2","opp_fg2a","opp_fg2_pct","opp_fga","team_possessions","opp_possessions",
    "opp_pf","possessions","off_rtg","def_rtg","win","fg2","net_rtg",
    "team_game_num_season","opp_opp_team_game_score","team_game_score","score_diff",
    "opp_team_game_score"
]

df = df.dropna(subset=cols)

In [3]:
features = ['is_Home','cum_off_rtg',
 'cum_def_rtg',
 'cum_net_rtg',
 'win_pct_last_10',
 'efg_pct_last_5',
 'efg_pct_last_10',
 'avg_fta_last_5',
 'avg_fta_last_10',
 'avg_ast_last_5',
 'avg_ast_last_10',
 'avg_trb_last_5',
 'avg_trb_last_10',
 'avg_orb_last_5',
 'avg_orb_last_10',
 'avg_tov_last_5',
 'avg_tov_last_10',
 'avg_team_game_score_last_5',
 'avg_team_game_score_last_10',
 'avg_opp_team_game_score_last_5',
 'avg_opp_team_game_score_last_10',
 'avg_score_diff_last_5',
 'avg_score_diff_last_10',
 'avg_possessions_last_5',
 'avg_possessions_last_10',
 'rest_days',
 'opp_rest_days',
 'opp_win_pct_last_10',
 'opp_efg_pct_last_5',
 'opp_efg_pct_last_10',
 'opp_avg_fta_last_5',
 'opp_avg_fta_last_10',
 'opp_avg_ast_last_5',
 'opp_avg_ast_last_10',
 'opp_avg_trb_last_5',
 'opp_avg_trb_last_10',
 'opp_avg_orb_last_5',
 'opp_avg_orb_last_10',
 'opp_avg_tov_last_5',
 'opp_avg_tov_last_10',
 'opp_avg_team_game_score_last_5',
 'opp_avg_team_game_score_last_10',
 'opp_avg_opp_team_game_score_last_5',
 'opp_avg_opp_team_game_score_last_10',
 'opp_avg_score_diff_last_5',
 'opp_avg_score_diff_last_10',
 'opp_avg_possessions_last_5',
 'opp_avg_possessions_last_10',
 'opp_cum_off_rtg',
 'opp_cum_def_rtg',
 'opp_cum_net_rtg',
 'avg_score_comp_last_10',
 'efg_comp_last_10',
 'avg_tov_comp_last_10',
 'avg_orb_comp_last_10',
 'avg_fta_comp_last_10',
 'rest_days_comp', 'net_rtg_comp',
 'home_road_split_comp', "pace_mismatch_signed"]  # your cols

target = "score_diff"

corrs = df[features + [target]].corr()[target].sort_values(ascending=False)
corrs

score_diff                    1.000000
net_rtg_comp                  0.526028
is_Home                       0.349765
avg_score_diff_last_10        0.336407
cum_net_rtg                   0.330177
                                ...   
opp_avg_ast_last_10          -0.170005
opp_win_pct_last_10          -0.283551
opp_avg_score_diff_last_5    -0.313047
opp_cum_net_rtg              -0.330177
opp_avg_score_diff_last_10   -0.336407
Name: score_diff, Length: 61, dtype: float64

In [4]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# =========================================================
# CONFIG
# =========================================================
K = 20                     # residual cap
HCA_ALPHA = 5.0            # ridge strength for baseline
SEASONS_TRAIN = [2023, 2024, 2025]
SEASON_TEST = 2026

df["net_rtg_home_interaction"] = df["net_rtg_comp"] * df["is_Home"]

# =========================================================
# SPLIT DATA
# =========================================================
train_df = df[df["season"].isin(SEASONS_TRAIN)].copy()
test_df  = df[df["season"] == SEASON_TEST].copy()

# =========================================================
# MODEL 1: BASELINE / EXPECTED MARGIN MODEL
# =========================================================
baseline_features = [
    "net_rtg_comp",
    "is_Home",
    "net_rtg_home_interaction"
    # optionally add: "season_progress"
]

Xb_train = train_df[baseline_features]
yb_train = train_df["score_diff"].astype(float)

mask_b = Xb_train.notna().all(axis=1) & yb_train.notna()
Xb_train, yb_train = Xb_train[mask_b], yb_train[mask_b]

baseline_model = Ridge(alpha=HCA_ALPHA)
baseline_model.fit(Xb_train, yb_train)

# =========================================================
# BUILD EXPECTED MARGIN
# =========================================================
train_df["expected_margin"] = baseline_model.predict(
    train_df[baseline_features]
)

test_df["expected_margin"] = baseline_model.predict(
    test_df[baseline_features]
)

# =========================================================
# MODEL 2: RESIDUAL / ADJUSTMENT MODEL
# =========================================================
residual_features = [
    "avg_score_comp_last_10",
    "efg_comp_last_10",
    "avg_tov_comp_last_10",
    "pace_mismatch_signed",
    "avg_orb_comp_last_10",
    "avg_fta_comp_last_10",
    "rest_days_comp",
    "home_road_split_comp",
]

# Residual target
train_df["residual_margin"] = (
    train_df["score_diff"] - train_df["expected_margin"]
)
train_df["residual_margin_capped"] = train_df["residual_margin"].clip(-K, K)

Xr_train = train_df[residual_features]
yr_train = train_df["residual_margin_capped"].astype(float)

mask_r = Xr_train.notna().all(axis=1) & yr_train.notna()
Xr_train, yr_train = Xr_train[mask_r], yr_train[mask_r]

residual_model = XGBRegressor(
    n_estimators=600,
    max_depth=3,
    learning_rate=0.04,
    subsample=0.75,
    colsample_bytree=0.75,
    min_child_weight=20,
    reg_alpha=1.0,
    reg_lambda=3.0,
    objective="reg:pseudohubererror",
    random_state=42,
)

residual_model.fit(Xr_train, yr_train)

# =========================================================
# PREDICTION (TEST SET)
# =========================================================
Xr_test = test_df[residual_features]
y_test  = test_df["score_diff"].astype(float)

mask_test = Xr_test.notna().all(axis=1) & y_test.notna()
Xr_test, y_test = Xr_test[mask_test], y_test[mask_test]
expected_test = test_df.loc[mask_test, "expected_margin"]

pred_residual = residual_model.predict(Xr_test)
pred_final = expected_test.values + pred_residual

# =========================================================
# EVALUATION
# =========================================================
test_eval = test_df.loc[mask_test].copy()
test_eval["pred_final"] = pred_final
test_eval["abs_score_diff"] = test_eval["score_diff"].abs()

low_mask = test_eval["abs_score_diff"] < 18
high_mask = ~low_mask

print("MAE Low (<18):",
      mean_absolute_error(
          test_eval.loc[low_mask, "score_diff"],
          test_eval.loc[low_mask, "pred_final"]
      ))

print("MAE High (>=18):",
      mean_absolute_error(
          test_eval.loc[high_mask, "score_diff"],
          test_eval.loc[high_mask, "pred_final"]
      ))

print("RMSE:",
      np.sqrt(mean_squared_error(y_test, pred_final)))

print("R2:",
      r2_score(y_test, pred_final))


MAE Low (<18): 7.917737052334856
MAE High (>=18): 17.079342105352364
RMSE: 13.385973066033133
R2: 0.4097328859850269


In [5]:
features_2026 = pd.read_csv("/Users/mavinjames/Projects/Basketball_Modeling/NCAAB_Sports_Reference_Scraper/data/2026/features_2026.csv")
features_2026["date"] = pd.to_datetime(features_2026["date"])
features_2026["net_rtg_home_interaction"] = df["net_rtg_comp"] * df["is_Home"]

features_2026["pred_baseline"] = baseline_model.predict(features_2026[baseline_features])
features_2026["pred_residual"] = residual_model.predict(features_2026[residual_features])
features_2026['pred_final'] = features_2026["pred_baseline"] + features_2026["pred_residual"]

favored_df = features_2026[features_2026["pred_final"] > 0].copy()

# dedupe to one row per game
favored_df["team_a"] = favored_df[["school_name","opp_name_abbr"]].min(axis=1)
favored_df["team_b"] = favored_df[["school_name","opp_name_abbr"]].max(axis=1)
favored_df = favored_df.drop_duplicates(["date","team_a","team_b"])

predictions_df_fix = favored_df[["date","school_name","opp_name_abbr","pred_final", "pred_baseline", "pred_residual", "is_Home","team_game_score", "opp_team_game_score", "score_diff"]]
predictions_df_fix[predictions_df_fix["date"] == '2026-01-29']

Unnamed: 0,date,school_name,opp_name_abbr,pred_final,pred_baseline,pred_residual,is_Home,team_game_score,opp_team_game_score,score_diff
135,2026-01-29,Albany (NY),NJIT,3.628077,5.003092,-1.375015,1.0,68.0,77.0,-9.0
488,2026-01-29,Binghamton,Bryant,0.684075,4.125826,-3.441751,1.0,63.0,60.0,3.0
756,2026-01-29,Cal State Bakersfield,Cal Poly,2.523709,3.615093,-1.091385,1.0,79.0,104.0,-25.0
798,2026-01-29,Cal State Northridge,UC Davis,1.151068,2.209273,-1.058205,1.0,94.0,78.0,16.0
862,2026-01-29,Campbell,Stony Brook,1.603575,1.649303,-0.045728,1.0,69.0,81.0,-12.0
961,2026-01-29,Charleston Southern,Radford,5.195458,5.992549,-0.797091,1.0,75.0,84.0,-9.0
1496,2026-01-29,Drexel,Hampton,4.565988,5.660853,-1.094864,1.0,58.0,51.0,7.0
1579,2026-01-29,East Tennessee State,Western Carolina,14.622505,14.626226,-0.003721,1.0,88.0,90.0,-2.0
1703,2026-01-29,Elon,William & Mary,2.855716,2.669415,0.186301,1.0,79.0,76.0,3.0
1824,2026-01-29,Florida Atlantic,Memphis,0.157109,-1.096287,1.253396,0.0,65.0,92.0,-27.0


In [6]:
predictions_df_fix.to_csv('/Users/mavinjames/Projects/Basketball_Modeling/NCAAB_Sports_Reference_Scraper/data/predictions.csv')