In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("spreadspoke_scores.csv")

# Drop rows with missing needed data
df = df.dropna(subset=["team_favorite_id", "spread_favorite", "score_home", "score_away", "over_under_line", "stadium", "schedule_date"])

# Create favorite/underdog scores
df["favorite_score"] = df.apply(
    lambda row: row["score_home"] if row["team_favorite_id"] == row["team_home"] else row["score_away"], axis=1
)
df["underdog_score"] = df.apply(
    lambda row: row["score_away"] if row["team_favorite_id"] == row["team_home"] else row["score_home"], axis=1
)

# Define if favorite covered the spread
df["covered"] = (df["favorite_score"] - df["underdog_score"]) > df["spread_favorite"]

# Create opponent column
df["opponent_team_id"] = df.apply(
    lambda row: row["team_home"] if row["team_favorite_id"] != row["team_home"] else row["team_away"], axis=1
)

# Build win/loss records
team_records = {}

def get_record(team):
    record = team_records.get(team, {"wins": 0, "losses": 0})
    return record["wins"], record["losses"]

wins = []
losses = []
opp_wins = []
opp_losses = []

for _, row in df.iterrows():
    fav = row["team_favorite_id"]
    opp = row["opponent_team_id"]
    
    # Pull records before game
    fw, fl = get_record(fav)
    ow, ol = get_record(opp)
    
    wins.append(fw)
    losses.append(fl)
    opp_wins.append(ow)
    opp_losses.append(ol)
    
    # Update records *after* the game
    if row["favorite_score"] > row["underdog_score"]:
        team_records[fav] = {"wins": fw + 1, "losses": fl}
        team_records[opp] = {"wins": ow, "losses": ol + 1}
    else:
        team_records[fav] = {"wins": fw, "losses": fl + 1}
        team_records[opp] = {"wins": ow + 1, "losses": ol}

# Add pregame records to DataFrame
df["team_wins"] = wins
df["team_losses"] = losses
df["opponent_wins"] = opp_wins
df["opponent_losses"] = opp_losses

# Drop rows with missing values in any feature we need
df_model = df.dropna(subset=["spread_favorite", "over_under_line", "team_wins", "team_losses", "opponent_wins", "opponent_losses", "covered"])

# Final model features and target
X = df_model[["spread_favorite", "over_under_line", "team_wins", "team_losses", "opponent_wins", "opponent_losses"]]
y = df_model["covered"].astype(int)

print("Model shape:", X.shape)
X.head()


Model shape: (11597, 6)


Unnamed: 0,spread_favorite,over_under_line,team_wins,team_losses,opponent_wins,opponent_losses
350,-13.5,43,0,0,0,0
538,-18.0,40,0,0,0,0
727,-12.0,39,0,0,0,0
916,-2.5,36,1,0,0,1
1105,-6.0,34,0,0,0,0


In [3]:
# Force numeric types on all features to prevent object dtype issues
X = X.apply(pd.to_numeric, errors='coerce')


In [4]:
X = X.dropna()
y = y.loc[X.index]  # Keep labels in sync after dropping

In [5]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pickle
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

with open("xgboost_betting_model_nfl.pkl", "wb") as f:
    pickle.dump(model, f)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.48      0.39      0.43       998
           1       0.59      0.68      0.63      1309

    accuracy                           0.55      2307
   macro avg       0.54      0.53      0.53      2307
weighted avg       0.54      0.55      0.54      2307



In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle

# Rebuild features (you already have df_model)
X = df_model[["spread_favorite", "over_under_line", "team_wins", "team_losses", "opponent_wins", "opponent_losses"]].copy()
y = df_model["covered"].astype(int)

# Convert all to numeric to avoid object dtype issues
X = X.apply(pd.to_numeric, errors='coerce')

# Add derived features
X["team_win_pct"] = X["team_wins"] / (X["team_wins"] + X["team_losses"] + 1)
X["opponent_win_pct"] = X["opponent_wins"] / (X["opponent_wins"] + X["opponent_losses"] + 1)
X["spread_strength"] = abs(X["spread_favorite"])

# Drop rows with any remaining NaNs
X = X.dropna()
y = y.loc[X.index]

# Split and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save model
with open("xgboost_betting_model_nfl.pkl", "wb") as f:
    pickle.dump(model, f)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.45      0.37      0.40       998
           1       0.58      0.66      0.62      1309

    accuracy                           0.53      2307
   macro avg       0.51      0.51      0.51      2307
weighted avg       0.52      0.53      0.52      2307



In [7]:
X["is_home_favorite"] = (df_model["team_favorite_id"] == df_model["team_home"]).astype(int)

In [9]:
# Convert to numeric, non-numeric (like "Superbowl") become NaN
df_model["schedule_week"] = pd.to_numeric(df_model["schedule_week"], errors="coerce")

# Option 1: Fill NaNs with 0 or another placeholder
df_model["schedule_week"] = df_model["schedule_week"].fillna(0).astype(int)

# OR Option 2: Drop rows without numeric weeks
# df_model = df_model.dropna(subset=["schedule_week"])
# df_model["schedule_week"] = df_model["schedule_week"].astype(int)

# Then reattach to X
X["season"] = pd.to_datetime(df_model["schedule_date"]).dt.year
X["week"] = df_model["schedule_week"]


In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle

# Start from existing feature base
X = df_model[["spread_favorite", "over_under_line", "team_wins", "team_losses", "opponent_wins", "opponent_losses"]].copy()
y = df_model["covered"].astype(int)

# Ensure numerics
X = X.apply(pd.to_numeric, errors='coerce')

# --- Derived Features ---
X["team_win_pct"] = X["team_wins"] / (X["team_wins"] + X["team_losses"] + 1)
X["opponent_win_pct"] = X["opponent_wins"] / (X["opponent_wins"] + X["opponent_losses"] + 1)
X["spread_strength"] = abs(X["spread_favorite"])

# --- Time Context Features ---
df_model["schedule_week"] = pd.to_numeric(df_model["schedule_week"], errors="coerce").fillna(0).astype(int)
X["season"] = pd.to_datetime(df_model["schedule_date"]).dt.year
X["week"] = df_model["schedule_week"]

# --- Situational Feature ---
X["is_home_favorite"] = (df_model["team_favorite_id"] == df_model["team_home"]).astype(int)

# Final cleanup
X = X.dropna()
y = y.loc[X.index]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save model
with open("xgboost_betting_model_nfl.pkl", "wb") as f:
    pickle.dump(model, f)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.47      0.39      0.43       998
           1       0.59      0.67      0.63      1309

    accuracy                           0.55      2307
   macro avg       0.53      0.53      0.53      2307
weighted avg       0.54      0.55      0.54      2307

