In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('oddsData.csv')

# Create target: 1 if team beat spread, 0 otherwise
df['point_diff'] = df['score'] - df['opponentScore']
df['covered_spread'] = (df['point_diff'] > df['spread']).astype(int)

# Convert date to datetime format (replace 'date' if your column name is different)
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')  # sort to ensure win/loss history is chronological

df.head()


Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,spread,secondHalfTotal,point_diff,covered_spread
0,2007-10-30,2008,Utah,@,Golden State,117,96,100.0,-120.0,212.0,1.0,105.5,21,1
1,2007-10-30,2008,LA Lakers,vs,Houston,93,95,190.0,-230.0,199.0,5.0,99.0,-2,0
2,2007-10-30,2008,Houston,@,LA Lakers,95,93,-230.0,190.0,199.0,-5.0,99.0,2,1
3,2007-10-30,2008,San Antonio,vs,Portland,106,97,-1400.0,900.0,189.5,-13.0,95.0,9,1
4,2007-10-30,2008,Portland,@,San Antonio,97,106,900.0,-1400.0,189.5,13.0,95.0,-9,0


In [2]:
# Track team wins and losses up to each game
win_counts = {}
loss_counts = {}
team_wins = []
team_losses = []

for _, row in df.iterrows():
    team = row['team']
    team_score = row['score']
    opp_score = row['opponentScore']

    # Get current record
    wins = win_counts.get(team, 0)
    losses = loss_counts.get(team, 0)

    # Store record before this game
    team_wins.append(wins)
    team_losses.append(losses)

    # Update record after the game
    if team_score > opp_score:
        win_counts[team] = wins + 1
    else:
        loss_counts[team] = losses + 1

# Add to dataframe
df['team_wins'] = team_wins
df['team_losses'] = team_losses


In [3]:
# Track opponent wins and losses up to each game
opp_win_counts = {}
opp_loss_counts = {}
opp_wins = []
opp_losses = []

for _, row in df.iterrows():
    opp = row['opponent']
    team_score = row['score']
    opp_score = row['opponentScore']

    # Get opponent's current record
    wins = opp_win_counts.get(opp, 0)
    losses = opp_loss_counts.get(opp, 0)

    # Store record before this game
    opp_wins.append(wins)
    opp_losses.append(losses)

    # Update opponent's record after this game
    if opp_score > team_score:
        opp_win_counts[opp] = wins + 1
    else:
        opp_loss_counts[opp] = losses + 1

# Add to dataframe
df['opponent_wins'] = opp_wins
df['opponent_losses'] = opp_losses


In [4]:
# Select the features for training
features = [
    'spread',
    'moneyLine',
    'total',
    'team_wins',
    'team_losses',
    'opponent_wins',
    'opponent_losses'
]

# Set up input features (X) and target (y)
X = df[features]
y = df['covered_spread']


In [5]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train XGBoost model
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    learning_rate=0.1,
    max_depth=5,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8
)

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReport:\n", classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.774828190270853

Report:
               precision    recall  f1-score   support

           0       0.78      0.78      0.78      3801
           1       0.77      0.77      0.77      3620

    accuracy                           0.77      7421
   macro avg       0.77      0.77      0.77      7421
weighted avg       0.77      0.77      0.77      7421



In [6]:
import joblib
joblib.dump(model, 'xgboost_betting_model.pkl')

['xgboost_betting_model.pkl']