In [1]:
import pandas as pd

# Load the data
df = pd.read_csv("oddsData.csv")  # Make sure the filename matches exactly

# Preview it
df.head()

Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,spread,secondHalfTotal
0,2007-10-30,2008,Utah,@,Golden State,117,96,100.0,-120.0,212.0,1.0,105.5
1,2007-10-30,2008,LA Lakers,vs,Houston,93,95,190.0,-230.0,199.0,5.0,99.0
2,2007-10-30,2008,Houston,@,LA Lakers,95,93,-230.0,190.0,199.0,-5.0,99.0
3,2007-10-30,2008,San Antonio,vs,Portland,106,97,-1400.0,900.0,189.5,-13.0,95.0
4,2007-10-30,2008,Portland,@,San Antonio,97,106,900.0,-1400.0,189.5,13.0,95.0


In [2]:
# Calculate how much the team won or lost by
df['point_diff'] = df['score'] - df['opponentScore']

# Create a yes/no column: 1 if the team covered the spread, 0 if not
df['covered_spread'] = (df['point_diff'] > df['spread']).astype(int)

# Preview it
df[['team', 'score', 'opponent', 'opponentScore', 'spread', 'point_diff', 'covered_spread']].head()


Unnamed: 0,team,score,opponent,opponentScore,spread,point_diff,covered_spread
0,Utah,117,Golden State,96,1.0,21,1
1,LA Lakers,93,Houston,95,5.0,-2,0
2,Houston,95,LA Lakers,93,-5.0,2,1
3,San Antonio,106,Portland,97,-13.0,9,1
4,Portland,97,San Antonio,106,13.0,-9,0


In [3]:
# Choose input features and the target
features = ['spread', 'moneyLine', 'total']  # We'll expand this later
X = df[features]  # input variables
y = df['covered_spread']  # what we want to predict (yes or no)

In [4]:
from sklearn.model_selection import train_test_split

# Split into training data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create and train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions on test data
y_pred = model.predict(X_test)

# Check accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReport:\n", classification_report(y_test, y_pred))


Accuracy: 0.725508691551004

Report:
               precision    recall  f1-score   support

           0       0.72      0.74      0.73      3708
           1       0.73      0.72      0.72      3713

    accuracy                           0.73      7421
   macro avg       0.73      0.73      0.73      7421
weighted avg       0.73      0.73      0.73      7421



In [6]:
import joblib

# Save the model to a file
joblib.dump(model, 'betting_model.pkl')


['betting_model.pkl']

In [7]:
df['home_game'] = (df['home/visitor'] == 'vs').astype(int)

In [8]:
df['spread_abs'] = df['spread'].abs()

In [9]:
df['moneyline_diff'] = df['moneyLine'] - df['opponentMoneyLine']

In [10]:
!pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
from xgboost import XGBClassifier

model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    learning_rate=0.1,
    max_depth=5,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReport:\n", classification_report(y_test, y_pred))

Accuracy: 0.779814041234335

Report:
               precision    recall  f1-score   support

           0       0.78      0.78      0.78      3708
           1       0.78      0.78      0.78      3713

    accuracy                           0.78      7421
   macro avg       0.78      0.78      0.78      7421
weighted avg       0.78      0.78      0.78      7421



In [14]:
import joblib
joblib.dump(model, 'xgboost_betting_model.pkl')

['xgboost_betting_model.pkl']

In [15]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

rf = RandomForestClassifier()
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [16]:
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb)],
    voting='soft'  # uses predicted probabilities instead of hard labels
)

ensemble.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [17]:
y_pred = ensemble.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReport:\n", classification_report(y_test, y_pred))


Accuracy: 0.7560975609756098

Report:
               precision    recall  f1-score   support

           0       0.76      0.76      0.76      3708
           1       0.76      0.75      0.76      3713

    accuracy                           0.76      7421
   macro avg       0.76      0.76      0.76      7421
weighted avg       0.76      0.76      0.76      7421

