In [19]:
# Import/install packages
import numpy as np
import pandas as pd

In [20]:
# Authenticate Google Colab and access Google Sheets
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default

# Authorize access to Google Sheets
creds, _ = default()
gc = gspread.authorize(creds)

# Open the Google Sheet containing betting model data
worksheet = gc.open('Betting Model Data').sheet1

# Load data from Google Sheets into a pandas DataFrame
rows = worksheet.get_all_values()  # Get all rows from the sheet
BetData = pd.DataFrame.from_records(rows)  # Convert rows to a DataFrame

In [21]:
# Rename columns for better readability
BetData.columns = [
    'Tm-A', 'Wk', 'MLodds-A', 'MLodds-H', 'SPRodds-A', 'SPRodds-H',
    'SPRline', 'Uodds', 'Oodds', 'OUline', '%betsML-A', '%betsSPR-A',
    '%betsOVER', 'Score-A', 'Score-H'
]

# Drop the first row (likely header data) and convert remaining columns to numeric
BetData = BetData.drop([0])
BetData = BetData.iloc[:, 1:].apply(pd.to_numeric)

# Basic data evaluation
print(BetData.isnull().sum())  # Check for missing values in the dataset
print(BetData.describe(percentiles=None).T.iloc[:, [0, 1, 2, 3, 7]])  # Display summary statistics

Wk            0
MLodds-A      0
MLodds-H      0
SPRodds-A     0
SPRodds-H     0
SPRline       0
Uodds         0
Oodds         0
OUline        0
%betsML-A     0
%betsSPR-A    0
%betsOVER     0
Score-A       0
Score-H       0
dtype: int64
            count        mean         std     min     max
Wk           74.0    4.972973    1.433197    3.00    7.00
MLodds-A     74.0   31.378378  190.623038 -425.00  390.00
MLodds-H     74.0  -79.216216  197.194172 -520.00  330.00
SPRodds-A    74.0 -104.621622   35.589633 -122.00  110.00
SPRodds-H    74.0 -104.162162   34.637468 -122.00  102.00
SPRline      74.0    3.689189    2.224864    0.50    9.50
Uodds        74.0 -109.851351    2.513974 -120.00 -100.00
Oodds        74.0 -109.986486    2.242145 -115.00 -100.00
OUline       74.0   42.945946    5.608560   15.50   51.50
%betsML-A    74.0    0.500811    0.268730    0.03    0.94
%betsSPR-A   74.0    0.541351    0.192598    0.13    0.87
%betsOVER    74.0    0.572432    0.135715    0.22    0.88
Score-A  

In [22]:
# Add new derived features for analysis
# 'Win-H' indicates if the home team won (1) or lost (0)
BetData['Win-H'] = (BetData['Score-A'] < BetData['Score-H']).astype(int)

# 'OverHit' checks if the combined score exceeded the over/under line
BetData['OverHit'] = ((BetData['Score-A'] + BetData['Score-H']) > BetData['OUline']).astype(int)

# Add betting percentage metrics for home teams
BetData['Avg%Bet - H'] = 1 - (round(((BetData['%betsML-A'] + BetData['%betsSPR-A']) / 2), 2))
BetData['%betsML-H'] = 1 - BetData['%betsML-A']
BetData['%betsSPR-H'] = 1 - BetData['%betsSPR-A']

# Add binary flags for high and medium betting percentages
BetData['HighBet-ML-H'] = (BetData['%betsML-H'] > .67).astype(int)
BetData['HighBet-SPR-H'] = (BetData['%betsSPR-H'] > .67).astype(int)
BetData['MidBet-ML-H'] = ((BetData['%betsML-H'] > .33) & (BetData['%betsML-H'] < .67)).astype(int)
BetData['MidBet-SPR-H'] = ((BetData['%betsSPR-H'] > .33) & (BetData['%betsSPR-H'] < .67)).astype(int)



In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Split the dataset into features and target variable
X = BetData[['MLodds-A', 'MLodds-H', 'SPRodds-A', 'SPRodds-H', 'SPRline', 'OUline']]
y = BetData['Win-H']  # Target variable: whether the home team won

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up a Random Forest model with hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the GridSearchCV object
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Train the model using GridSearchCV
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and the best model
best_rf_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_rf_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Random Forest Model Accuracy: {accuracy:.2f}")

# Output the best parameters
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Optimized Random Forest Model Accuracy: 0.67
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Define features and target variable
X = BetData[['MLodds-A', 'MLodds-H', 'SPRodds-A', 'SPRodds-H', 'SPRline', 'OUline']]
y = BetData['Win-H']

# Feature engineering (adding interaction terms)
X['SPRline_OUline'] = X['SPRline'] * X['OUline']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_model = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter settings sampled
    cv=3,  # 3-fold cross-validation
    scoring='f1',  # Optimize for F1-score
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Train the optimized model
random_search.fit(X_train, y_train)

# Best model and parameters
best_rf_model = random_search.best_estimator_

# Predictions
y_pred = best_rf_model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Improved Random Forest Model Accuracy: {accuracy:.2f}")
print(f"Improved Random Forest Model F1-Score: {f1:.2f}")
print(f"Best Parameters: {random_search.best_params_}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['SPRline_OUline'] = X['SPRline'] * X['OUline']


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Improved Random Forest Model Accuracy: 0.67
Improved Random Forest Model F1-Score: 0.67
Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': False}
