In [1]:
# Created by Ian Cox | 2/27/2023
# AHEAD | March Madness 2023 Bracket Challenge
# Tournament Prediction | Advanced classifiers
# Pick winners based on tournament play history,  regular season stats up to the tournament, and seed

# change working dir
import os
os.chdir('C:\\Users\\IanCox\\OneDrive - AHEAD\\Documents\\python\\march_madness\\march-machine-learning-mania-2023')

# load the ml ready data
import pandas as pd
data = pd.read_csv('tourney_ml_input.csv')

# import ml lib
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Split the data into training and test sets
train_data = data[data["Season"] < 2017]
test_data = data[data["Season"] >= 2017]

In [3]:
# Split the features and target variable into X and y keeping the temporal association intact
# Select the features and target variable
features = ["RS Point Total", "RS Avg Points", "OPP RS Point Total", "OPP RS Avg Points", "SeedDiff", "TeamID", "OpponentID"]
target = "outcome"

X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

In [4]:
# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'LightGBM': lgb.LGBMClassifier(),
    'Keras': Sequential([Dense(10, input_dim=X_train.shape[1], activation='relu'), Dense(1, activation='sigmoid')])
}

In [5]:
# Define hyperparameters to tune for each model
hyperparameters = {
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']},
    'Random Forest': {'n_estimators': [50, 100, 150],'max_depth': [3, 5, 7],'min_samples_split': [2, 5, 10]},
    'XGBoost': {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3], 'n_estimators': [50, 100, 200]},
    'AdaBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.3], 'algorithm': ['SAMME', 'SAMME.R']},
    'LightGBM': {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3], 'num_leaves': [31, 63, 127]},
    'Keras': {'batch_size': 64, 'epochs': 30, 'optimizer': 'adam', 'activation': 'relu'}
}

In [6]:
def evaluate_models(X_train, y_train, X_test, y_test):
    # Evaluate each model and store the results in a dictionary
    results = {}
    for name, model in models.items():
        if name == 'Keras':
            # Compile and fit the Keras model
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            model.fit(X_train, y_train, validation_data=(X_test, y_test), verbose=0, epochs=hyperparameters[name]['epochs'], batch_size=hyperparameters[name]['batch_size'])
            # Make predictions on test data
            y_pred = (model.predict(X_test) > 0.5).astype(int)
        else:
            # Perform grid search to tune hyperparameters
            clf = GridSearchCV(model, hyperparameters[name], cv=5)
            clf.fit(X_train, y_train)
            # Make predictions on test data
            y_pred = clf.predict(X_test)

        # Calculate accuracy score
        accuracy = accuracy_score(y_test, y_pred)

        # Store the accuracy score for each model in a dictionary
        results[name] = accuracy
    
    # Convert the results dictionary into a pandas dataframe
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy Score']).sort_values(by='Accuracy Score', ascending=False)
    
    return results_df

In [7]:
evaluate_models(X_train, y_train, X_test, y_test)

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\IanCox\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\IanCox\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\IanCox\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.69735324]




Unnamed: 0,Accuracy Score
Random Forest,0.692073
XGBoost,0.692073
Logistic Regression,0.679878
LightGBM,0.679878
AdaBoost,0.676829
Keras,0.628049
