In [1]:
# Created by Ian Cox | 2/27/2023
# AHEAD | March Madness 2023 Bracket Challenge
# Tournament Prediction | Advanced classifiers
# Pick winners based on tournament play history,  regular season stats up to the tournament, and seed

# change working dir
import os
os.chdir('C:\\Users\\IanCox\\OneDrive - AHEAD\\Documents\\python\\march_madness\\march-machine-learning-mania-2023_final')

# load the ml ready data
import pandas as pd
data = pd.read_csv('tourney_ml_input.csv')

# import ml lib
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Split the data into training and test sets
train_data = data[data["Season"] < 2017]
test_data = data[data["Season"] >= 2017]

In [3]:
# Split the features and target variable into X and y keeping the temporal association intact
# Select the features and target variable
features = ["RS Point Total", "RS Avg Points", "OPP RS Point Total", "OPP RS Avg Points", "SeedDiff", "TeamID", "OpponentID"]
target = "outcome"

X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

In [4]:
# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'LightGBM': lgb.LGBMClassifier(),
    'Keras': Sequential([Dense(10, input_dim=X_train.shape[1], activation='relu'), Dense(1, activation='sigmoid')])
}

In [5]:
# Define hyperparameters to tune for each model
hyperparameters = {
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']},
    'Random Forest': {'n_estimators': [50, 100, 150],'max_depth': [3, 5, 7],'min_samples_split': [2, 5, 10]},
    'XGBoost': {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3], 'n_estimators': [50, 100, 200]},
    'AdaBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.3], 'algorithm': ['SAMME', 'SAMME.R']},
    'LightGBM': {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3], 'num_leaves': [31, 63, 127]},
    'Keras': {'batch_size': 64, 'epochs': 30, 'optimizer': 'adam', 'activation': 'relu'}
}

In [12]:
# modifying the function so it also returns the best scoring model
import joblib

def evaluate_models(X_train, y_train, X_test, y_test):
    # Evaluate each model and store the results in a dictionary
    results = {}
    best_model = None
    best_accuracy = 0
    for name, model in models.items():
        if name == 'Keras':
            # Compile and fit the Keras model
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            model.fit(X_train, y_train, validation_data=(X_test, y_test), verbose=0, epochs=hyperparameters[name]['epochs'], batch_size=hyperparameters[name]['batch_size'])
            # Make predictions on test data
            y_pred = (model.predict(X_test) > 0.5).astype(int)
        else:
            # Perform grid search to tune hyperparameters
            clf = GridSearchCV(model, hyperparameters[name], cv=5)
            clf.fit(X_train, y_train)
            # Make predictions on test data
            y_pred = clf.predict(X_test)

        # Calculate accuracy score
        accuracy = accuracy_score(y_test, y_pred)

        # Store the accuracy score for each model in a dictionary
        results[name] = accuracy
        
        # Update the best model if the current model has better accuracy
        if accuracy > best_accuracy:
            best_model = clf.best_estimator_ if name != 'Keras' else model
            best_accuracy = accuracy
    
    # Convert the results dictionary into a pandas dataframe
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy Score']).sort_values(by='Accuracy Score', ascending=False)
    
    # Export the best model using joblib
    joblib.dump(best_model, 'best_model.joblib')
    
    return best_model, results_df

In [13]:
best_model, results_df  = evaluate_models(X_train, y_train, X_test, y_test)

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\IanCox\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\IanCox\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\IanCox\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.69735324]




In [14]:
results_df

Unnamed: 0,Accuracy Score
XGBoost,0.692073
Logistic Regression,0.679878
LightGBM,0.679878
Random Forest,0.676829
AdaBoost,0.676829
Keras,0.634146


In [11]:
# great! the best model is now exported and we can score new data with this:

# Load the best model object
best_model = joblib.load('best_model.joblib')

# Score new data
y_pred_new = best_model.predict(X_new)

In [95]:
# lets read in a file and generate preds!
score_this = pd.read_csv('6TH_ROUND_SCORING_FORM.csv')
score_this

Unnamed: 0,TeamID,OpponentID,SeedDiff,Season,RS Point Total,RS Avg Points,OPP RS Point Total,OPP RS Avg Points
0,1345,1242,0,2023,2485.0,73.088235,2548.0,74.941176
1,1242,1345,0,2023,2548.0,74.941176,2485.0,73.088235


In [96]:
score2 = score_this.drop(['Season'], axis=1)

In [97]:
# define the desired column order
columns = ["RS Point Total", "RS Avg Points", "OPP RS Point Total", "OPP RS Avg Points", "SeedDiff", "TeamID", "OpponentID"]

# reorder the columns using reindex
score3 = score2.reindex(columns=columns)

In [98]:
# Load the best model object
best_model = joblib.load('best_model.joblib')

In [99]:
# Generate predictions on the input data using the loaded model
predictions = best_model.predict(score3)

In [100]:
# Add the predictions as a new column to the input data
score4 = score3.assign(prediction=predictions)
score4

Unnamed: 0,RS Point Total,RS Avg Points,OPP RS Point Total,OPP RS Avg Points,SeedDiff,TeamID,OpponentID,prediction
0,2485.0,73.088235,2548.0,74.941176,0,1345,1242,1
1,2548.0,74.941176,2485.0,73.088235,0,1242,1345,0


In [101]:
# Load the saved model
best_model = joblib.load('best_model.joblib')

# Make predictions and get probabilities
y_pred = best_model.predict(score3)
y_prob = best_model.predict_proba(score3)[:, 1]

score4 = score3

# Add the prediction and probability columns to the original data
score4['prediction'] = y_pred
score4['probability'] = y_prob

In [102]:
score4

Unnamed: 0,RS Point Total,RS Avg Points,OPP RS Point Total,OPP RS Avg Points,SeedDiff,TeamID,OpponentID,prediction,probability
0,2485.0,73.088235,2548.0,74.941176,0,1345,1242,1,0.533633
1,2548.0,74.941176,2485.0,73.088235,0,1242,1345,0,0.471054


In [104]:
# join the team names to the file so its easier to read
# read in teams data
teams = pd.read_csv('MTeams.csv')

# merge in the team names
name_merge_winners = pd.merge(score4, teams[['TeamID', 'TeamName']], on='TeamID')
name_merge_winners

Unnamed: 0,RS Point Total,RS Avg Points,OPP RS Point Total,OPP RS Avg Points,SeedDiff,TeamID,OpponentID,prediction,probability,TeamName
0,2485.0,73.088235,2548.0,74.941176,0,1345,1242,1,0.533633,Purdue
1,2548.0,74.941176,2485.0,73.088235,0,1242,1345,0,0.471054,Kansas


In [105]:
# drop values of teams that didn't win
winners_round1 = name_merge_winners.loc[score4["prediction"] == 1]


winners_round1.to_csv('SIXTH_round_preds.csv',index=False)

In [106]:
print(winners_round1.head(50))

   RS Point Total  RS Avg Points  OPP RS Point Total  OPP RS Avg Points  \
0          2485.0      73.088235              2548.0          74.941176   

   SeedDiff  TeamID  OpponentID  prediction  probability TeamName  
0         0    1345        1242           1     0.533633   Purdue  


## This notebook takes the seed data and regular season data and submits that layout for scoring. The scoring file returns the prediction for winners as well as the probabilities output for each outcome per matchup.