# modeling.ipynb

This notebook will handle modeling and data preprocessing for our problem.

Evaluation Metrics:
- Brier Score (this is the main one)
- Accuracy

In [140]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import brier_score_loss, accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import pickle

%matplotlib inline

In [2]:
# Load the training set; the first CSV column becomes the DataFrame index.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where
# this exact directory exists — consider a configurable data directory.
training_data = pd.read_csv(
    filepath_or_buffer='/Users/jinalshah/Jinal/Projects/march-madness-mania/preprocessed-data/modeling-data/training.csv',
    index_col=0,
)

# Preview the first rows to confirm the file was parsed as expected
training_data.head()

Unnamed: 0,Season,lower_TeamID,lower_Wins,lower_Losses,lower_Winning Percentage,lower_Score_mean,lower_FGM_mean,lower_FGA_mean,lower_FGM3_mean,lower_FGA3_mean,...,higher_AssistToTurnoverRatio_std,higher_Possessions_std,higher_OffEff_std,higher_DefEff_std,higher_TO%_std,higher_PointDiff_std,higher_OffensiveRating_std,higher_DefensiveRating_std,Bracket,LowerWin?
1057,2019,1242,25,9,0.735294,75.382353,27.294118,59.058824,7.235294,20.647059,...,0.881175,7.48574,25.135399,45.694237,0.13955,12.276641,62672.908339,241.659481,M,1
389,2009,1143,22,10,0.6875,75.03125,27.09375,55.90625,6.34375,14.625,...,0.483462,5.707081,24.857249,37.922101,0.101832,17.022045,56403.384404,586.620957,M,0
462,2010,1352,23,11,0.676471,68.5,23.323529,53.323529,5.647059,15.470588,...,0.435349,5.966921,31.325718,35.567874,0.129745,14.418477,66317.665445,600.246113,M,0
575,2011,1242,32,2,0.941176,82.382353,29.588235,57.617647,7.264706,18.764706,...,0.540804,7.140057,34.3355,38.461804,0.152372,10.118593,49478.637137,459.165885,M,0
559,2011,1228,19,13,0.59375,71.28125,26.34375,56.34375,6.84375,17.6875,...,0.637593,7.101537,35.161898,40.164503,0.164143,15.802057,79118.599885,797.290386,M,0


In [3]:
# Count the missing values in every column (column name -> number of NaNs)
missing_vals = dict(training_data.isna().sum())

# Report only the columns that contain missing values; prints nothing when the data is complete
for col, n_missing in missing_vals.items():
    if n_missing > 0:
        print(f'Column {col} has {n_missing} missing values')

## Data Preprocessing

We need to preprocess the data a little bit so let's do that!

Preprocessing that needs to be done:
- Dropping the identifiers (lower_TeamID,higher_TeamID)
- Converting Season into a number for how many seasons back the data is from (-1 = last season, -2 = 2 seasons ago, etc)
- Converting Bracket into dummy variables
- Standardizing all numerical values with a z-score (zero mean, unit variance) so that every feature is on the same scale. Note that this rescales the features but does not make them normally distributed.

In [4]:
# Work on an independent (deep) copy so the raw training_data frame stays untouched
training_data_preprocessed = training_data.copy(deep=True)

In [5]:
# Remove the two team identifier columns in place.
# NOTE: this cell is not re-runnable — a second run raises KeyError because the columns are gone.
training_data_preprocessed.drop(columns=['lower_TeamID', 'higher_TeamID'], inplace=True)

In [6]:
# Replace the Season column with its offset from 2023 (e.g. 2019 -> -4.0).
# pop() removes 'Season' from the frame and returns it, so only 'Season_converted'
# (appended as the last column) remains afterwards.
training_data_preprocessed['Season_converted'] = training_data_preprocessed.pop('Season') - 2023.0

In [7]:
# Separate the label column ('LowerWin?') from the predictor columns
target = training_data_preprocessed['LowerWin?']
features = training_data_preprocessed.drop(columns=['LowerWin?'])

In [8]:
# 'Bracket' is the only categorical feature; every other feature column is treated as numerical
categorical = ['Bracket']
numerical = [column for column in features.columns if column != 'Bracket']

In [9]:
# Column-wise preprocessing: z-score the numerical columns and one-hot encode the categorical one.
# Every feature column is listed in `numerical` or `categorical`, so remainder='passthrough'
# has nothing left to pass through here.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(with_mean=True, with_std=True), numerical),
        ('encoder', OneHotEncoder(), categorical),
    ],
    remainder='passthrough',
    n_jobs=-1,
    verbose=True,
)

In [10]:
# Fit the preprocessing pipeline on the full feature matrix and transform it in one step.
# NOTE(review): the scaler statistics are learned from every training row here, and the
# cross_validate calls further down receive this already-transformed matrix, so each
# validation fold's rows have contributed to the scaling — consider fitting the
# preprocessing inside each CV fold instead.
features_preprocessed = preprocessing_pipeline.fit_transform(features)

# Display the transformed array as a sanity check (scaled numerical columns followed by
# the one-hot encoded Bracket columns)
features_preprocessed

[ColumnTransformer] ....... (2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer] ........ (1 of 2) Processing scaler, total=   0.0s


array([[ 0.23655977,  0.5006731 , -0.28877454, ...,  1.03353159,
         1.        ,  0.        ],
       [-0.49451463,  0.78784489, -0.72021046, ..., -0.86041151,
         1.        ,  0.        ],
       [-0.25082316,  1.07501668, -0.8197726 , ..., -0.6710172 ,
         1.        ,  0.        ],
       ...,
       [ 0.72394271, -0.93518583,  0.93594679, ..., -0.6710172 ,
         0.        ,  1.        ],
       [ 0.96763417, -0.93518583,  0.97234585, ...,  1.03353159,
         0.        ,  1.        ],
       [ 0.96763417, -0.93518583,  0.97234585, ..., -0.48162289,
         0.        ,  1.        ]])

In [11]:
# Data is preprocessed!

## Modeling

Now that we have preprocessed the data for our models, it is time to build our models!

In [122]:
# Results table under construction: one independent, initially empty list per column.
# Each model appends exactly one value to every list.
metrics = {
    column: []
    for column in (
        'Model',
        'Brier Score Training',
        'Mean Brier Score CV',
        'Accuracy Training',
        'Mean Accuracy CV',
    )
}

### Dummy Classifier

This is our baseline. This classifier ignores the features and picks a class (0 or 1) uniformly at random, so it assigns a probability of 0.5 to each class.
Ideally, our models should be much better than this classifier, which is why we use it as the point of comparison.

In [123]:
# Baseline model: uniform random guessing with a fixed seed so the results are repeatable
dummy_classifier = DummyClassifier(
    strategy='uniform',
    random_state=42,
)
dummy_classifier.fit(X=features_preprocessed, y=target)  # fit on the full preprocessed training set

In [124]:
# Predictions on the same data the model was fitted on (training-set performance)
train_pred = dummy_classifier.predict(features_preprocessed)
train_probs = dummy_classifier.predict_proba(features_preprocessed)[:, 1]  # probability of class 1

# Training Brier score (uses probabilities) and accuracy (uses hard labels)
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record this model's row; the lists are mutated in place.
# NOTE: re-running this cell appends a duplicate row.
metrics['Model'].append('Dummy Classifier')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [125]:
# 5-fold cross-validation, scoring both accuracy and the (negated) Brier score
results = cross_validate(
    estimator=dummy_classifier,
    X=features_preprocessed,
    y=target,
    scoring=['accuracy', 'neg_brier_score'],
    cv=5,
    n_jobs=-1,
)

# 'neg_brier_score' is the negated Brier score, so flip the sign to report the actual mean
metrics['Mean Brier Score CV'].append(-results['test_neg_brier_score'].mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [126]:
# Persist the fitted baseline model to disk with pickle.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises, so the handle is never leaked.
with open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/dummy_classifier.sav', 'wb') as model_file:
    pickle.dump(dummy_classifier, model_file)

### Logistic Regression

I am going to try a logistic regression model. I don't expect this model to do too well, since I don't expect linear classification to be the answer to this problem. However, I do want to see how a non-regularized logistic regression model does. The expectation is that this model performs better than the dummy classifier by a significant margin.

In [127]:
# Logistic regression with no penalty term (penalty=None)
logistic_reg = LogisticRegression(
    penalty=None,
    C=1.0,
    random_state=42,
    max_iter=1000,
    n_jobs=-1,
)
logistic_reg.fit(X=features_preprocessed, y=target)

In [128]:
# Predictions on the same data the model was fitted on (training-set performance)
train_pred = logistic_reg.predict(features_preprocessed)
train_probs = logistic_reg.predict_proba(features_preprocessed)[:, 1]  # probability of class 1

# Training Brier score (uses probabilities) and accuracy (uses hard labels)
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record this model's row; the lists are mutated in place.
# NOTE: re-running this cell appends a duplicate row.
metrics['Model'].append('Logistic Regression - No Regularization')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [129]:
# 5-fold cross-validation, scoring both accuracy and the (negated) Brier score
results = cross_validate(
    estimator=logistic_reg,
    X=features_preprocessed,
    y=target,
    scoring=['accuracy', 'neg_brier_score'],
    cv=5,
    n_jobs=-1,
)

# 'neg_brier_score' is the negated Brier score, so flip the sign to report the actual mean
metrics['Mean Brier Score CV'].append(-results['test_neg_brier_score'].mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [130]:
# Persist the fitted unregularized logistic regression model to disk with pickle.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises, so the handle is never leaked.
with open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/logistic_reg_unregularized.sav', 'wb') as model_file:
    pickle.dump(logistic_reg, model_file)

### Decision Tree

Logistic Regression (Un-Regularized) did better than the dummy classifier. However, it did overfit a lot (which was expected) causing the CV Brier to not improve as much. Decision Trees are a much better model! I do expect overfitting; however, it will still lower the Mean Brier Score CV by a lot, I hope.

In [131]:
# Single decision tree using the Gini criterion; no depth or leaf-size limits are set
decision_tree = DecisionTreeClassifier(
    criterion='gini',
    random_state=42,
)
decision_tree.fit(X=features_preprocessed, y=target)

In [132]:
# Predictions on the same data the model was fitted on (training-set performance)
train_pred = decision_tree.predict(features_preprocessed)
train_probs = decision_tree.predict_proba(features_preprocessed)[:, 1]  # probability of class 1

# Training Brier score (uses probabilities) and accuracy (uses hard labels)
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record this model's row; the lists are mutated in place.
# NOTE: re-running this cell appends a duplicate row.
metrics['Model'].append('Decision Tree - UnRegularized')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [133]:
# 5-fold cross-validation, scoring both accuracy and the (negated) Brier score
results = cross_validate(
    estimator=decision_tree,
    X=features_preprocessed,
    y=target,
    scoring=['accuracy', 'neg_brier_score'],
    cv=5,
    n_jobs=-1,
)

# 'neg_brier_score' is the negated Brier score, so flip the sign to report the actual mean
metrics['Mean Brier Score CV'].append(-results['test_neg_brier_score'].mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [134]:
# Persist the fitted decision tree to disk with pickle.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises, so the handle is never leaked.
with open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/decision_tree_unregularized.sav', 'wb') as model_file:
    pickle.dump(decision_tree, model_file)

### Random Forest

Very interesting! The Decision Tree overfit so badly that it performed worse than, or only marginally better than, the dummy classifier. It did excellently on the training set but very poorly on the validation folds.

Let's see how Random Forests do; however, I expect them to do just as badly since it's just a bagging of decision trees.

In [135]:
# Random forest of 1000 Gini trees; each tree is grown on a bootstrap sample
# drawn with max_samples=0.7 of the training rows
random_forest = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
    max_samples=0.7,
)
random_forest.fit(X=features_preprocessed, y=target)

In [136]:
# Predictions on the same data the model was fitted on (training-set performance)
train_pred = random_forest.predict(features_preprocessed)
train_probs = random_forest.predict_proba(features_preprocessed)[:, 1]  # probability of class 1

# Training Brier score (uses probabilities) and accuracy (uses hard labels)
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record this model's row; the lists are mutated in place.
# NOTE: re-running this cell appends a duplicate row.
metrics['Model'].append('Random Forest - Basic')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [137]:
# 5-fold cross-validation, scoring both accuracy and the (negated) Brier score
results = cross_validate(
    estimator=random_forest,
    X=features_preprocessed,
    y=target,
    scoring=['accuracy', 'neg_brier_score'],
    cv=5,
    n_jobs=-1,
)

# 'neg_brier_score' is the negated Brier score, so flip the sign to report the actual mean
metrics['Mean Brier Score CV'].append(-results['test_neg_brier_score'].mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [138]:
# Persist the fitted random forest to disk with pickle.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises, so the handle is never leaked.
with open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/random_forest_basic.sav', 'wb') as model_file:
    pickle.dump(random_forest, model_file)

### AdaBoost

Interesting, Random Forest performed a lot better. Granted, I did add some regularizers such as bootstrapping and limiting the fraction of the training set each tree sees (`max_samples=0.7`). Here, I am going to try AdaBoost. I am going to use AdaBoost with both Logistic Regression and Decision Trees as base estimators because I am curious how they will compare.

#### AdaBoost - Logistic Regression

In [142]:
# AdaBoost ensemble whose base estimator is an unpenalised logistic regression
logistic_reg_ada = LogisticRegression(
    penalty=None,
    C=1.0,
    random_state=42,
    max_iter=1000,
    n_jobs=-1,
)
adaboost_logistic = AdaBoostClassifier(
    estimator=logistic_reg_ada,
    n_estimators=500,
    learning_rate=1.5,
    random_state=42,
)
adaboost_logistic.fit(X=features_preprocessed, y=target)

In [143]:
# Predictions on the same data the model was fitted on (training-set performance)
train_pred = adaboost_logistic.predict(features_preprocessed)
train_probs = adaboost_logistic.predict_proba(features_preprocessed)[:, 1]  # probability of class 1

# Training Brier score (uses probabilities) and accuracy (uses hard labels)
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record this model's row; the lists are mutated in place.
# NOTE: re-running this cell appends a duplicate row.
metrics['Model'].append('Adaboost - Logistic Regression')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [144]:
# 5-fold cross-validation, scoring both accuracy and the (negated) Brier score
results = cross_validate(
    estimator=adaboost_logistic,
    X=features_preprocessed,
    y=target,
    scoring=['accuracy', 'neg_brier_score'],
    cv=5,
    n_jobs=-1,
)

# 'neg_brier_score' is the negated Brier score, so flip the sign to report the actual mean
metrics['Mean Brier Score CV'].append(-results['test_neg_brier_score'].mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [149]:
# Persist the fitted AdaBoost (logistic regression base) model to disk with pickle.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises, so the handle is never leaked.
with open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/adaboost_logistic.sav', 'wb') as model_file:
    pickle.dump(adaboost_logistic, model_file)

#### AdaBoost - Decision Trees

In [147]:
# AdaBoost ensemble whose base estimator is a decision tree with no depth limit.
# NOTE(review): the standalone decision tree already reaches 100% training accuracy in the
# results table; presumably boosting gains little from such a strong base learner —
# confirm, and consider a shallow tree (e.g. max_depth=1) as the base estimator.
decision_tree_ada = DecisionTreeClassifier(
    criterion='gini',
    random_state=42,
)
adaboost_decision_tree = AdaBoostClassifier(
    estimator=decision_tree_ada,
    n_estimators=500,
    learning_rate=1.5,
    random_state=42,
)
adaboost_decision_tree.fit(X=features_preprocessed, y=target)

In [148]:
# Predictions on the same data the model was fitted on (training-set performance)
train_pred = adaboost_decision_tree.predict(features_preprocessed)
train_probs = adaboost_decision_tree.predict_proba(features_preprocessed)[:, 1]  # probability of class 1

# Training Brier score (uses probabilities) and accuracy (uses hard labels)
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record this model's row; the lists are mutated in place.
# NOTE: re-running this cell appends a duplicate row.
metrics['Model'].append('Adaboost - Decision Tree')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [150]:
# 5-fold cross-validation, scoring both accuracy and the (negated) Brier score
results = cross_validate(
    estimator=adaboost_decision_tree,
    X=features_preprocessed,
    y=target,
    scoring=['accuracy', 'neg_brier_score'],
    cv=5,
    n_jobs=-1,
)

# 'neg_brier_score' is the negated Brier score, so flip the sign to report the actual mean
metrics['Mean Brier Score CV'].append(-results['test_neg_brier_score'].mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [151]:
# Persist the fitted AdaBoost (decision tree base) model to disk with pickle.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises, so the handle is never leaked.
with open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/adaboost_dtree.sav', 'wb') as model_file:
    pickle.dump(adaboost_decision_tree, model_file)

In [153]:
# Show floats with five fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.5f}'.format)

# Build the comparison table: one row per model, one column per metric.
# All lists in `metrics` must have the same length for this to succeed.
metrics_df = pd.DataFrame(data=metrics)
metrics_df

Unnamed: 0,Model,Brier Score Training,Mean Brier Score CV,Accuracy Training,Mean Accuracy CV
0,Dummy Classifier,0.25,0.25,0.51681,0.50124
1,Logistic Regression - No Regularization,0.16506,0.22291,0.76526,0.66005
2,Decision Tree - UnRegularized,0.0,0.41093,1.0,0.58907
3,Random Forest - Basic,0.05107,0.20731,1.0,0.6731
4,Adaboost - Logistic Regression,0.24962,0.24968,0.74533,0.66504
5,Adaboost - Decision Tree,0.0,0.42714,1.0,0.57286
