# modeling.ipynb

This notebook will handle modeling and data preprocessing for our problem.

Evaluation Metrics:
- Brier Score (this is the main one)
- Accuracy

In [96]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import brier_score_loss, accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from catboost import CatBoostClassifier, Pool, cv
import tensorflow as tf
from tensorflow import keras
from keras import layers
from scikeras.wrappers import KerasClassifier
import pickle

%matplotlib inline

In [2]:
# Getting the training data
training_data = pd.read_csv('/Users/jinalshah/Jinal/Projects/march-madness-mania/preprocessed-data/modeling-data/training.csv',index_col=0)

# Making sure the data loaded correctly
training_data.head()

Unnamed: 0,Season,lower_TeamID,lower_Wins,lower_Losses,lower_Winning Percentage,lower_Score_mean,lower_FGM_mean,lower_FGA_mean,lower_FGM3_mean,lower_FGA3_mean,...,higher_AssistToTurnoverRatio_std,higher_Possessions_std,higher_OffEff_std,higher_DefEff_std,higher_TO%_std,higher_PointDiff_std,higher_OffensiveRating_std,higher_DefensiveRating_std,Bracket,LowerWin?
1057,2019,1242,25,9,0.735294,75.382353,27.294118,59.058824,7.235294,20.647059,...,0.881175,7.48574,25.135399,45.694237,0.13955,12.276641,62672.908339,241.659481,M,1
389,2009,1143,22,10,0.6875,75.03125,27.09375,55.90625,6.34375,14.625,...,0.483462,5.707081,24.857249,37.922101,0.101832,17.022045,56403.384404,586.620957,M,0
462,2010,1352,23,11,0.676471,68.5,23.323529,53.323529,5.647059,15.470588,...,0.435349,5.966921,31.325718,35.567874,0.129745,14.418477,66317.665445,600.246113,M,0
575,2011,1242,32,2,0.941176,82.382353,29.588235,57.617647,7.264706,18.764706,...,0.540804,7.140057,34.3355,38.461804,0.152372,10.118593,49478.637137,459.165885,M,0
559,2011,1228,19,13,0.59375,71.28125,26.34375,56.34375,6.84375,17.6875,...,0.637593,7.101537,35.161898,40.164503,0.164143,15.802057,79118.599885,797.290386,M,0


In [3]:
# Checking to make sure there aren't any missing values
missing_vals = dict(training_data.isna().sum())

# Iterating through the dictionary
for col in missing_vals.keys():
    if missing_vals[col] > 0:
        print(f'Column {col} has {missing_vals[col]} missing values')

## Data Preprocessing

We need to preprocess the data a little bit so let's do that!

Preprocessing that needs to be done:
- Dropping the identifiers (lower_TeamID,higher_TeamID)
- Converting Season into a number for how many seasons back the data is from (-1 = last season, -2 = 2 seasons ago, etc)
- Converting Bracket into dummy variables
- Scaling all numerical values by z-score to gain a normal distribution and all numbers on the same scale.

In [4]:
# Making a copy of the training set 
training_data_preprocessed = training_data.copy()

In [5]:
# Dropping the unnecessary features
training_data_preprocessed.drop(['lower_TeamID','higher_TeamID'],axis=1,inplace=True)

In [6]:
# Converting Season into numbers
training_data_preprocessed['Season_converted'] = training_data_preprocessed['Season'] - 2023.0
training_data_preprocessed.drop(['Season'],axis=1,inplace=True)

In [7]:
# Splitting data into features matrix and target
features = training_data_preprocessed.drop(['LowerWin?'],axis=1)
target = training_data_preprocessed['LowerWin?']

In [8]:
# Splitting data into numerical and categorical
categorical = ['Bracket']
numerical = list(features.columns)
numerical.remove('Bracket')

In [9]:
# Building a pipeline to perform all the appropriate transformations
preprocessing_pipeline = ColumnTransformer(transformers=[
    ('scaler',StandardScaler(with_mean=True,with_std=True),numerical),
    ('encoder',OneHotEncoder(),categorical)
],remainder='passthrough',n_jobs=-1,verbose=True)

In [10]:
# Transforming the feature matrix via the pipeline
features_preprocessed = preprocessing_pipeline.fit_transform(features)

# Making sure fitting happened properly
features_preprocessed

[ColumnTransformer] ....... (2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer] ........ (1 of 2) Processing scaler, total=   0.0s


array([[ 0.23655977,  0.5006731 , -0.28877454, ...,  1.03353159,
         1.        ,  0.        ],
       [-0.49451463,  0.78784489, -0.72021046, ..., -0.86041151,
         1.        ,  0.        ],
       [-0.25082316,  1.07501668, -0.8197726 , ..., -0.6710172 ,
         1.        ,  0.        ],
       ...,
       [ 0.72394271, -0.93518583,  0.93594679, ..., -0.6710172 ,
         0.        ,  1.        ],
       [ 0.96763417, -0.93518583,  0.97234585, ...,  1.03353159,
         0.        ,  1.        ],
       [ 0.96763417, -0.93518583,  0.97234585, ..., -0.48162289,
         0.        ,  1.        ]])

In [11]:
# Data is preprocessed!

## Modeling

Now that we have preprocessed the data for our models, it is time to build our models!

In [12]:
# Creating a dictionary that holds various metrics
metrics = {'Model':[],'Brier Score Training':[],'Mean Brier Score CV':[], 'Accuracy Training':[],'Mean Accuracy CV':[]}

### Dummy Classifier

This is our baseline. This classifier will simply randomly choose a class based on a uniform distribution of the class (0 or 1).
We want our models to ideally be much better than this classifier, hence this is what we compare to.

In [13]:
# Building the dummy classifier
dummy_classifier = DummyClassifier(strategy='uniform',random_state=42)
dummy_classifier.fit(features_preprocessed,target) # Fitting the model

In [14]:
# Getting training scores
train_pred = dummy_classifier.predict(features_preprocessed)
train_probs = dummy_classifier.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Dummy Classifier')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [15]:
# Getting the results and adding it to the dictionary
results = cross_validate(dummy_classifier,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [16]:
# Saving model in a pickle file
pickle.dump(dummy_classifier,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/dummy_classifier.sav','wb'))

### Logisitic Regression

I am going to try a logisitic regression model. I don't expect this model to do too well since the I don't expect linear classification to be the answer to this problem. However, I do want to see how a non-regularized logistic regression model does. Expectation is that this model performs better than the dummy classifier by a significant margin. 

In [17]:
# Building the model
logistic_reg = LogisticRegression(penalty=None,C=1.0,random_state=42,max_iter=1000,n_jobs=-1)
logistic_reg.fit(features_preprocessed,target)

In [18]:
# Getting training scores
train_pred = logistic_reg.predict(features_preprocessed)
train_probs = logistic_reg.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Logistic Regression - No Regularization')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [19]:
# Getting the results and adding it to the dictionary
results = cross_validate(logistic_reg,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [20]:
# Saving model in a pickle file
pickle.dump(logistic_reg,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/logistic_reg_unregularized.sav','wb'))

### Decision Tree

Logistic Regression (Un-Regularized) did better than the dummy classifier. However, it did overfit a lot (which was expected) causing the CV Brier to not improve as much. Decision Trees are a much better model! I do expect overfitting; however, it will still lower the Mean Brier Score CV by a lot, I hope.

In [21]:
# Building the model
decision_tree = DecisionTreeClassifier(criterion='gini',random_state=42)
decision_tree.fit(features_preprocessed,target)

In [22]:
# Getting training scores
train_pred = decision_tree.predict(features_preprocessed)
train_probs = decision_tree.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Decision Tree - UnRegularized')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [23]:
# Getting the results and adding it to the dictionary
results = cross_validate(decision_tree,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [24]:
# Saving model in a pickle file
pickle.dump(decision_tree,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/decision_tree_unregularized.sav','wb'))

### Random Forest

Very interesting! Decision Trees overfit so bad that it performed worse or just marginally better than the dummy classifier. It did excellent on the training set but did so bad on the validation set. 

Let's see how Random Forests do; however, I expect them to do just as bad since its just a bagging of decision trees.

In [25]:
# Building the model
random_forest = RandomForestClassifier(n_estimators=1000,criterion='gini',bootstrap=True,n_jobs=-1,random_state=42,max_samples=0.7)
random_forest.fit(features_preprocessed,target)

In [26]:
# Getting training scores
train_pred = random_forest.predict(features_preprocessed)
train_probs = random_forest.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Random Forest - Basic')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [27]:
# Getting the results and adding it to the dictionary
results = cross_validate(random_forest,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [28]:
# Saving model in a pickle file
pickle.dump(random_forest,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/random_forest_basic.sav','wb'))

### AdaBoost

Interesting, Random Forest performed a lot better. Granted, I did add some regularizars such as boostrapping and portion of training set to look at for each tree. Here, I am going to try AdaBoost. I am going to use AdaBoost for Logistic Regression and Decision Trees because I am curious how it will look.

#### AdaBoost - Logistic Regression

In [29]:
# Building the model
logistic_reg_ada = LogisticRegression(penalty=None,C=1.0,random_state=42,max_iter=1000,n_jobs=-1)
adaboost_logistic = AdaBoostClassifier(estimator=logistic_reg_ada,n_estimators=500,learning_rate=1.5,random_state=42)
adaboost_logistic.fit(features_preprocessed,target)

In [30]:
# Getting training scores
train_pred = adaboost_logistic.predict(features_preprocessed)
train_probs = adaboost_logistic.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Adaboost - Logistic Regression')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [31]:
# Getting the results and adding it to the dictionary
results = cross_validate(adaboost_logistic,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [32]:
# Saving model in a pickle file
pickle.dump(adaboost_logistic,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/adaboost_logistic.sav','wb'))

#### AdaBoost - Decision Trees

In [33]:
# Building the model
decision_tree_ada = DecisionTreeClassifier(criterion='gini',random_state=42)
adaboost_decision_tree = AdaBoostClassifier(estimator=decision_tree_ada,n_estimators=500,learning_rate=1.5,random_state=42)
adaboost_decision_tree.fit(features_preprocessed,target)

In [34]:
# Getting training scores
train_pred = adaboost_decision_tree.predict(features_preprocessed)
train_probs = adaboost_decision_tree.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Adaboost - Decision Tree')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [35]:
# Getting the results and adding it to the dictionary
results = cross_validate(adaboost_decision_tree,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [36]:
# Saving model in a pickle file
pickle.dump(adaboost_decision_tree,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/adaboost_dtree.sav','wb'))

#### Logistic Regression - Elastic Net

I am going to try Logistic Regression with some regularization. I chose ElasticNet since I want some features to be 0'd via L-1 and weights to be small via L-2.

In [37]:
# Building the model
elastic_net = LogisticRegression(penalty='elasticnet',solver='saga',random_state=42,max_iter=5000,n_jobs=-1,l1_ratio=0.7)
elastic_net.fit(features_preprocessed,target)

In [38]:
# Getting training scores
train_pred = elastic_net.predict(features_preprocessed)
train_probs = elastic_net.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Elastic Net - C = 1 and p = 0.7')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [39]:
# Getting the results and adding it to the dictionary
results = cross_validate(elastic_net,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [40]:
# Saving model in a pickle file
pickle.dump(elastic_net,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/elastic_net.sav','wb'))

#### Gradient Boosting

Interesting, Logistic Regression with some regularization doesn't do much better than vanilla logistic regression. Furthermore, Random Forest still beats it. Another model to try is gradient boosting. Let's leverage CatBoost for this one.

In [59]:
# Doing some set-up required for catboost
# Putting the data into a data pool for catboost
training_pool = Pool(data=features_preprocessed,
                     label=target)

In [64]:
# Building the Model
catboost_clf = CatBoostClassifier(iterations=5000,learning_rate=0.05,loss_function='Logloss',random_seed=42,verbose=100,
                                  early_stopping_rounds=5)

# Fitting the model
catboost_clf.fit(training_pool)
plt.show()

0:	learn: 0.6840838	total: 43.5ms	remaining: 3m 37s
100:	learn: 0.4083271	total: 7.56s	remaining: 6m 6s
200:	learn: 0.2717221	total: 11.6s	remaining: 4m 37s
300:	learn: 0.1662506	total: 14.6s	remaining: 3m 48s
400:	learn: 0.1068114	total: 16.8s	remaining: 3m 12s
500:	learn: 0.0717925	total: 19.7s	remaining: 2m 57s
600:	learn: 0.0504900	total: 21.9s	remaining: 2m 40s
700:	learn: 0.0368315	total: 24.9s	remaining: 2m 32s
800:	learn: 0.0275209	total: 27.1s	remaining: 2m 22s
900:	learn: 0.0211331	total: 30.3s	remaining: 2m 17s
1000:	learn: 0.0171264	total: 32.3s	remaining: 2m 8s
1100:	learn: 0.0139857	total: 35.5s	remaining: 2m 5s
1200:	learn: 0.0115468	total: 37.3s	remaining: 1m 57s
1300:	learn: 0.0097986	total: 40.6s	remaining: 1m 55s
1400:	learn: 0.0085618	total: 42.4s	remaining: 1m 49s
1500:	learn: 0.0075087	total: 46s	remaining: 1m 47s
1600:	learn: 0.0067199	total: 48.7s	remaining: 1m 43s
1700:	learn: 0.0060547	total: 58.4s	remaining: 1m 53s
1800:	learn: 0.0055062	total: 1m 5s	remainin

In [65]:
# Getting training scores
train_pred = catboost_clf.predict(features_preprocessed,prediction_type='Class',ntree_start=0,ntree_end=catboost_clf.tree_count_ - 1)
train_probs = catboost_clf.predict_proba(features_preprocessed,ntree_start=0,ntree_end=catboost_clf.tree_count_ - 1)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Catboost - Gradient Boosting')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [67]:
# Setting up for cross validation
params = {'iterations':5000,'learning_rate':0.05,'loss_function':'Logloss',
            'random_seed':42,'verbose':200,
            'custom_metric':['BrierScore','Accuracy']}
scores = cv(training_pool,params,fold_count=5)

Training on fold [0/5]
0:	learn: 0.6835129	test: 0.6873463	best: 0.6873463 (0)	total: 996ms	remaining: 1h 22m 57s
200:	learn: 0.2275412	test: 0.6385977	best: 0.6227853 (84)	total: 7.6s	remaining: 3m 1s
400:	learn: 0.0764722	test: 0.6916963	best: 0.6227853 (84)	total: 12.5s	remaining: 2m 23s
600:	learn: 0.0336975	test: 0.7383267	best: 0.6227853 (84)	total: 17.2s	remaining: 2m 6s
800:	learn: 0.0182806	test: 0.7824567	best: 0.6227853 (84)	total: 22.2s	remaining: 1m 56s
1000:	learn: 0.0114115	test: 0.8082527	best: 0.6227853 (84)	total: 27.1s	remaining: 1m 48s
1200:	learn: 0.0079277	test: 0.8371343	best: 0.6227853 (84)	total: 32.1s	remaining: 1m 41s
1400:	learn: 0.0058967	test: 0.8566901	best: 0.6227853 (84)	total: 37s	remaining: 1m 35s
1600:	learn: 0.0049243	test: 0.8695806	best: 0.6227853 (84)	total: 42s	remaining: 1m 29s
1800:	learn: 0.0044053	test: 0.8769341	best: 0.6227853 (84)	total: 47s	remaining: 1m 23s
2000:	learn: 0.0041397	test: 0.8826590	best: 0.6227853 (84)	total: 52s	remaining

In [71]:
cv_briers = metrics['Mean Brier Score CV']
cv_briers.append(scores.loc[4999,'test-BrierScore-mean'])
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append(scores.loc[4999,'test-Accuracy-mean'])
metrics['Mean Accuracy CV'] = cv_accuracies

In [92]:
# Saving the model
catboost_clf.save_model('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/catboost_vanilla.cbm')

### Neural Network

CatBoost did ok, overfit heavy (which was expected) but this overfitting caused the Brier Score to be worse than a random model. I am going to try one more model, a neural network, before I start testing some ensembles and fine-tune some models.

In [87]:
# Building the neural network
neural_net = keras.Sequential([
    keras.Input(shape=(features_preprocessed.shape[1])), # input layer
    layers.Dense(300,activation='relu'),
    layers.Dense(500,activation='relu'),
    layers.Dense(300,activation='relu'),
    layers.Dense(150,activation='relu'),
    layers.Dense(75,activation='relu'),
    layers.Dense(1,activation='softmax'),
])

neural_net.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 300)               57000     
                                                                 
 dense_25 (Dense)            (None, 500)               150500    
                                                                 
 dense_26 (Dense)            (None, 300)               150300    
                                                                 
 dense_27 (Dense)            (None, 150)               45150     
                                                                 
 dense_28 (Dense)            (None, 75)                11325     
                                                                 
 dense_29 (Dense)            (None, 1)                 76        
                                                                 
Total params: 414,351
Trainable params: 414,351
Non-tr

In [88]:
# Leveragin SciKeras so we can use sklearn functions on the model
neural_net_wrapped = KerasClassifier(model=neural_net,optimizer='adam',loss='binary_crossentropy',random_state=42,batch_size=32,
                                     optimizer__learning_rate=0.03,epochs=500,shuffle=True)

In [89]:
# Fitting the model to the data
neural_net_wrapped.fit(features_preprocessed,target)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [90]:
# Getting training scores
train_pred = neural_net_wrapped.predict(features_preprocessed)
train_probs = neural_net_wrapped.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Deep Neural Network - No Dropout')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies



In [91]:
# Getting the results and adding it to the dictionary
results = cross_validate(neural_net_wrapped,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies



INFO:tensorflow:Assets written to: ram://9d409c26f2894e69ad719541b1821f84/assets


INFO:tensorflow:Assets written to: ram://9d409c26f2894e69ad719541b1821f84/assets
2023-03-15 17:55:31.326177: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://2f828d832fea4a238a8e1ae494f6050c: INVALID_ARGUMENT: ram://2f828d832fea4a238a8e1ae494f6050c is a directory.


INFO:tensorflow:Assets written to: ram://9a258246ebf54954b7fb33dd40281c71/assets


INFO:tensorflow:Assets written to: ram://9a258246ebf54954b7fb33dd40281c71/assets
2023-03-15 17:55:41.524279: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://0ea949435a444ee5b1754df1b720038c: INVALID_ARGUMENT: ram://0ea949435a444ee5b1754df1b720038c is a directory.


INFO:tensorflow:Assets written to: ram://3b32421a84304e59b0c5f15fcf326713/assets


INFO:tensorflow:Assets written to: ram://3b32421a84304e59b0c5f15fcf326713/assets
2023-03-15 17:55:51.646426: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://c5f3e152f17b4f288a9b42015b7945df: INVALID_ARGUMENT: ram://c5f3e152f17b4f288a9b42015b7945df is a directory.


INFO:tensorflow:Assets written to: ram://fa070b03b7de44f9a24968fb43654ade/assets


INFO:tensorflow:Assets written to: ram://fa070b03b7de44f9a24968fb43654ade/assets
2023-03-15 17:56:00.979485: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://60223ccbfc5142b2b0d87d46ff9f7727: INVALID_ARGUMENT: ram://60223ccbfc5142b2b0d87d46ff9f7727 is a directory.


INFO:tensorflow:Assets written to: ram://a19e932387064462b30e016196e1536b/assets


INFO:tensorflow:Assets written to: ram://a19e932387064462b30e016196e1536b/assets


INFO:tensorflow:Assets written to: ram://767a58d3177f4f149bee9c2ac6573fcf/assets


INFO:tensorflow:Assets written to: ram://767a58d3177f4f149bee9c2ac6573fcf/assets
2023-03-15 17:56:13.810096: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-15 17:56:16.980032: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://bdb2c9a9ed7e4cd787739b033edf67f3: INVALID_ARGUMENT: ram://bdb2c9a9ed7e4cd787739b033edf67f3 is a directory.


INFO:tensorflow:Assets written to: ram://81c3232a184040f889fdde5b0826fbf7/assets


INFO:tensorflow:Assets written to: ram://81c3232a184040f889fdde5b0826fbf7/assets
2023-03-15 17:56:24.879082: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:Assets written to: ram://3313bcd24cd1418990a60763f8341844/assets


INFO:tensorflow:Assets written to: ram://3313bcd24cd1418990a60763f8341844/assets
2023-03-15 17:56:31.584648: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:Assets written to: ram://bec6d5e4b0e546039169ba2c16a40527/assets


INFO:tensorflow:Assets written to: ram://bec6d5e4b0e546039169ba2c16a40527/assets
2023-03-15 17:56:39.296448: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-15 17:56:39.357737: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-15 17:56:44.722077: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://de06b84af80b4a67b78d428ddc8363a1: INVALID_ARGUMENT: ram://de06b84af80b4a67b78d428ddc8363a1 is a directory.


Epoch 1/500
INFO:tensorflow:Assets written to: ram://6b13e9b2534d4ec2882d364de12fa440/assets


INFO:tensorflow:Assets written to: ram://6b13e9b2534d4ec2882d364de12fa440/assets
2023-03-15 17:56:48.112476: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500

2023-03-15 17:56:53.803185: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://9cabb6b9d4654ff19ae171b759025157: INVALID_ARGUMENT: ram://9cabb6b9d4654ff19ae171b759025157 is a directory.


Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
 7/41 [====>.........................] - ETA: 0s - loss: 0.6930

2023-03-15 17:57:01.242071: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 32/500
Epoch 2/500
Epoch 33/500
Epoch 3/500
Epoch 34/500
Epoch 4/500
Epoch 5/500
 1/41 [..............................] - ETA: 0s - loss: 0.6944Epoch 36/500
Epoch 37/500
Epoch 6/500
Epoch 38/500
Epoch 7/500
Epoch 39/500
Epoch 8/500
Epoch 40/500
 1/41 [..............................] - ETA: 0s - loss: 0.6996Epoch 9/500
Epoch 41/500
Epoch 10/500
Epoch 11/500
Epoch 42/500
Epoch 43/500
Epoch 12/500
Epoch 44/500
Epoch 13/500
Epoch 45/500
Epoch 14/500
Epoch 46/500
Epoch 15/500
Epoch 47/500
Epoch 16/500
Epoch 48/500
Epoch 17/500
Epoch 18/500
Epoch 50/500
Epoch 19/500
Epoch 20/500
Epoch 52/500
Epoch 21/500
Epoch 22/500
Epoch 53/500
Epoch 54/500
Epoch 23/500
Epoch 55/500

2023-03-15 17:57:12.369352: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://e483d93094d54fbf99def1c503868aaa: INVALID_ARGUMENT: ram://e483d93094d54fbf99def1c503868aaa is a directory.


Epoch 24/500
Epoch 56/500
Epoch 25/500
Epoch 57/500
Epoch 26/500
Epoch 27/500
Epoch 59/500
Epoch 28/500
Epoch 60/500
Epoch 29/500
Epoch 61/500
Epoch 30/500
Epoch 31/500
Epoch 63/500
Epoch 32/500
Epoch 64/500
Epoch 33/500
Epoch 65/500
Epoch 34/500
Epoch 66/500
Epoch 35/500
Epoch 67/500
Epoch 36/500
Epoch 68/500
Epoch 37/500
Epoch 69/500
Epoch 38/500
Epoch 70/500
Epoch 39/500
Epoch 71/500
Epoch 40/500
Epoch 72/500
Epoch 41/500

2023-03-15 17:57:20.345120: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 73/500
Epoch 42/500
Epoch 74/500
Epoch 43/500
Epoch 75/500
Epoch 44/500
Epoch 76/500
Epoch 45/500
Epoch 77/500
Epoch 46/500
Epoch 78/500
Epoch 47/500
Epoch 79/500
Epoch 48/500
Epoch 80/500
Epoch 49/500
Epoch 81/500
Epoch 50/500
Epoch 82/500
Epoch 51/500
Epoch 83/500
Epoch 52/500
Epoch 84/500
Epoch 53/500
Epoch 2/500
Epoch 54/500
Epoch 3/500
Epoch 86/500
Epoch 55/500
Epoch 4/500
Epoch 87/500
Epoch 56/500
Epoch 5/500
Epoch 88/500
Epoch 57/500
Epoch 6/500
Epoch 89/500
Epoch 58/500
Epoch 7/500
Epoch 90/500
Epoch 59/500
Epoch 8/500
Epoch 60/500
Epoch 9/500
Epoch 92/500
Epoch 61/500
Epoch 10/500
Epoch 93/500
Epoch 11/500
Epoch 94/500
Epoch 63/500
Epoch 12/500
Epoch 95/500
Epoch 64/500
Epoch 13/500
Epoch 96/500
Epoch 65/500

2023-03-15 17:57:32.608315: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://6728f75848094c5e84005dc1dd24ff93: INVALID_ARGUMENT: ram://6728f75848094c5e84005dc1dd24ff93 is a directory.


Epoch 14/500
Epoch 97/500
Epoch 66/500
Epoch 15/500
Epoch 98/500
Epoch 67/500
Epoch 16/500
Epoch 99/500
Epoch 68/500
Epoch 17/500
Epoch 100/500
Epoch 69/500
 1/41 [..............................] - ETA: 0s - loss: 0.6949Epoch 1/500
Epoch 18/500
Epoch 101/500
Epoch 70/500
Epoch 102/500
Epoch 71/500
Epoch 20/500
Epoch 103/500
Epoch 72/500
Epoch 21/500
Epoch 104/500
Epoch 73/500
Epoch 22/500
Epoch 105/500
Epoch 74/500
Epoch 23/500
Epoch 106/500
Epoch 75/500
Epoch 24/500
Epoch 107/500
Epoch 76/500
Epoch 25/500
Epoch 108/500
Epoch 77/500
Epoch 26/500
Epoch 109/500
Epoch 27/500
Epoch 79/500
Epoch 28/500
Epoch 111/500
Epoch 80/500
Epoch 29/500
Epoch 112/500
 4/41 [=>............................] - ETA: 0s - loss: 0.6933Epoch 81/500
Epoch 30/500
 6/41 [===>..........................] - ETA: 0s - loss: 0.6940Epoch 113/500
Epoch 82/500
Epoch 31/500
Epoch 114/500
Epoch 83/500
Epoch 32/500
Epoch 115/500
Epoch 84/500
Epoch 33/500
Epoch 116/500
Epoch 85/500
Epoch 117/500
Epoch 86/500
Epoch 35/500
Ep



Epoch 482/500
Epoch 389/500
Epoch 430/500
Epoch 390/500
 1/41 [..............................] - ETA: 0s - loss: 0.6912Epoch 431/500
Epoch 484/500
Epoch 432/500
Epoch 391/500
Epoch 485/500
Epoch 433/500
 1/41 [..............................] - ETA: 0s - loss: 0.6964Epoch 392/500
Epoch 486/500
Epoch 434/500
Epoch 393/500
Epoch 487/500
Epoch 394/500
Epoch 395/500
Epoch 489/500
Epoch 437/500
 5/41 [==>...........................] - ETA: 1s - loss: 0.6940Epoch 396/500
Epoch 490/500
Epoch 438/500
Epoch 397/500
Epoch 491/500
Epoch 439/500
Epoch 398/500
Epoch 492/500
Epoch 440/500
 9/41 [=====>........................] - ETA: 0s - loss: 0.6936Epoch 399/500
Epoch 441/500
Epoch 400/500
Epoch 494/500
Epoch 442/500
Epoch 401/500
Epoch 495/500
Epoch 402/500
Epoch 496/500
Epoch 444/500
Epoch 403/500
Epoch 497/500
Epoch 445/500
Epoch 404/500
Epoch 498/500
Epoch 446/500
Epoch 405/500
Epoch 499/500
Epoch 447/500
Epoch 406/500
Epoch 500/500
Epoch 448/500
Epoch 407/500
Epoch 449/500
Epoch 408/500
Epoch 

2023-03-15 18:01:50.022694: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://ff65a19af8a448d68e25f1fef113c3a5: INVALID_ARGUMENT: ram://ff65a19af8a448d68e25f1fef113c3a5 is a directory.


Epoch 458/500
Epoch 417/500
Epoch 459/500
Epoch 418/500
 6/41 [===>..........................] - ETA: 0s - loss: 0.6931



Epoch 460/500
Epoch 419/500
Epoch 461/500
 1/41 [..............................] - ETA: 0s - loss: 0.6936Epoch 420/500
 8/41 [====>.........................] - ETA: 0s - loss: 0.6934Epoch 1/500
Epoch 462/500
Epoch 421/500
Epoch 463/500
Epoch 422/500
Epoch 464/500
Epoch 423/500
Epoch 465/500
 1/41 [..............................] - ETA: 0s - loss: 0.6931Epoch 424/500
Epoch 466/500
Epoch 425/500
Epoch 467/500
Epoch 426/500
Epoch 468/500
Epoch 427/500
Epoch 469/500
Epoch 428/500
Epoch 470/500
Epoch 429/500
Epoch 471/500
Epoch 430/500
Epoch 472/500
Epoch 431/500
Epoch 473/500
Epoch 432/500
Epoch 474/500
Epoch 433/500
Epoch 475/500
Epoch 434/500
Epoch 435/500
Epoch 477/500
Epoch 436/500
Epoch 478/500
Epoch 437/500
Epoch 438/500
Epoch 480/500
Epoch 439/500
Epoch 2/500
Epoch 481/500
Epoch 440/500
Epoch 3/500
Epoch 482/500
Epoch 441/500
Epoch 4/500
Epoch 483/500
Epoch 442/500
Epoch 5/500
Epoch 484/500
Epoch 443/500
Epoch 485/500
Epoch 6/500
Epoch 444/500
Epoch 486/500
Epoch 7/500
Epoch 445/500



Epoch 32/500
Epoch 470/500
Epoch 33/500
 7/41 [====>.........................] - ETA: 0s - loss: 0.6938Epoch 471/500
Epoch 34/500
Epoch 472/500
Epoch 35/500
Epoch 473/500
Epoch 36/500
Epoch 474/500
Epoch 37/500
Epoch 475/500
Epoch 38/500
 7/41 [====>.........................] - ETA: 0s - loss: 0.6905Epoch 476/500
Epoch 39/500
Epoch 477/500
Epoch 40/500
Epoch 478/500
Epoch 41/500
Epoch 479/500
Epoch 42/500
Epoch 480/500
Epoch 43/500
Epoch 481/500
Epoch 44/500
Epoch 482/500
Epoch 45/500
Epoch 483/500
Epoch 46/500
Epoch 484/500
Epoch 47/500
Epoch 485/500
Epoch 48/500
Epoch 486/500
Epoch 49/500
Epoch 487/500
Epoch 50/500
Epoch 488/500
Epoch 51/500
Epoch 489/500
Epoch 52/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 54/500
Epoch 493/500
Epoch 55/500
Epoch 56/500
Epoch 494/500
Epoch 57/500
 1/41 [..............................] - ETA: 2s - loss: 0.6927Epoch 495/500
Epoch 58/500
Epoch 496/500
Epoch 59/500
Epoch 497/500
Epoch 60/500
 1/41 [..............................] - ETA: 0s - loss



Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142

### Ensemble: Logistic Regression + Random Forest

The Neural Network did pretty bad as well! Based on the scores, looks like logistic regression and random forest are the 2 models worth tuning. Let's combine them to see what would happen.

In [97]:
# Building the model
logistic_reg_stack = LogisticRegression(penalty=None,C=1.0,random_state=42,max_iter=1000,n_jobs=-1)
random_forest_stack = RandomForestClassifier(n_estimators=1000,criterion='gini',bootstrap=True,n_jobs=-1,random_state=42,max_samples=0.7)

# Stacked model
log_forest = VotingClassifier(estimators=[('lr',logistic_reg_stack),('rf',random_forest_stack)],voting='soft',n_jobs=-1)

# Fitting the model
log_forest.fit(features_preprocessed,target)

In [98]:
# Getting training scores
train_pred = log_forest.predict(features_preprocessed)
train_probs = log_forest.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Ensemble 1 - Logistic Regression + Random Forest')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [99]:
# Getting the results and adding it to the dictionary
results = cross_validate(log_forest,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [100]:
# Saving model in a pickle file
pickle.dump(log_forest,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/log_forest_ensemble.sav','wb'))

### Ensemble: Logistic Regression + Random Forest (bit of regularization)

Testing out the ensemble if both models have some regularization

In [102]:
# Building the model
logistic_reg_stack_2 = LogisticRegression(penalty='elasticnet',C=0.3,random_state=42,max_iter=2000,n_jobs=-1,solver='saga',l1_ratio=0.6)
random_forest_stack_2 = RandomForestClassifier(n_estimators=200,criterion='gini',bootstrap=True,n_jobs=-1,random_state=42,max_samples=0.7,
                                             max_depth=7,min_samples_leaf=3,ccp_alpha=0.02)

# Stacked model
log_forest_2 = VotingClassifier(estimators=[('lr',logistic_reg_stack),('rf',random_forest_stack)],voting='soft',n_jobs=-1)

# Fitting the model
log_forest_2.fit(features_preprocessed,target)

In [103]:
# Getting training scores
train_pred = log_forest_2.predict(features_preprocessed)
train_probs = log_forest_2.predict_proba(features_preprocessed)[:,1]

# Getting the metrics
train_brier = brier_score_loss(target,train_probs)
train_accuracy = accuracy_score(target,train_pred)

# Adding metrics to the dictionary
model_list = metrics['Model']
model_list.append('Ensemble 1 - Logistic Regression + Random Forest with Regularization')
metrics['Model'] = model_list
train_briers = metrics['Brier Score Training']
train_briers.append(train_brier)
metrics['Brier Score Training'] = train_briers
train_accuracies = metrics['Accuracy Training']
train_accuracies.append(train_accuracy)
metrics['Accuracy Training'] = train_accuracies

In [104]:
# Getting the results and adding it to the dictionary
results = cross_validate(log_forest_2,features_preprocessed,target,scoring=['accuracy','neg_brier_score'],
                         cv=5,n_jobs=-1)

cv_briers = metrics['Mean Brier Score CV']
cv_briers.append((-1*results['test_neg_brier_score']).mean())
metrics['Mean Brier Score CV'] = cv_briers
cv_accuracies = metrics['Mean Accuracy CV']
cv_accuracies.append((results['test_accuracy']).mean())
metrics['Mean Accuracy CV'] = cv_accuracies

In [105]:
# Saving model in a pickle file
pickle.dump(log_forest_2,open('/Users/jinalshah/Jinal/Projects/march-madness-mania/models/log_forest_ensemble_2.sav','wb'))

In [106]:
# Converting metrics into a dataframe
pd.options.display.float_format = '{:.5f}'.format # Making sure it doesn't display in scientific notation
metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,Model,Brier Score Training,Mean Brier Score CV,Accuracy Training,Mean Accuracy CV
0,Dummy Classifier,0.25,0.25,0.51681,0.50124
1,Logistic Regression - No Regularization,0.16506,0.22291,0.76526,0.66005
2,Decision Tree - UnRegularized,0.0,0.41093,1.0,0.58907
3,Random Forest - Basic,0.05107,0.20731,1.0,0.6731
4,Adaboost - Logistic Regression,0.24962,0.24968,0.74533,0.66504
5,Adaboost - Decision Tree,0.0,0.42714,1.0,0.57286
6,Elastic Net - C = 1 and p = 0.7,0.16821,0.21252,0.74658,0.67562
7,Catboost - Gradient Boosting,2e-05,0.26023,1.0,0.65065
8,Deep Neural Network - No Dropout,0.49626,0.49626,0.50374,0.50374
9,Ensemble 1 - Logistic Regression + Random Forest,0.09562,0.20315,0.91407,0.67686


In [107]:
# Saving the results to view
metrics_df.to_csv('/Users/jinalshah/Jinal/Projects/march-madness-mania/results.csv')