# modeling_with_pca.ipynb

The best validation accuracy is around 68%. I suspect this is a consequence of having roughly 190 features. Let me repeat the same modeling, but this time I will first run the data through PCA (a dimensionality reduction algorithm) to see whether it brings any improvement.

In [94]:
# Importing libraries
import os
from pathlib import Path

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import brier_score_loss, accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

%matplotlib inline

In [23]:
# Getting the training data
# The data directory defaults to the original location on the author's machine. Set the
# MARCH_MADNESS_DATA_DIR environment variable to load the same file from another directory
# without editing the notebook.
MODELING_DATA_DIR = Path(os.environ.get(
    'MARCH_MADNESS_DATA_DIR',
    '/Users/jinalshah/Jinal/Projects/march-madness-mania/preprocessed-data/modeling-data',
))
# index_col=0: the first CSV column is used as the row index rather than as a feature
training_data = pd.read_csv(MODELING_DATA_DIR / 'training.csv',index_col=0)

# Making sure the data loaded correctly
training_data.head()

Unnamed: 0,Season,lower_TeamID,lower_Wins,lower_Losses,lower_Winning Percentage,lower_Score_mean,lower_FGM_mean,lower_FGA_mean,lower_FGM3_mean,lower_FGA3_mean,...,higher_AssistToTurnoverRatio_std,higher_Possessions_std,higher_OffEff_std,higher_DefEff_std,higher_TO%_std,higher_PointDiff_std,higher_OffensiveRating_std,higher_DefensiveRating_std,Bracket,LowerWin?
1057,2019,1242,25,9,0.73529,75.38235,27.29412,59.05882,7.23529,20.64706,...,0.88118,7.48574,25.1354,45.69424,0.13955,12.27664,62672.90834,241.65948,M,1
389,2009,1143,22,10,0.6875,75.03125,27.09375,55.90625,6.34375,14.625,...,0.48346,5.70708,24.85725,37.9221,0.10183,17.02204,56403.3844,586.62096,M,0
462,2010,1352,23,11,0.67647,68.5,23.32353,53.32353,5.64706,15.47059,...,0.43535,5.96692,31.32572,35.56787,0.12975,14.41848,66317.66544,600.24611,M,0
575,2011,1242,32,2,0.94118,82.38235,29.58824,57.61765,7.26471,18.76471,...,0.5408,7.14006,34.3355,38.4618,0.15237,10.11859,49478.63714,459.16589,M,0
559,2011,1228,19,13,0.59375,71.28125,26.34375,56.34375,6.84375,17.6875,...,0.63759,7.10154,35.1619,40.1645,0.16414,15.80206,79118.59988,797.29039,M,0


## Data Preprocessing

We need to preprocess the data a little bit so let's do that!

Preprocessing that needs to be done:
- Dropping the identifiers (lower_TeamID,higher_TeamID)
- Converting Season into a number for how many seasons back the data is from (-1 = last season, -2 = 2 seasons ago, etc)
- Converting Bracket into dummy variables
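- Standardizing all numerical values (z-score) so that every feature has zero mean and unit variance and all features are on the same scale. Note that this rescales the values but does not make their distributions normal.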

(Will convert to script later)

In [24]:
# Derive the modeling frame from training_data without touching the original:
#   1. drop the two team identifiers,
#   2. append Season_converted = Season - 2023 as the last column,
#   3. drop the raw Season column.
# Every step returns a new frame, so re-running this cell always starts from training_data.
training_data_preprocessed = (
    training_data
    .drop(columns=['lower_TeamID', 'higher_TeamID'])
    .assign(Season_converted=lambda frame: frame['Season'] - 2023.0)
    .drop(columns=['Season'])
)

# Target is the LowerWin? column; everything else is a feature
target = training_data_preprocessed['LowerWin?']
features = training_data_preprocessed.drop(columns=['LowerWin?'])

# Bracket is the only categorical feature; all remaining feature columns are numerical
categorical = ['Bracket']
numerical = features.columns.tolist()
numerical.remove('Bracket')

In [25]:
# Column-wise preprocessing:
#   - 'scaler'  : z-scores every column listed in `numerical`
#   - 'encoder' : one-hot encodes every column listed in `categorical`
# remainder='passthrough' keeps any column named in neither list unchanged.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(with_mean=True, with_std=True), numerical),
        ('encoder', OneHotEncoder(), categorical),
    ],
    remainder='passthrough',
    n_jobs=-1,
    verbose=True,
)

In [26]:
# Fit the scaler and the one-hot encoder on the full feature matrix and transform it in one step
features_preprocessed = preprocessing_pipeline.fit_transform(features)

# Display the transformed array as a sanity check that fitting happened properly
features_preprocessed

[ColumnTransformer] ....... (2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer] ........ (1 of 2) Processing scaler, total=   0.0s


array([[ 0.23655977,  0.5006731 , -0.28877454, ...,  1.03353159,
         1.        ,  0.        ],
       [-0.49451463,  0.78784489, -0.72021046, ..., -0.86041151,
         1.        ,  0.        ],
       [-0.25082316,  1.07501668, -0.8197726 , ..., -0.6710172 ,
         1.        ,  0.        ],
       ...,
       [ 0.72394271, -0.93518583,  0.93594679, ..., -0.6710172 ,
         0.        ,  1.        ],
       [ 0.96763417, -0.93518583,  0.97234585, ...,  1.03353159,
         0.        ,  1.        ],
       [ 0.96763417, -0.93518583,  0.97234585, ..., -0.48162289,
         0.        ,  1.        ]])

In [27]:
# Dimensions before PCA: (rows, columns after scaling and one-hot encoding)
features_preprocessed.shape

(1606, 189)

In [71]:
# Running the data through PCA to reduce the dimensions (189 columns -> 100 components)
# NOTE(review): the PCA here, like the scaler/encoder before it, is fit on every training row,
# and the cross-validation cells further down reuse this already-projected matrix. The rows held
# out in each CV fold have therefore influenced the projection. Fitting these steps inside each
# fold (e.g. a Pipeline passed to cross_validate) would give a cleaner CV estimate -- confirm
# whether the difference matters for this comparison.
pca = PCA(n_components=100,random_state=42)
features_preprocessed_pca = pca.fit_transform(features_preprocessed)

In [72]:
# Dimensions after PCA: (rows, retained principal components)
features_preprocessed_pca.shape

(1606, 100)

In [73]:
# First check of whether PCA helps: L2-regularised logistic regression (C=0.5)
# fitted on the PCA-projected features.
log_reg = LogisticRegression(
    penalty='l2',
    C=0.5,
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)
log_reg.fit(features_preprocessed_pca, target)

In [74]:
# Accumulator for the comparison table: one list per column, one entry appended per model.
# Each key gets its own empty list.
metrics = {
    column: []
    for column in (
        'Model',
        'Brier Score Training',
        'Mean Brier Score CV',
        'Accuracy Training',
        'Mean Accuracy CV',
    )
}

In [75]:
# Score the fitted logistic regression on the same data it was trained on
train_pred = log_reg.predict(features_preprocessed_pca)
# Second predict_proba column (the LowerWin? = 1 class, given the 0/1 labels)
train_probs = log_reg.predict_proba(features_preprocessed_pca)[:, 1]

# Brier score is computed from the probabilities, accuracy from the hard predictions
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record the training half of this model's row; the lists are mutated in place
metrics['Model'].append('Logistic Regression')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [76]:
# 5-fold cross-validation of the logistic regression on the PCA features
results = cross_validate(
    log_reg,
    features_preprocessed_pca,
    target,
    cv=5,
    scoring=['accuracy', 'neg_brier_score'],
    n_jobs=-1,
)

# The scorer reports the negated Brier score, so flip the sign before averaging over folds
metrics['Mean Brier Score CV'].append((-results['test_neg_brier_score']).mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [77]:
# Performing a test using random forest to see if PCA helps
# 1000 gini trees, each grown on a bootstrap sample of 70% of the rows (max_samples=0.7).
# random_state=42 matches every other estimator in this notebook: without it the bootstrap
# samples and per-split feature sub-sampling differ on every run, so neither this fit nor the
# cross-validation scores of this estimator would be reproducible.
forest = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    max_samples=0.7,
    random_state=42,
    n_jobs=-1,
)
forest.fit(features_preprocessed_pca,target)

In [78]:
# Score the fitted random forest on the same data it was trained on
train_pred = forest.predict(features_preprocessed_pca)
# Second predict_proba column (the LowerWin? = 1 class, given the 0/1 labels)
train_probs = forest.predict_proba(features_preprocessed_pca)[:, 1]

# Brier score is computed from the probabilities, accuracy from the hard predictions
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record the training half of this model's row; the lists are mutated in place
metrics['Model'].append('Random Forest')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [79]:
# 5-fold cross-validation of the random forest on the PCA features
results = cross_validate(
    forest,
    features_preprocessed_pca,
    target,
    cv=5,
    scoring=['accuracy', 'neg_brier_score'],
    n_jobs=-1,
)

# The scorer reports the negated Brier score, so flip the sign before averaging over folds
metrics['Mean Brier Score CV'].append((-results['test_neg_brier_score']).mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [83]:
# Mean 5-fold cross-validated log loss for the logistic regression
# (the scorer is negated, so the sign is flipped before averaging)
res = cross_validate(
    log_reg,
    features_preprocessed_pca,
    target,
    cv=5,
    scoring=['neg_log_loss'],
    n_jobs=-1,
)
(-res['test_neg_log_loss']).mean()

0.5977446072277257

In [84]:
# Mean 5-fold cross-validated log loss for the random forest
# (the scorer is negated, so the sign is flipped before averaging)
res = cross_validate(
    forest,
    features_preprocessed_pca,
    target,
    cv=5,
    scoring=['neg_log_loss'],
    n_jobs=-1,
)
(-res['test_neg_log_loss']).mean()

0.6429430259338592

In [90]:
# Exploratory: a single decision tree capped at depth 15, fitted on the PCA features
tree = DecisionTreeClassifier(
    random_state=42,
    max_depth=15,
)
tree.fit(features_preprocessed_pca, target)

In [91]:
# Score the fitted decision tree on the same data it was trained on
train_pred = tree.predict(features_preprocessed_pca)
# Second predict_proba column (the LowerWin? = 1 class, given the 0/1 labels)
train_probs = tree.predict_proba(features_preprocessed_pca)[:, 1]

# Brier score is computed from the probabilities, accuracy from the hard predictions
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record the training half of this model's row; the lists are mutated in place
metrics['Model'].append('Decision Tree')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [92]:
# 5-fold cross-validation of the decision tree on the PCA features
results = cross_validate(
    tree,
    features_preprocessed_pca,
    target,
    cv=5,
    scoring=['accuracy', 'neg_brier_score'],
    n_jobs=-1,
)

# The scorer reports the negated Brier score, so flip the sign before averaging over folds
metrics['Mean Brier Score CV'].append((-results['test_neg_brier_score']).mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [97]:
# Support vector classifier with a degree-10 polynomial kernel, fitted on the PCA features.
# probability=True is required so that predict_proba is available for the Brier score.
svc = SVC(
    kernel='poly',
    degree=10,
    C=0.5,
    probability=True,
    random_state=42,
)
svc.fit(features_preprocessed_pca, target)

In [98]:
# Score the fitted support vector classifier on the same data it was trained on
train_pred = svc.predict(features_preprocessed_pca)
# Second predict_proba column (the LowerWin? = 1 class, given the 0/1 labels)
train_probs = svc.predict_proba(features_preprocessed_pca)[:, 1]

# Brier score is computed from the probabilities, accuracy from the hard predictions
train_brier = brier_score_loss(target, train_probs)
train_accuracy = accuracy_score(target, train_pred)

# Record the training half of this model's row; the lists are mutated in place
metrics['Model'].append('Support Vector Classifier')
metrics['Brier Score Training'].append(train_brier)
metrics['Accuracy Training'].append(train_accuracy)

In [99]:
# 5-fold cross-validation of the support vector classifier on the PCA features
results = cross_validate(
    svc,
    features_preprocessed_pca,
    target,
    cv=5,
    scoring=['accuracy', 'neg_brier_score'],
    n_jobs=-1,
)

# The scorer reports the negated Brier score, so flip the sign before averaging over folds
metrics['Mean Brier Score CV'].append((-results['test_neg_brier_score']).mean())
metrics['Mean Accuracy CV'].append(results['test_accuracy'].mean())

In [100]:
# Show floats with five fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.5f}'.format)

# Build the comparison table: one row per entry appended to `metrics`
metrics_df = pd.DataFrame(data=metrics)
metrics_df

Unnamed: 0,Model,Brier Score Training,Mean Brier Score CV,Accuracy Training,Mean Accuracy CV
0,Logistic Regression,0.17628,0.20576,0.72976,0.67439
1,Random Forest,0.05549,0.22528,1.0,0.66254
2,Decision Tree,0.0,0.4097,1.0,0.5903
3,Decision Tree,0.01959,0.40761,0.97509,0.58593
4,Decision Tree,0.24337,0.24915,0.73537,0.50872
