## Import Libraries

In [99]:
# importing data
import pandas as pd

# train_test_split & cross validation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score

# creating piplines
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

# machine learning algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import numpy as np

## Import Data

In [75]:
df = pd.read_pickle('run_pass_df.pkl')

## Prepare Data

In [76]:
# prepare for the train_test_split
X = df.drop('Target', axis=1)
y = df.Target

In [77]:
# perform train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2020, test_size=0.20)

In [78]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((68998, 18), (17250, 18), (68998,), (17250,))

## Making Pipelines

In [None]:
# let's create a pipeline to do all of our preprocessing for us

In [104]:
preprocessing = make_column_transformer((OneHotEncoder(), make_column_selector(dtype_include=object)),
                                       (StandardScaler(), make_column_selector(dtype_include=np.number)))

# SMOTE() -> removed SMOTE from preprocessing pipeline

preprocessing

ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f171e3250>),
                                ('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f171e3af0>)])

In [105]:
# fit and transform our preprocessing pipeline to our training data
preprocessing.fit_transform(X_train)

array([[ 0.90564106,  0.85277944,  1.32850403, ...,  0.8464257 ,
        -1.80343225, -1.5457903 ],
       [ 0.74100312,  0.31975379,  1.0627638 , ..., -0.51012371,
        -0.49756381, -0.42533752],
       [ 0.98796002,  0.60107288, -0.45786081, ...,  2.10607873,
        -1.43032698, -1.62342321],
       ...,
       [ 0.0824514 ,  0.31235066,  1.05907297, ...,  0.74952932,
        -0.49756381, -0.56399903],
       [ 0.98796002, -0.60193639,  0.60325467, ...,  1.04021848,
        -0.77739276, -1.20649511],
       [ 0.78216261, -0.62414579,  0.59218216, ..., -0.21943455,
        -0.77739276, -0.87185151]])

In [106]:
# the next thing we'll do is make separate pipelines for each model we want to test
# each of these pipelines will contain our preprocessing pipeline

dt_pipeline = make_pipeline(preprocessing, DecisionTreeClassifier(random_state=2020))
rf_pipeline = make_pipeline(preprocessing, RandomForestClassifier(random_state=2020))
lr_pipeline = make_pipeline(preprocessing, LogisticRegression(random_state=2020))
et_pipeline = make_pipeline(preprocessing, ExtraTreesClassifier(random_state=2020))
#nb_pipeline = make_pipeline(preprocessing, GaussianNB())
#xgb_pipeline = make_pipeline(preprocessing, xgb.XGBClassifier(random_state=2020))
kn_pipeline = make_pipeline(preprocessing, KNeighborsClassifier())

## Create Param_Grids

In [107]:
# different param_grids for each pipeline

dt_param_grid = {
    'decisiontreeclassifier__criterion': ['entropy', 'gini'],
    'decisiontreeclassifier__splitter': ['best', 'random'],
    'decisiontreeclassifier__max_depth': [2, 5, 10],
    'decisiontreeclassifier__max_features': ['auto', 'sqrt', 'log2'],
    'decisiontreeclassifier__class_weight': ['none', 'balanced']
    
}

rf_param_grid = {
    'randomforestclassifier__n_estimators': [100, 1000, 2000],
    'randomforestclassifier__max_depth': [2, 5, 10]
    
}

lr_param_grid = {
    'logisticregression__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'logisticregression__dual': [True, False],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'logisticregression__multi_class': ['auto', 'ovr', 'multinomial'],
    'logisticregression__n_jobs': [10, 20, 30],
    'logisticregression__C': [0.01, 0.1, 0.5]
    
}

et_param_grid = {
    'extratreesclassifier__criterion': ['entropy', 'gini'],
    'extratreesclassifier__max_depth': [2, 5, 10],
    'extratreesclassifier__n_estimators': [100, 250, 500],
    'extratreesclassifier__max_features': ['auto', 'sqrt', 'log2'],
    'extratreesclassifier__class_weight': ['none', 'balanced']
    
}

# nb_param_grid = {
#     'gaussiannb__'
# }

# xgb_param_grid = {
#     #'xgb.xgbclassifier__booster': ['gbtree', 'gblinear', 'dart'],
#     'xgb.xgbclassifier__max_depth': [2, 5, 10],
#     'xgb.xgbclassifier__n_jobs': [10, 20, 30]
#     #'xgb.xgbclassifier__n_estimators': [100, 1000, 2000]
# }

kn_param_grid = {
    'kneighborsclassifier__n_neighbors': [2, 3], # could increase values
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__p': [1, 2]
    
}

## LogisticRegression

In [10]:
search_lr = GridSearchCV(lr_pipeline, lr_param_grid, n_jobs=-1)

search_lr.fit(X_train, y_train)



GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f14912df0>),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f14912ac0>)])),
                                       ('logisticregression',
                                        L...20))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.1, 0.5],
                         'logisticregression__dual

In [11]:
# we can check its best parameters
search_lr.best_params_

{'logisticregression__C': 0.01,
 'logisticregression__dual': True,
 'logisticregression__multi_class': 'auto',
 'logisticregression__n_jobs': 10,
 'logisticregression__penalty': 'l2',
 'logisticregression__solver': 'liblinear'}

In [12]:
# assign best model to a variable using best_estimator_
best_lr_pipeline = search_lr.best_estimator_

In [13]:
# cross validation using the accuracy, precision, recall, and f1 score metric
best_lr_cross_val_acc = cross_val_score(best_lr_pipeline, X_train, y_train, scoring='accuracy')
best_lr_cross_val_prec = cross_val_score(best_lr_pipeline, X_train, y_train, scoring='precision')
best_lr_cross_val_rec = cross_val_score(best_lr_pipeline, X_train, y_train, scoring='recall')
best_lr_cross_val_f1 = cross_val_score(best_lr_pipeline, X_train, y_train, scoring='f1')



## DecisionTree

In [14]:
search_dt = GridSearchCV(dt_pipeline, dt_param_grid, n_jobs=-1)

search_dt.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f14912df0>),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f14912ac0>)])),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier(random_state=2020))]),
             n_jobs=-1,
             param_grid={'decisiontreeclassifier__class_weight': ['none',
      

In [15]:
# we can check its best parameters
search_dt.best_params_

{'decisiontreeclassifier__class_weight': 'balanced',
 'decisiontreeclassifier__criterion': 'gini',
 'decisiontreeclassifier__max_depth': 10,
 'decisiontreeclassifier__max_features': 'auto',
 'decisiontreeclassifier__splitter': 'random'}

In [16]:
# assign best model to a variable using best_estimator_
best_dt_pipeline = search_dt.best_estimator_

In [17]:
# cross validation using the accuracy, precision, recall, and f1 score metric
best_dt_cross_val_acc = cross_val_score(best_dt_pipeline, X_train, y_train, scoring='accuracy')
best_dt_cross_val_prec = cross_val_score(best_dt_pipeline, X_train, y_train, scoring='precision')
best_dt_cross_val_rec = cross_val_score(best_dt_pipeline, X_train, y_train, scoring='recall')
best_dt_cross_val_f1 = cross_val_score(best_dt_pipeline, X_train, y_train, scoring='f1')

## RandomForest

In [18]:
search_rf = GridSearchCV(rf_pipeline, rf_param_grid, n_jobs=-1)

search_rf.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f14912df0>),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f14912ac0>)])),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=2020))]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [2, 5, 10],
      

In [19]:
# we can check its best parameters
search_rf.best_params_

{'randomforestclassifier__max_depth': 10,
 'randomforestclassifier__n_estimators': 100}

In [20]:
# assign best model to a variable using best_estimator_
best_rf_pipeline = search_rf.best_estimator_

In [21]:
# cross validation using the accuracy, precision, recall, and f1 score metric
best_rf_cross_val_acc = cross_val_score(best_rf_pipeline, X_train, y_train, scoring='accuracy')
best_rf_cross_val_prec = cross_val_score(best_rf_pipeline, X_train, y_train, scoring='precision')
best_rf_cross_val_rec = cross_val_score(best_rf_pipeline, X_train, y_train, scoring='recall')
best_rf_cross_val_f1 = cross_val_score(best_rf_pipeline, X_train, y_train, scoring='f1')

## ExtraTrees

In [22]:
search_et = GridSearchCV(et_pipeline, et_param_grid, n_jobs=-1)

search_et.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f14912df0>),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f14912ac0>)])),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier(random_state=2020))]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__class_weight': ['none',
            

In [23]:
# we can check its best parameters
search_et.best_params_

{'extratreesclassifier__class_weight': 'balanced',
 'extratreesclassifier__criterion': 'gini',
 'extratreesclassifier__max_depth': 10,
 'extratreesclassifier__max_features': 'auto',
 'extratreesclassifier__n_estimators': 500}

In [24]:
# assign best model to a variable using best_estimator_
best_et_pipeline = search_et.best_estimator_

In [26]:
# cross validation using the accuracy, precision, recall, and f1 score metric
best_et_cross_val_acc = cross_val_score(best_et_pipeline, X_train, y_train, scoring='accuracy')
best_et_cross_val_prec = cross_val_score(best_et_pipeline, X_train, y_train, scoring='precision')
best_et_cross_val_rec = cross_val_score(best_et_pipeline, X_train, y_train, scoring='recall')
best_et_cross_val_f1 = cross_val_score(best_et_pipeline, X_train, y_train, scoring='f1')

## KNeighbors

In [108]:
search_kn = GridSearchCV(kn_pipeline, kn_param_grid, n_jobs=-1)

search_kn.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f171e3250>),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f8f171e3af0>)])),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'kneighborsclassifier__n_neighbors': [2, 3],
                         'kneig

In [109]:
# we can check its best parameters
search_kn.best_params_

{'kneighborsclassifier__n_neighbors': 3,
 'kneighborsclassifier__p': 1,
 'kneighborsclassifier__weights': 'uniform'}

In [110]:
# assign best model to a variable using best_estimator_
best_kn_pipeline = search_kn.best_estimator_

In [111]:
# cross validation using the accuracy, precision, recall, and f1 score metric
best_kn_cross_val_acc = cross_val_score(best_kn_pipeline, X_train, y_train, scoring='accuracy')
best_kn_cross_val_prec = cross_val_score(best_kn_pipeline, X_train, y_train, scoring='precision')
best_kn_cross_val_rec = cross_val_score(best_kn_pipeline, X_train, y_train, scoring='recall')
best_kn_cross_val_f1 = cross_val_score(best_kn_pipeline, X_train, y_train, scoring='f1')

## Gaussian Naive Bayes

In [117]:
gaussian = GaussianNB()
ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
gaussian.fit(X_train_sc, y_train)
y_hat_train_gnb = gaussian.predict(X_train_sc)

In [118]:
gnb_acc = accuracy_score(y_train, y_hat_train_gnb)
gnb_prec = precision_score(y_train, y_hat_train_gnb)
gnb_rec = recall_score(y_train, y_hat_train_gnb)
gnb_f1 = f1_score(y_train, y_hat_train_gnb)

## Comparison of Models

In [120]:
models = pd.DataFrame({
    'Model': ['LogisticRegression', 'DecisionTree', 'RandomForest', 
              'ExtraTrees', 'KNeighbors', 'Gaussian Naive Bayes'], 
    'Accuracy': [best_lr_cross_val_acc.mean(), best_dt_cross_val_acc.mean(), best_rf_cross_val_acc.mean(), 
               best_et_cross_val_acc.mean(), best_kn_cross_val_acc.mean(), gnb_acc.mean()], 
    'Precision': [best_lr_cross_val_prec.mean(), best_dt_cross_val_prec.mean(), best_rf_cross_val_prec.mean(), 
               best_et_cross_val_prec.mean(), best_kn_cross_val_prec.mean(), gnb_prec.mean()], 
    'Recall': [best_lr_cross_val_rec.mean(), best_dt_cross_val_rec.mean(), best_rf_cross_val_rec.mean(), 
               best_et_cross_val_rec.mean(), best_kn_cross_val_rec.mean(), gnb_rec.mean()],
    'F1 Score': [best_lr_cross_val_f1.mean(), best_dt_cross_val_f1.mean(), best_rf_cross_val_f1.mean(), 
               best_et_cross_val_f1.mean(), best_kn_cross_val_f1.mean(), gnb_f1.mean()]}) 

In [121]:
models.sort_values('Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,LogisticRegression,0.737456,0.767157,0.832603,0.798536
3,ExtraTrees,0.734702,0.790678,0.782694,0.786654
1,DecisionTree,0.733021,0.775938,0.805376,0.790366
2,RandomForest,0.72718,0.710705,0.950347,0.813229
4,KNeighbors,0.715079,0.767412,0.780723,0.774
5,Gaussian Naive Bayes,0.706136,0.794194,0.715068,0.752557


In [122]:
models.sort_values('Precision', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
5,Gaussian Naive Bayes,0.706136,0.794194,0.715068,0.752557
3,ExtraTrees,0.734702,0.790678,0.782694,0.786654
1,DecisionTree,0.733021,0.775938,0.805376,0.790366
4,KNeighbors,0.715079,0.767412,0.780723,0.774
0,LogisticRegression,0.737456,0.767157,0.832603,0.798536
2,RandomForest,0.72718,0.710705,0.950347,0.813229


In [123]:
models.sort_values('F1 Score', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
2,RandomForest,0.72718,0.710705,0.950347,0.813229
0,LogisticRegression,0.737456,0.767157,0.832603,0.798536
1,DecisionTree,0.733021,0.775938,0.805376,0.790366
3,ExtraTrees,0.734702,0.790678,0.782694,0.786654
4,KNeighbors,0.715079,0.767412,0.780723,0.774
5,Gaussian Naive Bayes,0.706136,0.794194,0.715068,0.752557


In [124]:
models.sort_values('Recall', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
2,RandomForest,0.72718,0.710705,0.950347,0.813229
0,LogisticRegression,0.737456,0.767157,0.832603,0.798536
1,DecisionTree,0.733021,0.775938,0.805376,0.790366
3,ExtraTrees,0.734702,0.790678,0.782694,0.786654
4,KNeighbors,0.715079,0.767412,0.780723,0.774
5,Gaussian Naive Bayes,0.706136,0.794194,0.715068,0.752557
