## Import Libraries

In [None]:
# importing data
import pandas as pd

# train_test_split & cross validation
from sklearn.model_selection import train_test_split, cross_val_score

# creating pipelines
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

# creating models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier


import numpy as np

In [None]:
import pandas as pd

In [None]:
!pip install sklearn

## Import Data

In [None]:
# Load the pickled object from the working directory (it is used as a DataFrame below).
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer='run_pass_df.pkl')

## Prepare Data

In [None]:
# separate the label column ('Target') from the feature columns before splitting
y = df['Target']
X = df.drop(columns='Target')

In [None]:
# hold out 20% of the rows as a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2020,
)

In [None]:
# shapes of the four splits, in the order X_train, X_test, y_train, y_test
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

## Making Pipelines

In [None]:
# let's create a pipeline to do all of our preprocessing for us

In [None]:
# Column-wise preprocessing: one-hot encode the object-dtype columns and standardize the
# numeric columns.
#
# SMOTE used to be passed as a third element of the StandardScaler tuple.
# make_column_transformer only takes (transformer, columns) pairs, and SMOTE is a resampler
# rather than a column transformer, so it was never applied there. Resampling has to be a
# separate step of an imblearn pipeline.
#
# handle_unknown='ignore' stops cross-validation from failing when a validation fold contains
# a category that was absent from the corresponding training fold.
preprocessing = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
)

preprocessing

In [None]:
# fit the preprocessing step on the training features and display the transformed matrix
X_train_preprocessed = preprocessing.fit_transform(X_train)
X_train_preprocessed

In [None]:
# Build one imblearn pipeline per candidate model. Each pipeline runs the shared preprocessing
# step, then SMOTE, then the classifier.
# SMOTE is placed here as its own step because an imblearn pipeline applies samplers only
# during fit(): training data is oversampled, while validation folds and the test set are
# scored on their original class distribution.
# The SMOTE seed is fixed so repeated runs give the same synthetic samples.

dt_pipeline = make_pipeline(preprocessing, SMOTE(random_state=2021), DecisionTreeClassifier(random_state=2021))
rf_pipeline = make_pipeline(preprocessing, SMOTE(random_state=2021), RandomForestClassifier(random_state=2021))
lr_pipeline = make_pipeline(preprocessing, SMOTE(random_state=2021), LogisticRegression(random_state=2021))
et_pipeline = make_pipeline(preprocessing, SMOTE(random_state=2021), ExtraTreesClassifier(random_state=2021))
kn_pipeline = make_pipeline(preprocessing, SMOTE(random_state=2021), KNeighborsClassifier())

## Create Param_Grids

In [None]:
# different param_grids for each pipeline

# Search space for the decision tree pipeline.
dt_param_grid = {
    'decisiontreeclassifier__criterion': ['entropy', 'gini'],
    'decisiontreeclassifier__splitter': ['best', 'random'],
    'decisiontreeclassifier__max_depth': [2, 5, 10],
    # 'auto' was only an alias of 'sqrt' for classifiers and has been removed from
    # scikit-learn, so it is dropped rather than searched twice
    'decisiontreeclassifier__max_features': ['sqrt', 'log2'],
    # "no class weighting" is the None object; the string 'none' is not a valid value
    'decisiontreeclassifier__class_weight': [None, 'balanced'],
}

# Search space for the random forest pipeline: number of trees and maximum tree depth.
rf_param_grid = dict(
    randomforestclassifier__n_estimators=[100, 1000, 2000],
    randomforestclassifier__max_depth=[2, 5, 10],
)

# Search space for the logistic regression pipeline, written as a list of sub-grids so that
# every candidate is a solver/penalty pair scikit-learn accepts:
#   - newton-cg, lbfgs and sag support only the l2 penalty
#   - liblinear and saga also support l1
#   - elasticnet requires the saga solver and an l1_ratio
# Removed from the original single grid:
#   - n_jobs: a runtime setting, not a hyperparameter; it only multiplied the number of fits
#   - dual / multi_class: most combinations with the solvers above are rejected at fit time
#   - penalty 'none': that string is no longer accepted by recent scikit-learn releases
#     (add a very large C to a sub-grid to approximate an unregularized model)
lr_param_grid = [
    {
        'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag'],
        'logisticregression__penalty': ['l2'],
        'logisticregression__C': [0.01, 0.1, 0.5],
    },
    {
        'logisticregression__solver': ['liblinear', 'saga'],
        'logisticregression__penalty': ['l1', 'l2'],
        'logisticregression__C': [0.01, 0.1, 0.5],
    },
    {
        'logisticregression__solver': ['saga'],
        'logisticregression__penalty': ['elasticnet'],
        'logisticregression__l1_ratio': [0.5],
        'logisticregression__C': [0.01, 0.1, 0.5],
    },
]

# Search space for the extra-trees pipeline.
et_param_grid = {
    'extratreesclassifier__criterion': ['entropy', 'gini'],
    'extratreesclassifier__max_depth': [2, 5, 10],
    'extratreesclassifier__n_estimators': [100, 250, 500],
    # 'auto' was only an alias of 'sqrt' for classifiers and has been removed from
    # scikit-learn, so it is dropped rather than searched twice
    'extratreesclassifier__max_features': ['sqrt', 'log2'],
    # "no class weighting" is the None object; the string 'none' is not a valid value
    'extratreesclassifier__class_weight': [None, 'balanced'],
}

# Search space for the k-nearest-neighbours pipeline.
# Only settings that can change predictions are searched:
#   - metric was dropped: 'manhattan' is the same distance as the default minkowski metric
#     with p=1, so searching metric and p together evaluated identical models repeatedly
#   - algorithm / leaf_size were dropped: they select how the neighbour lookup is carried out
#     (speed and memory), and multiplied the grid by 12
kn_param_grid = {
    'kneighborsclassifier__n_neighbors': [5, 10, 20],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    # p=1 -> manhattan distance, p=2 -> euclidean distance
    'kneighborsclassifier__p': [1, 2],
}

## LogisticRegression

In [None]:
# exhaustive grid search over the logistic regression pipeline, using every available core
search_lr = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_param_grid,
    n_jobs=-1,
)
search_lr.fit(X_train, y_train)

In [None]:
# show the parameter combination the logistic regression grid search selected as best
search_lr.best_params_

In [None]:
# keep the winning logistic regression pipeline (best_estimator_ of the grid search)
# so the cells below can cross-validate it
best_lr_pipeline = search_lr.best_estimator_

In [None]:
# cross-validate the tuned logistic regression pipeline once per metric;
# each variable receives the array of per-fold scores for its metric
(
    best_lr_cross_val_acc,
    best_lr_cross_val_prec,
    best_lr_cross_val_rec,
    best_lr_cross_val_f1,
) = (
    cross_val_score(best_lr_pipeline, X_train, y_train, scoring=metric)
    for metric in ('accuracy', 'precision', 'recall', 'f1')
)

## DecisionTree

In [None]:
# exhaustive grid search over the decision tree pipeline, using every available core
search_dt = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_param_grid,
    n_jobs=-1,
)
search_dt.fit(X_train, y_train)

In [None]:
# show the parameter combination the decision tree grid search selected as best
search_dt.best_params_

In [None]:
# keep the winning decision tree pipeline (best_estimator_ of the grid search)
# so the cells below can cross-validate it
best_dt_pipeline = search_dt.best_estimator_

In [None]:
# cross-validate the tuned decision tree pipeline once per metric;
# each variable receives the array of per-fold scores for its metric
(
    best_dt_cross_val_acc,
    best_dt_cross_val_prec,
    best_dt_cross_val_rec,
    best_dt_cross_val_f1,
) = (
    cross_val_score(best_dt_pipeline, X_train, y_train, scoring=metric)
    for metric in ('accuracy', 'precision', 'recall', 'f1')
)

## RandomForest

In [None]:
# exhaustive grid search over the random forest pipeline, using every available core
search_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    n_jobs=-1,
)
search_rf.fit(X_train, y_train)

In [None]:
# show the parameter combination the random forest grid search selected as best
search_rf.best_params_

In [None]:
# keep the winning random forest pipeline (best_estimator_ of the grid search)
# so the cells below can cross-validate it
best_rf_pipeline = search_rf.best_estimator_

In [None]:
# cross-validate the tuned random forest pipeline once per metric;
# each variable receives the array of per-fold scores for its metric
(
    best_rf_cross_val_acc,
    best_rf_cross_val_prec,
    best_rf_cross_val_rec,
    best_rf_cross_val_f1,
) = (
    cross_val_score(best_rf_pipeline, X_train, y_train, scoring=metric)
    for metric in ('accuracy', 'precision', 'recall', 'f1')
)

## ExtraTrees

In [None]:
# exhaustive grid search over the extra-trees pipeline, using every available core
search_et = GridSearchCV(
    estimator=et_pipeline,
    param_grid=et_param_grid,
    n_jobs=-1,
)
search_et.fit(X_train, y_train)

In [None]:
# show the parameter combination the extra-trees grid search selected as best
search_et.best_params_

In [None]:
# keep the winning extra-trees pipeline (best_estimator_ of the grid search)
# so the cells below can cross-validate it
best_et_pipeline = search_et.best_estimator_

In [None]:
# per-fold F1 scores of the tuned extra-trees pipeline
best_et_cross_val = cross_val_score(
    estimator=best_et_pipeline,
    X=X_train,
    y=y_train,
    scoring='f1',
)

In [None]:
# cross-validate the tuned extra-trees pipeline once per metric;
# each variable receives the array of per-fold scores for its metric
(
    best_et_cross_val_acc,
    best_et_cross_val_prec,
    best_et_cross_val_rec,
    best_et_cross_val_f1,
) = (
    cross_val_score(best_et_pipeline, X_train, y_train, scoring=metric)
    for metric in ('accuracy', 'precision', 'recall', 'f1')
)

## KNeighbors

In [None]:
# cross validation using the f1 score metric
best_kn_cross_val = cross_val_score(best_kn_pipeline, X_train, y_train, scoring='f1')

In [None]:
search_kn = GridSearchCV(kn_pipeline, kn_param_grid, n_jobs=-1)

search_kn.fit(X_train, y_train)

In [None]:
# we can check its best parameters
search_kn.best_params_

In [None]:
# assign best model to a variable using best_estimator_
best_kn_pipeline = search_kn.best_estimator_

In [None]:
# cross-validate the tuned k-nearest-neighbours pipeline once per metric;
# each variable receives the array of per-fold scores for its metric
(
    best_kn_cross_val_acc,
    best_kn_cross_val_prec,
    best_kn_cross_val_rec,
    best_kn_cross_val_f1,
) = (
    cross_val_score(best_kn_pipeline, X_train, y_train, scoring=metric)
    for metric in ('accuracy', 'precision', 'recall', 'f1')
)

## Comparison of Models

In [None]:
# Scratch example of building a DataFrame from a dict of columns.
# The pasted doctest prompts (`>>>`) were removed, and the result is no longer bound to `df`,
# which holds the dataset loaded at the top of the notebook and must not be overwritten.
# NOTE(review): nothing visible in the notebook uses this example; consider deleting the cell.
example_data = {'col1': [1, 2], 'col2': [3, 4]}
example_df = pd.DataFrame(data=example_data)

In [None]:
# Mean F1 over the cross-validation folds for every tuned model.
# The per-fold arrays are the best_*_cross_val_f1 variables computed in the model sections.
# The previous version referenced best_lr/dt/rf_cross_val names that are never defined, and
# left ExtraTrees and KNeighbors as None.
mean_f1_scores = {'LogisticRegression': best_lr_cross_val_f1.mean(),
                  'DecisionTree': best_dt_cross_val_f1.mean(),
                  'RandomForest': best_rf_cross_val_f1.mean(),
                  'ExtraTrees': best_et_cross_val_f1.mean(),
                  'KNeighbors': best_kn_cross_val_f1.mean()}

# One row per model with a single 'Mean F1' column. A dict cannot be indexed with [0] / [1],
# so the model names (the dict keys) become the index instead.
f1_df = pd.DataFrame({'Mean F1': mean_f1_scores})

In [None]:
# display the model-name -> mean F1 mapping built in the previous cell
mean_f1_scores

In [None]:
# Summary table: one row per tuned model, one column per metric.
# Two fixes compared with the previous version:
#   - the logistic regression scores are stored in best_lr_* variables (best_lg_* never existed)
#   - every cell holds the mean over the cross-validation folds; the raw per-fold arrays
#     cannot be ordered by sort_values
cv_scores_by_model = {
    'LogisticRegression': (best_lr_cross_val_acc, best_lr_cross_val_prec,
                           best_lr_cross_val_rec, best_lr_cross_val_f1),
    'DecisionTree': (best_dt_cross_val_acc, best_dt_cross_val_prec,
                     best_dt_cross_val_rec, best_dt_cross_val_f1),
    'RandomForest': (best_rf_cross_val_acc, best_rf_cross_val_prec,
                     best_rf_cross_val_rec, best_rf_cross_val_f1),
    'ExtraTrees': (best_et_cross_val_acc, best_et_cross_val_prec,
                   best_et_cross_val_rec, best_et_cross_val_f1),
    'KNeighbors': (best_kn_cross_val_acc, best_kn_cross_val_prec,
                   best_kn_cross_val_rec, best_kn_cross_val_f1),
}

models = pd.DataFrame(
    [
        [model_name, *(fold_scores.mean() for fold_scores in metric_arrays)]
        for model_name, metric_arrays in cv_scores_by_model.items()
    ],
    # column order matches the (acc, prec, rec, f1) order of the tuples above
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

In [None]:
# rank the models from highest to lowest Accuracy
models.sort_values(by=['Accuracy'], ascending=[False])

In [None]:
# rank the models from highest to lowest Precision
models.sort_values(by=['Precision'], ascending=[False])

In [None]:
# rank the models from highest to lowest F1 Score
models.sort_values(by=['F1 Score'], ascending=[False])

In [None]:
# rank the models from highest to lowest Recall
models.sort_values(by=['Recall'], ascending=[False])