In [1]:
from libraries import *

input_train = pd.read_pickle('input_train.pkl')
input_test = pd.read_pickle('input_test.pkl')
target_train = pd.read_pickle('target_train.pkl')
target_test = pd.read_pickle('target_test')

# Model Pipelines

In [None]:
from sklearn.pipeline import make_pipeline # to set up the steps you follow with each model
from sklearn.preprocessing import StandardScaler # to undo the influence of magnitude of large values

from sklearn.linear_model import LogisticRegression # L1 and L2
from sklearn.ensemble import RandomForestClassifier # rf
from sklearn.ensemble import GradientBoostingClassifier # gb

In [None]:
pipelines = {
    'l1' : make_pipeline(StandardScaler(), LogisticRegression(penalty= 'l1', random_state= 1, solver='liblinear')),
    'l2' : make_pipeline(StandardScaler(), LogisticRegression(penalty= 'l2', random_state= 1, solver='liblinear')),
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state= 1)),
    'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state= 1))
}

In [None]:
# verify pipeline

for key, value in pipelines.items():
    print(key, type(value))

l1 <class 'sklearn.pipeline.Pipeline'>
l2 <class 'sklearn.pipeline.Pipeline'>
rf <class 'sklearn.pipeline.Pipeline'>
gb <class 'sklearn.pipeline.Pipeline'>


# Hyperparameter Tuning

In [None]:
# create hyperparameter grids

l1_hyperparameters = {
    'logisticregression__C' : [0.1, 1, 10]
}

l2_hyperparameters = {
    'logisticregression__C' : [0.1, 1, 10]
}

rf_hyperparameters = {
    'randomforestclassifier__n_estimators' : [100, 200],
    'randomforestclassifier__max_features' : [None, 0.3, 0.6]
}

gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators' : [100, 200],
    'gradientboostingclassifier__learning_rate' : [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth' : [1, 3, 5]
}

In [None]:
hyperparameters = {
    'l1' : l1_hyperparameters,
    'l2' : l2_hyperparameters,
    'rf' : rf_hyperparameters,
    'gb' : gb_hyperparameters
}

In [None]:
for key, value in hyperparameters.items():
    print('Key:', key, 'Value:', value)

Key: l1 Value: {'logisticregression__C': [0.1, 1, 10]}
Key: l2 Value: {'logisticregression__C': [0.1, 1, 10]}
Key: rf Value: {'randomforestclassifier__n_estimators': [100, 200], 'randomforestclassifier__max_features': [None, 0.3, 0.6]}
Key: gb Value: {'gradientboostingclassifier__n_estimators': [100, 200], 'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2], 'gradientboostingclassifier__max_depth': [1, 3, 5]}


In [None]:
# verify HPG were successfully created

for key in ['l1', 'l2', 'rf', 'gb']:
    if key in hyperparameters:
        if type(hyperparameters[key]) is dict :
            print(key, 'was found, and it is a grid.')
        else:
            print(key, 'was found, but is not a grid.')
    else:
        print(key, 'was not found.')


l1 was found, and it is a grid.
l2 was found, and it is a grid.
rf was found, and it is a grid.
gb was found, and it is a grid.


# Cross Validation

In [None]:
# create untrained model
# train model and tune hyperparameters
# make predictions
# compare actual target values

In [None]:
from sklearn.model_selection import GridSearchCV

# GridSearchCV finds the best hyperparameters for each model

## Setting up untrained models

In [None]:
# create an empty dictionary named models

models = {}

for key in pipelines.keys():
    models[key] = GridSearchCV(pipelines[key], hyperparameters[key], cv= 5)

print(models.keys())

dict_keys(['l1', 'l2', 'rf', 'gb'])


# Train models

In [None]:
for model_name in models.keys():
    models[model_name].fit(input_train, target_train)
    print(model_name, 'is trained and tuned.')

l1 is trained and tuned.
l2 is trained and tuned.
rf is trained and tuned.
gb is trained and tuned.


# Model Evaluation

R**2 asks the question how much of the variance in the predicted result can be explained by the input variables? \
This is not the question asked in a binary classifier. \
To mark an email as spam with 99% accuaracy is not helpful, because even fewer emails are spam. \
The question is 'how good is this model at distinguishing a positive from a negative case?' \
This value can be presented as AUC, area under the curve that indicates ratio of true positive to false positives, for example 0.9645

In [None]:
from sklearn.metrics import confusion_matrix

for model_name in models:
    print(model_name)
    pred = models[model_name].predict(input_test)
    print(confusion_matrix(target_test, pred))
    print('---')

l1
[[1124   22]
 [  23  278]]
---
l2
[[1123   23]
 [  23  278]]
---
rf
[[1137    9]
 [  22  279]]
---
gb
[[1136   10]
 [  16  285]]
---


In [None]:
# Area under the curve

from sklearn.metrics import roc_curve, auc

for model_name in models:
    prod = models[model_name].predict(input_test)
    fpr, tpr, thresholds = roc_curve(target_test, prod)
    print(model_name)
    print('AUROC = ', round(auc(fpr,tpr), 4))
    print('---')

l1
AUROC =  0.9522
---
l2
AUROC =  0.9518
---
rf
AUROC =  0.9595
---
gb
AUROC =  0.9691
---
