In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from tabulate import tabulate

import warnings
import os
import sys
# Silence warnings for the rest of the session, but only when the interpreter
# was started without explicit -W options (sys.warnoptions is empty then).
if len(sys.warnoptions) == 0:
    # The environment variable is set in addition to the in-process filter;
    # presumably so that processes started from this kernel (e.g. parallel
    # workers) are silenced as well — TODO confirm that is the intent.
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")

# Seed NumPy's global random state.
np.random.seed(42)

In [20]:
def show_scores(clf, X, y):
    """Print a confusion matrix and summary metrics for ``clf`` evaluated on ``(X, y)``.

    Parameters
    ----------
    clf : estimator exposing ``predict`` and ``predict_proba``.
    X : feature data, passed unchanged to ``clf.predict`` and ``clf.predict_proba``.
    y : true labels that the predictions are scored against.

    Returns
    -------
    None. Everything is written to stdout.
    """
    hard_labels = clf.predict(X)
    # Only column index 1 of this output is used (for the last metric below).
    class_probs = clf.predict_proba(X)

    def scored(metric, estimate):
        # Evaluate one metric against the true labels, rounded to 4 decimals.
        return round(metric(y, estimate), 4)

    matrix = confusion_matrix(y, hard_labels)
    print(tabulate(matrix, headers=['Predicted 0', 'Predicted 1'], tablefmt='orgtbl'))
    print()
    print(f'accuracy:              {scored(accuracy_score, hard_labels)}')
    print(f'precision:             {scored(precision_score, hard_labels)}')
    print(f'recall:                {scored(recall_score, hard_labels)}')
    print(f'f1:                    {scored(f1_score, hard_labels)}')
    # ROC AUC is reported twice: from the hard predictions and from the scores.
    print(f'roc_auc_discrete:      {scored(roc_auc_score, hard_labels)}')
    print(f'roc_auc_continuous:    {scored(roc_auc_score, class_probs[:, 1])}')

# Logistic regression

In [21]:
# Training split: features and labels are stored in separate CSV files.
# NOTE(review): read_csv returns DataFrames, so y_train / y_val are frames
# rather than 1-D label vectors — confirm downstream code is fine with that.
X_train = pd.read_csv(filepath_or_buffer="../preprocessed_data/X_train.csv")
y_train = pd.read_csv(filepath_or_buffer="../preprocessed_data/y_train.csv")

# Validation split, loaded the same way.
X_val = pd.read_csv(filepath_or_buffer="../preprocessed_data/X_val.csv")
y_val = pd.read_csv(filepath_or_buffer="../preprocessed_data/y_val.csv")

In [22]:
from sklearn.linear_model import LogisticRegression

# Baseline model with library-default regularisation, fitted on the training
# split. Kept as one assignment so the cell produces no output.
log_reg = LogisticRegression(
    class_weight='balanced',
    random_state=42,
).fit(X=X_train, y=y_train)


In [23]:
# Score the baseline on the training split first, then on the validation split.
show_scores(clf=log_reg, X=X_train, y=y_train)
show_scores(clf=log_reg, X=X_val, y=y_val)


|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         34818 |          9987 |
|          2529 |          3754 |

accuracy:              0.755
precision:             0.2732
recall:                0.5975
f1:                    0.375
roc_auc_discrete:      0.6873
roc_auc_continuous:    0.7571
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         14807 |          4395 |
|          1102 |          1591 |

accuracy:              0.7489
precision:             0.2658
recall:                0.5908
f1:                    0.3666
roc_auc_discrete:      0.681
roc_auc_continuous:    0.75


In [15]:
from sklearn.model_selection import GridSearchCV

# Estimator that the grid search is run over.
clf = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)

# 20 logarithmically spaced values of C from 1e-6 to 1e2, each combined with
# both penalty types.
parameters = {
    'C': np.logspace(-6, 2, 20),
    'penalty': ['l1', 'l2'],
}

# 3-fold cross-validated search scored by balanced accuracy; the search is
# fitted on the training split in the same statement.
reg_search = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring='balanced_accuracy',
    cv=3,
    return_train_score=True,
    n_jobs=-1,
).fit(X_train, y_train)

In [18]:
# List the mean cross-validated test score next to every parameter
# combination the search evaluated.
res = reg_search.cv_results_
for mean_score, params in zip(
    res["mean_test_score"],
    res["params"],
):
    print(round(mean_score, 4), "   ", params)

0.5     {'C': 1e-06, 'penalty': 'l1'}
0.5482     {'C': 1e-06, 'penalty': 'l2'}
0.5     {'C': 2.6366508987303555e-06, 'penalty': 'l1'}
0.56     {'C': 2.6366508987303555e-06, 'penalty': 'l2'}
0.5     {'C': 6.951927961775606e-06, 'penalty': 'l1'}
0.5851     {'C': 6.951927961775606e-06, 'penalty': 'l2'}
0.5     {'C': 1.8329807108324375e-05, 'penalty': 'l1'}
0.6148     {'C': 1.8329807108324375e-05, 'penalty': 'l2'}
0.5     {'C': 4.8329302385717524e-05, 'penalty': 'l1'}
0.6303     {'C': 4.8329302385717524e-05, 'penalty': 'l2'}
0.5     {'C': 0.00012742749857031334, 'penalty': 'l1'}
0.648     {'C': 0.00012742749857031334, 'penalty': 'l2'}
0.59     {'C': 0.0003359818286283781, 'penalty': 'l1'}
0.662     {'C': 0.0003359818286283781, 'penalty': 'l2'}
0.6635     {'C': 0.0008858667904100823, 'penalty': 'l1'}
0.6756     {'C': 0.0008858667904100823, 'penalty': 'l2'}
0.6713     {'C': 0.002335721469090121, 'penalty': 'l1'}
0.6801     {'C': 0.002335721469090121, 'penalty': 'l2'}
0.6751     {'C': 0.00615

In [16]:
# Score the best estimator found by the grid search on the training split
# first, then on the validation split.
show_scores(clf=reg_search.best_estimator_, X=X_train, y=y_train)
show_scores(clf=reg_search.best_estimator_, X=X_val, y=y_val)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         34743 |         10062 |
|          2516 |          3767 |

accuracy:              0.7538
precision:             0.2724
recall:                0.5996
f1:                    0.3746
roc_auc_discrete:      0.6875
roc_auc_continuous:    0.7572
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         14762 |          4440 |
|          1096 |          1597 |

accuracy:              0.7472
precision:             0.2645
recall:                0.593
f1:                    0.3659
roc_auc_discrete:      0.6809
roc_auc_continuous:    0.7501
