In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from tabulate import tabulate

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

import warnings
import os
import sys
# Silence warnings only when none were configured explicitly:
# sys.warnoptions holds the -W / PYTHONWARNINGS options given at start-up.
if len(sys.warnoptions) == 0:
    # The setting is also exported through the environment.
    # NOTE(review): presumably so that worker subprocesses inherit it -- confirm.
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter(action="ignore")

# Seed NumPy's legacy global random generator.
np.random.seed(seed=42)

In [2]:
def show_scores(clf, X, y):
    """Print a confusion matrix and summary metrics for ``clf`` evaluated on ``(X, y)``.

    Parameters
    ----------
    clf : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X : feature data passed unchanged to ``clf.predict`` / ``clf.predict_proba``.
    y : true labels passed unchanged to the scikit-learn metric functions.
        The two fixed column headers assume a two-class problem -- TODO confirm.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    hard_labels = clf.predict(X)
    class_probs = clf.predict_proba(X)

    matrix = confusion_matrix(y, hard_labels)
    print(tabulate(matrix, headers=['Predicted 0', 'Predicted 1'], tablefmt='orgtbl'))
    print()

    # Each row pairs a pre-padded caption with a thunk, so a metric is only
    # computed immediately before its own line is printed.
    report = (
        ('accuracy:              ', lambda: accuracy_score(y, hard_labels)),
        ('precision:             ', lambda: precision_score(y, hard_labels)),
        ('recall:                ', lambda: recall_score(y, hard_labels)),
        ('f1:                    ', lambda: f1_score(y, hard_labels)),
        # AUC computed from the hard 0/1 predictions
        ('roc_auc_discrete:      ', lambda: roc_auc_score(y, hard_labels)),
        # AUC computed from the second predict_proba column
        ('roc_auc_continuous:    ', lambda: roc_auc_score(y, class_probs[:, 1])),
    )
    for caption, compute in report:
        print(f'{caption}{round(compute(), 4)}')

In [3]:
# Directory that holds the preprocessed train/validation splits (relative to this notebook).
DATA_DIR = "../preprocessed_data"

X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
# scikit-learn expects a 1-D target: squeeze the single-column frame into a Series
# instead of passing an (n, 1) DataFrame (which triggers a DataConversionWarning).
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv").squeeze("columns")

X_val = pd.read_csv(f"{DATA_DIR}/X_val.csv")
y_val = pd.read_csv(f"{DATA_DIR}/y_val.csv").squeeze("columns")

# Fail fast on malformed or misaligned splits rather than deep inside model fitting.
assert y_train.ndim == 1 and y_val.ndim == 1, "target CSVs must contain exactly one column"
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_val) == len(y_val), "X_val / y_val row counts differ"
assert X_train.shape[1] == X_val.shape[1], "train / validation feature counts differ"

In [4]:
# Base estimators for the ensemble. All share random_state=42.
# NOTE(review): the hyperparameter values look like the result of earlier tuning -- confirm their origin.

# Logistic regression with class re-weighting
log_reg = LogisticRegression(
    solver='liblinear',
    C=14.38,
    class_weight='balanced',
    random_state=42,
)

# Random forest, fitted on all available cores (n_jobs=-1)
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=16,
    n_jobs=-1,
    random_state=42,
)

# Gradient-boosted trees (XGBoost)
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.3,
    gamma=5,
    subsample=1,
    colsample_bytree=0.8,
    random_state=42,
)

# Single decision tree
dct = DecisionTreeClassifier(
    max_depth=64,
    min_samples_split=7,
    min_samples_leaf=19,
    max_features=70,
    random_state=42,
)

In [5]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate soft-voting weight vectors, one weight per entry of `estimators`
# (same order): the first two weights range over 0-4, the last two over 5-9.
# That yields 5 * 5 * 5 * 5 = 625 candidates, of which the search samples 100.
parameters = {
    'weights': [
        [i, j, k, l]
        for i in range(5)
        for j in range(5)
        for k in range(5, 10)
        for l in range(5, 10)
    ]
}

estimators = [
    ('DCT', dct),
    ('LR', log_reg),
    ('RandomForest', rfc),
    ('XGB', xgb_clf),
]

# Soft voting combines the estimators' predicted class probabilities using the weights.
voting_clf = VotingClassifier(estimators=estimators, voting='soft')

# NOTE(review): candidates are ranked by accuracy; the confusion matrices reported
# further down show far fewer positives than negatives -- confirm accuracy is the
# intended selection metric.
rand_search = RandomizedSearchCV(
    voting_clf,
    param_distributions=parameters,
    n_iter=100,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
    n_jobs=1,
    random_state=42,
).fit(X_train, y_train)

In [11]:
# Display the sampled weight vector that achieved the best mean cross-validated score.
rand_search.best_params_

{'weights': [3, 0, 9, 5]}

In [7]:
# List every sampled weight vector next to its mean cross-validated test score.
res = rand_search.cv_results_
mean_scores, candidates = res["mean_test_score"], res["params"]
for mean_score, params in zip(mean_scores, candidates):
    # Five-space separator reproduces the original column layout.
    print(round(mean_score, 4), params, sep="     ")

0.8987     {'weights': [3, 2, 9, 7]}
0.8975     {'weights': [3, 4, 7, 5]}
0.898     {'weights': [1, 3, 8, 5]}
0.8981     {'weights': [1, 3, 7, 7]}
0.8975     {'weights': [3, 4, 6, 5]}
0.8989     {'weights': [4, 0, 8, 5]}
0.8985     {'weights': [1, 1, 7, 8]}
0.8978     {'weights': [0, 3, 5, 7]}
0.8984     {'weights': [4, 1, 5, 7]}
0.8989     {'weights': [1, 0, 9, 5]}
0.8989     {'weights': [2, 0, 8, 5]}
0.8985     {'weights': [0, 1, 6, 6]}
0.8984     {'weights': [0, 2, 6, 5]}
0.8982     {'weights': [0, 3, 8, 5]}
0.8984     {'weights': [4, 2, 6, 7]}
0.8978     {'weights': [0, 3, 5, 6]}
0.8988     {'weights': [0, 0, 5, 7]}
0.898     {'weights': [2, 4, 8, 8]}
0.899     {'weights': [3, 0, 9, 5]}
0.8987     {'weights': [2, 1, 7, 6]}
0.8988     {'weights': [1, 0, 6, 6]}
0.8989     {'weights': [1, 0, 9, 8]}
0.8986     {'weights': [0, 1, 9, 9]}
0.8979     {'weights': [4, 4, 9, 6]}
0.8985     {'weights': [3, 2, 7, 7]}
0.8972     {'weights': [0, 4, 7, 5]}
0.8988     {'weights': [2, 0, 6, 7]}
0.89

In [8]:
# Report the refitted best ensemble on the training split first, then on the validation split.
best_clf = rand_search.best_estimator_
for features, target in ((X_train, y_train), (X_val, y_val)):
    show_scores(best_clf, features, target)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         44680 |           115 |
|          4696 |          1597 |

accuracy:              0.9058
precision:             0.9328
recall:                0.2538
f1:                    0.399
roc_auc_discrete:      0.6256
roc_auc_continuous:    0.9484
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         19095 |           117 |
|          2051 |           632 |

accuracy:              0.901
precision:             0.8438
recall:                0.2356
f1:                    0.3683
roc_auc_discrete:      0.6147
roc_auc_continuous:    0.7579
