In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from tabulate import tabulate

import warnings
import os
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

np.random.seed(42)

In [2]:
def show_scores(clf, X, y):
    y_pred = clf.predict(X)
    y_pred_prob = clf.predict_proba(X)
    print(tabulate(confusion_matrix(y, y_pred), headers=['Predicted 0', 'Predicted 1'], tablefmt='orgtbl'))
    print()
    print(f'accuracy:              {round(accuracy_score(y, y_pred), 4)}')
    print(f'precision:             {round(precision_score(y, y_pred), 4)}')
    print(f'recall:                {round(recall_score(y, y_pred), 4)}')
    print(f'f1:                    {round(f1_score(y, y_pred), 4)}')
    print(f'roc_auc_discrete:      {round(roc_auc_score(y, y_pred), 4)}')
    print(f'roc_auc_continuous:    {round(roc_auc_score(y, y_pred_prob[:, 1]), 4)}')

# Voting

In [3]:
X_train = pd.read_csv("../preprocessed_data/X_train.csv")
y_train = pd.read_csv("../preprocessed_data/y_train.csv")

X_val  = pd.read_csv("../preprocessed_data/X_val.csv")
y_val  = pd.read_csv("../preprocessed_data/y_val.csv")

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [5]:
log_reg = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', C=14.38)
dct = DecisionTreeClassifier(random_state=42, class_weight='balanced')
rfc = RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=16,random_state=42)
xgb_clf = xgb.XGBClassifier(random_state=42, subsample=1, n_estimators=100, max_depth=3, learning_rate=0.3, gamma=5, colsample_bytree=0.8)

In [6]:
estimators=[('LR', log_reg), ('RandomForest', rfc), ('XGB', xgb_clf)]
vc = VotingClassifier(estimators=estimators, voting='soft').fit(X_train, y_train)

In [7]:
from sklearn.model_selection import RandomizedSearchCV

parameters = dict(weights=[[i, j, k] for i in range(5) for j in range(10) for k in range (10)])

rand_search = RandomizedSearchCV(vc, scoring='accuracy', cv=3, return_train_score=True, param_distributions=parameters, n_iter=100, n_jobs=-1, random_state=42).fit(X_train, y_train)

In [10]:
rand_search.best_params_

{'weights': [0, 7, 3]}

In [8]:
res = rand_search.cv_results_
for mean_score, params in zip(res["mean_test_score"], res["params"]):
    print(round(mean_score, 4), "   ", params)

0.8963     {'weights': [3, 6, 1]}
0.8991     {'weights': [0, 7, 3]}
0.8977     {'weights': [3, 7, 4]}
0.8984     {'weights': [1, 5, 5]}
0.8974     {'weights': [1, 0, 4]}
0.898     {'weights': [3, 9, 4]}
0.8981     {'weights': [3, 7, 7]}
0.8981     {'weights': [1, 2, 4]}
0.8988     {'weights': [0, 6, 8]}
0.8903     {'weights': [4, 5, 0]}
0.8984     {'weights': [0, 0, 9]}
0.8988     {'weights': [1, 9, 4]}
0.8933     {'weights': [4, 0, 6]}
0.8991     {'weights': [0, 8, 4]}
0.8969     {'weights': [3, 7, 1]}
0.8982     {'weights': [3, 8, 8]}
0.8975     {'weights': [4, 9, 5]}
0.8991     {'weights': [0, 3, 0]}
0.8958     {'weights': [3, 1, 6]}
0.895     {'weights': [4, 0, 8]}
0.8961     {'weights': [4, 9, 0]}
0.8965     {'weights': [4, 9, 1]}
0.8976     {'weights': [2, 8, 0]}
0.8977     {'weights': [3, 5, 6]}
0.899     {'weights': [0, 7, 6]}
0.8945     {'weights': [4, 6, 1]}
0.8978     {'weights': [4, 9, 7]}
0.8862     {'weights': [2, 1, 1]}
0.8861     {'weights': [1, 0, 1]}
0.8962     {'weig

In [9]:
show_scores(rand_search.best_estimator_, X_train, y_train)
show_scores(rand_search.best_estimator_, X_val, y_val)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         44710 |            85 |
|          4653 |          1640 |

accuracy:              0.9073
precision:             0.9507
recall:                0.2606
f1:                    0.4091
roc_auc_discrete:      0.6294
roc_auc_continuous:    0.9458
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         19081 |           131 |
|          2048 |           635 |

accuracy:              0.9005
precision:             0.829
recall:                0.2367
f1:                    0.3682
roc_auc_discrete:      0.6149
roc_auc_continuous:    0.7527
