In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from tabulate import tabulate

import warnings
import os
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

np.random.seed(42)

In [3]:
def show_scores(clf, X, y):
    y_pred = clf.predict(X)
    y_pred_prob = clf.predict_proba(X)
    print(tabulate(confusion_matrix(y, y_pred), headers=['Predicted 0', 'Predicted 1'], tablefmt='orgtbl'))
    print()
    print(f'accuracy:              {round(accuracy_score(y, y_pred), 4)}')
    print(f'precision:             {round(precision_score(y, y_pred), 4)}')
    print(f'recall:                {round(recall_score(y, y_pred), 4)}')
    print(f'f1:                    {round(f1_score(y, y_pred), 4)}')
    print(f'roc_auc_discrete:      {round(roc_auc_score(y, y_pred), 4)}')
    print(f'roc_auc_continuous:    {round(roc_auc_score(y, y_pred_prob[:, 1]), 4)}')

# Voting

In [4]:
X_train = pd.read_csv("../preprocessed_data/X_train.csv")
y_train = pd.read_csv("../preprocessed_data/y_train.csv")

X_val  = pd.read_csv("../preprocessed_data/X_val.csv")
y_val  = pd.read_csv("../preprocessed_data/y_val.csv")

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [51]:
log_reg = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', C=14.38)
dct = DecisionTreeClassifier(random_state=42, class_weight='balanced')
rfc = RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=16,random_state=42)
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=10, weights='distance')
xgb_clf = xgb.XGBClassifier(random_state=42, subsample=1, n_estimators=100, max_depth=3, learning_rate=0.3, gamma=5, colsample_bytree=0.8)

In [78]:
estimators=[('LR', log_reg), ('RandomForest', rfc), ('XGB', xgb_clf)]
vc = VotingClassifier(estimators=estimators, voting='soft').fit(X_train, y_train)

In [82]:
from sklearn.model_selection import RandomizedSearchCV

parameters = dict(weights=[[i, j, k] for i in range(5) for j in range(10) for k in range (10)])

rand_search = RandomizedSearchCV(vc, scoring='accuracy', cv=3, return_train_score=True, param_distributions=parameters, n_iter=100, n_jobs=-1, random_state=42).fit(X_train, y_train)

In [85]:
res = rand_search.cv_results_
for mean_score, params in zip(res["mean_test_score"], res["params"]):
    print(round(mean_score, 4), "   ", params)

0.8971     {'weights': [3, 6, 1]}
0.8997     {'weights': [0, 7, 3]}
0.8987     {'weights': [3, 7, 4]}
0.8994     {'weights': [1, 5, 5]}
0.8988     {'weights': [1, 0, 4]}
0.8988     {'weights': [3, 9, 4]}
0.8989     {'weights': [3, 7, 7]}
0.8991     {'weights': [1, 2, 4]}
0.8995     {'weights': [0, 6, 8]}
0.8911     {'weights': [4, 5, 0]}
0.8995     {'weights': [0, 0, 9]}
0.8995     {'weights': [1, 9, 4]}
0.894     {'weights': [4, 0, 6]}
0.8996     {'weights': [0, 8, 4]}
0.8975     {'weights': [3, 7, 1]}
0.899     {'weights': [3, 8, 8]}
0.8986     {'weights': [4, 9, 5]}
0.8995     {'weights': [0, 3, 0]}
0.8976     {'weights': [3, 1, 6]}
0.8963     {'weights': [4, 0, 8]}
0.8967     {'weights': [4, 9, 0]}
0.8973     {'weights': [4, 9, 1]}
0.8987     {'weights': [2, 8, 0]}
0.8987     {'weights': [3, 5, 6]}
0.8996     {'weights': [0, 7, 6]}
0.8954     {'weights': [4, 6, 1]}
0.8989     {'weights': [4, 9, 7]}
0.8881     {'weights': [2, 1, 1]}
0.8881     {'weights': [1, 0, 1]}
0.8973     {'wei

In [84]:
show_scores(rand_search.best_estimator_, X_train, y_train)
show_scores(rand_search.best_estimator_, X_val, y_val)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         44749 |            56 |
|          4533 |          1750 |

accuracy:              0.9102
precision:             0.969
recall:                0.2785
f1:                    0.4327
roc_auc_discrete:      0.6386
roc_auc_continuous:    0.9603
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         19076 |           126 |
|          2110 |           583 |

accuracy:              0.8979
precision:             0.8223
recall:                0.2165
f1:                    0.3427
roc_auc_discrete:      0.605
roc_auc_continuous:    0.7557
