In [65]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from tabulate import tabulate

import warnings
import os
import sys
# Only silence warnings when the interpreter was started without an explicit -W option.
if len(sys.warnoptions) == 0:
    # Exported through the environment as well, presumably so that processes
    # spawned later (e.g. by n_jobs=-1) start with the same setting.
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")

# Seed NumPy's global (legacy) random generator so reruns draw the same numbers.
np.random.seed(42)

In [66]:
def show_scores(clf, X, y):
    """Print a confusion matrix and a set of classification metrics for ``clf``.

    Parameters
    ----------
    clf : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X : array-like
        Features passed to both ``predict`` and ``predict_proba``.
    y : array-like
        True labels the predictions are compared against.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    The table headers and the use of column 1 of ``predict_proba`` assume a
    binary 0/1 target (presumably column 1 is the positive class).
    """
    hard_labels = clf.predict(X)
    class_probs = clf.predict_proba(X)

    matrix = confusion_matrix(y, hard_labels)
    print(tabulate(matrix, headers=['Predicted 0', 'Predicted 1'], tablefmt='orgtbl'))
    print()

    # Metrics computed from the hard class predictions, in display order.
    label_based = (
        ('accuracy:              ', accuracy_score),
        ('precision:             ', precision_score),
        ('recall:                ', recall_score),
        ('f1:                    ', f1_score),
        ('roc_auc_discrete:      ', roc_auc_score),
    )
    for caption, metric in label_based:
        print(f'{caption}{round(metric(y, hard_labels), 4)}')

    # Same AUC metric, but fed the continuous score from column 1 instead of hard labels.
    print(f'roc_auc_continuous:    {round(roc_auc_score(y, class_probs[:, 1]), 4)}')

# Decision tree

In [67]:
# Training split: features and labels are stored in separate CSV files.
X_train, y_train = (
    pd.read_csv("../preprocessed_data/X_train.csv"),
    pd.read_csv("../preprocessed_data/y_train.csv"),
)

# Validation split, same layout.
X_val, y_val = (
    pd.read_csv("../preprocessed_data/X_val.csv"),
    pd.read_csv("../preprocessed_data/y_val.csv"),
)

In [69]:
from sklearn.tree import DecisionTreeClassifier

# Base estimator handed to the hyper-parameter searches in the cells below;
# the fixed random_state keeps tree construction repeatable.
dct = DecisionTreeClassifier(random_state=42)

In [70]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over tree depth only: every integer from 1 to 99.
parameters = {"max_depth": np.arange(1, 100)}

depth_search = GridSearchCV(
    estimator=dct,
    param_grid=parameters,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
    n_jobs=-1,
)
# fit() returns the search object itself, so calling it separately is equivalent to chaining.
depth_search.fit(X_train, y_train)

In [71]:
# One line per evaluated candidate: mean cross-validated test score, then its parameters.
res = depth_search.cv_results_
for params, mean_score in zip(res["params"], res["mean_test_score"]):
    print(round(mean_score, 4), "   ", params)

0.895     {'max_depth': 1}
0.8993     {'max_depth': 2}
0.8995     {'max_depth': 3}
0.8992     {'max_depth': 4}
0.8992     {'max_depth': 5}
0.8986     {'max_depth': 6}
0.8975     {'max_depth': 7}
0.8969     {'max_depth': 8}
0.8942     {'max_depth': 9}
0.8922     {'max_depth': 10}
0.8897     {'max_depth': 11}
0.8868     {'max_depth': 12}
0.8834     {'max_depth': 13}
0.8796     {'max_depth': 14}
0.8754     {'max_depth': 15}
0.872     {'max_depth': 16}
0.868     {'max_depth': 17}
0.8633     {'max_depth': 18}
0.8585     {'max_depth': 19}
0.855     {'max_depth': 20}
0.8496     {'max_depth': 21}
0.8461     {'max_depth': 22}
0.8436     {'max_depth': 23}
0.8403     {'max_depth': 24}
0.8381     {'max_depth': 25}
0.8356     {'max_depth': 26}
0.834     {'max_depth': 27}
0.8319     {'max_depth': 28}
0.8288     {'max_depth': 29}
0.8294     {'max_depth': 30}
0.8279     {'max_depth': 31}
0.8264     {'max_depth': 32}
0.8235     {'max_depth': 33}
0.825     {'max_depth': 34}
0.8242     {'max_depth': 35}


In [72]:
# Report the best grid-search tree on the training split first, then on the validation split.
show_scores(clf=depth_search.best_estimator_, X=X_train, y=y_train)
show_scores(clf=depth_search.best_estimator_, X=X_val, y=y_val)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         44549 |           256 |
|          4872 |          1411 |

accuracy:              0.8996
precision:             0.8464
recall:                0.2246
f1:                    0.355
roc_auc_discrete:      0.6094
roc_auc_continuous:    0.7142
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         19073 |           129 |
|          2121 |           572 |

accuracy:              0.8972
precision:             0.816
recall:                0.2124
f1:                    0.3371
roc_auc_discrete:      0.6028
roc_auc_continuous:    0.7111


In [9]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from.
# NOTE(review): max_depth starts at 25, although the accuracy grid search above
# scored best at very shallow depths — confirm this range is intentional.
# NOTE(review): max_features goes up to 145; this assumes X_train has at least
# that many columns — TODO confirm.
parameters = {
    "max_depth": np.arange(25, 65, 3),
    "min_samples_split": np.arange(2, 8),
    "min_samples_leaf": np.arange(1, 20, 2),
    "max_features": np.arange(20, 150, 5),
}

rand_search = RandomizedSearchCV(
    estimator=dct,
    param_distributions=parameters,
    n_iter=100,
    scoring='roc_auc',
    cv=3,
    return_train_score=True,
    n_jobs=-1,
    random_state=42,
)
# fit() returns the search object itself, so calling it separately is equivalent to chaining.
rand_search.fit(X_train, y_train)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         19073 |           129 |
|          2121 |           572 |

accuracy:              0.8972
precision:             0.816
recall:                0.2124
f1:                    0.3371
roc_auc_discrete:      0.6028
roc_auc_continuous:    0.7111


In [10]:
# Report the best randomized-search tree on the training split first, then on the validation split.
show_scores(clf=rand_search.best_estimator_, X=X_train, y=y_train)
show_scores(clf=rand_search.best_estimator_, X=X_val, y=y_val)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         44396 |           409 |
|          4606 |          1677 |

accuracy:              0.9018
precision:             0.8039
recall:                0.2669
f1:                    0.4008
roc_auc_discrete:      0.6289
roc_auc_continuous:    0.8924
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         18936 |           266 |
|          2090 |           603 |

accuracy:              0.8924
precision:             0.6939
recall:                0.2239
f1:                    0.3386
roc_auc_discrete:      0.605
roc_auc_continuous:    0.6872
