In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from tabulate import tabulate

import warnings
import os
import sys
# Fix the legacy global NumPy seed so NumPy-based randomness is repeatable.
np.random.seed(42)

# Only silence warnings when the user did not ask for specific warning
# behaviour on the command line (-W ...), in which case sys.warnoptions is empty.
if not sys.warnoptions:
    # Exported through the environment so that child processes inherit the setting.
    os.environ["PYTHONWARNINGS"] = "ignore"
    # Silence warnings in the current interpreter as well.
    warnings.simplefilter("ignore")

In [3]:
def show_scores(clf, X, y):
    """Print a confusion-matrix table and summary metrics for a fitted classifier.

    Parameters
    ----------
    clf : object exposing ``predict`` and ``predict_proba``.
    X : features, handed to ``clf`` unchanged.
    y : reference labels the predictions are compared against.

    Returns
    -------
    None. Everything is written to stdout.

    Notes
    -----
    ``roc_auc_discrete`` is computed from the hard predictions, while
    ``roc_auc_continuous`` uses column 1 of ``predict_proba`` as the score.
    """
    hard_labels = clf.predict(X)
    class_proba = clf.predict_proba(X)

    matrix = confusion_matrix(y, hard_labels)
    print(tabulate(matrix, headers=['Predicted 0', 'Predicted 1'], tablefmt='orgtbl'))
    print()

    # Metrics that are all evaluated on the hard label predictions, in print order.
    label_based_metrics = (
        ('accuracy:              ', accuracy_score),
        ('precision:             ', precision_score),
        ('recall:                ', recall_score),
        ('f1:                    ', f1_score),
        ('roc_auc_discrete:      ', roc_auc_score),
    )
    for caption, metric in label_based_metrics:
        print(f'{caption}{round(metric(y, hard_labels), 4)}')

    # The probability-based AUC is the only metric that needs predict_proba.
    print(f'roc_auc_continuous:    {round(roc_auc_score(y, class_proba[:, 1]), 4)}')

In [4]:
# Training split: features and target are read from sibling CSV files.
X_train, y_train = (
    pd.read_csv("../preprocessed_data/X_train.csv"),
    pd.read_csv("../preprocessed_data/y_train.csv"),
)

# Validation split, read the same way.
X_val, y_val = (
    pd.read_csv("../preprocessed_data/X_val.csv"),
    pd.read_csv("../preprocessed_data/y_val.csv"),
)

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default hyper-parameters and a fixed seed.
dct = DecisionTreeClassifier(random_state=42)
# fit() trains the estimator in place; the trailing ';' keeps the cell output empty.
dct.fit(X_train, y_train);

In [6]:
# Report the baseline tree on the training split first, then on the validation split.
for split_X, split_y in ((X_train, y_train), (X_val, y_val)):
    show_scores(dct, split_X, split_y)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         44795 |             0 |
|             0 |          6293 |

accuracy:              1.0
precision:             1.0
recall:                1.0
f1:                    1.0
roc_auc_discrete:      1.0
roc_auc_continuous:    1.0
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         17004 |          2208 |
|          1788 |           895 |

accuracy:              0.8175
precision:             0.2884
recall:                0.3336
f1:                    0.3094
roc_auc_discrete:      0.6093
roc_auc_continuous:    0.6093


In [7]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over tree depth only: every integer from 1 to 99.
parameters = {"max_depth": np.arange(1, 100)}

depth_search = GridSearchCV(
    estimator=dct,
    param_grid=parameters,
    scoring='precision',
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)
# The trailing ';' keeps the cell output empty, as with the original assignment.
depth_search.fit(X_train, y_train);

In [13]:
# Display the max_depth setting selected by the grid search (scored on precision).
depth_search.best_params_

{'max_depth': 2}

In [8]:
# One line per candidate: mean cross-validated test score, then its parameters.
res = depth_search.cv_results_
candidates_with_scores = zip(res["params"], res["mean_test_score"])
for params, mean_score in candidates_with_scores:
    print(round(mean_score, 4), "   ", params)

0.706     {'max_depth': 1}
0.8383     {'max_depth': 2}
0.8075     {'max_depth': 3}
0.8302     {'max_depth': 4}
0.8276     {'max_depth': 5}
0.8349     {'max_depth': 6}
0.8155     {'max_depth': 7}
0.7916     {'max_depth': 8}
0.7503     {'max_depth': 9}
0.7173     {'max_depth': 10}
0.6463     {'max_depth': 11}
0.609     {'max_depth': 12}
0.5599     {'max_depth': 13}
0.5304     {'max_depth': 14}
0.4952     {'max_depth': 15}
0.4611     {'max_depth': 16}
0.439     {'max_depth': 17}
0.4133     {'max_depth': 18}
0.3942     {'max_depth': 19}
0.3785     {'max_depth': 20}
0.3605     {'max_depth': 21}
0.3513     {'max_depth': 22}
0.3395     {'max_depth': 23}
0.3316     {'max_depth': 24}
0.329     {'max_depth': 25}
0.3253     {'max_depth': 26}
0.3171     {'max_depth': 27}
0.3126     {'max_depth': 28}
0.3086     {'max_depth': 29}
0.3061     {'max_depth': 30}
0.3069     {'max_depth': 31}
0.3048     {'max_depth': 32}
0.3012     {'max_depth': 33}
0.2992     {'max_depth': 34}
0.299     {'max_depth': 35}

In [9]:
# Report the best estimator from the depth search on the training split, then validation.
for split_X, split_y in ((X_train, y_train), (X_val, y_val)):
    show_scores(depth_search.best_estimator_, split_X, split_y)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         44534 |           261 |
|          4945 |          1348 |

accuracy:              0.8981
precision:             0.8378
recall:                0.2142
f1:                    0.3412
roc_auc_discrete:      0.6042
roc_auc_continuous:    0.7065
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         19089 |           123 |
|          2072 |           611 |

accuracy:              0.8997
precision:             0.8324
recall:                0.2277
f1:                    0.3576
roc_auc_discrete:      0.6107
roc_auc_continuous:    0.6971


In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyper-parameter sampled by the randomized search.
parameters = {
    "max_depth": np.arange(25, 65, 3),
    "min_samples_split": np.arange(2, 8),
    "min_samples_leaf": np.arange(1, 20, 2),
    "max_features": np.arange(20, 150, 5),
}

rand_search = RandomizedSearchCV(
    estimator=dct,
    param_distributions=parameters,
    n_iter=100,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
    return_train_score=True,
)
# The trailing ';' keeps the cell output empty, as with the original assignment.
rand_search.fit(X_train, y_train);

In [18]:
# Display the hyper-parameter combination selected by the randomized search (scored on accuracy).
rand_search.best_params_

{'min_samples_split': 7,
 'min_samples_leaf': 19,
 'max_features': 70,
 'max_depth': 64}

In [19]:
# Report the best estimator from the randomized search on the training split, then validation.
for split_X, split_y in ((X_train, y_train), (X_val, y_val)):
    show_scores(rand_search.best_estimator_, split_X, split_y)

|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         44321 |           474 |
|          4527 |          1766 |

accuracy:              0.9021
precision:             0.7884
recall:                0.2806
f1:                    0.4139
roc_auc_discrete:      0.635
roc_auc_continuous:    0.8953
|   Predicted 0 |   Predicted 1 |
|---------------+---------------|
|         18894 |           318 |
|          2009 |           674 |

accuracy:              0.8937
precision:             0.6794
recall:                0.2512
f1:                    0.3668
roc_auc_discrete:      0.6173
roc_auc_continuous:    0.6984
