In [81]:
import numpy as np
import sys

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import GridSearchCV

In [72]:
INPUT_DIR = "./processed_data/"

In [73]:
def load_features():
    """
    Load pre-processed feature matrices.
    """
    Xs = np.load(INPUT_DIR + 'X.npz')
    ys = np.load(INPUT_DIR + 'y.npz')
    X_train, X_test = Xs['train'], Xs['test']
    y_train, y_test = ys['train'], ys['test']

    return X_train, X_test, y_train, y_test

In [61]:
"""
Title:       train.py
Description: Collection of functions to train the model.
Author:      Kunyu He, CAPP'20
"""

import numpy as np
import sys

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import GridSearchCV


INPUT_DIR = "./processed_data/"
MODEL_NAMES = ["KNN", "Decision Tree", "Random Forest"]
METRICS = ["ROC AUC", "Accuracy"]
GRID_SEARCH_PARAMS = {"KNN": {'n_neighbors': list(range(1, 41, 2)),
                              'p': list(range(1, 4))
                              },
                      "Decision Tree": {'criterion': ["entropy", "gini"],
                            'min_samples_split': list(np.arange(0.01, 0.11, 0.01)),
                            'max_depth': list(range(1, 11)),
                            'max_features': list(range(4, 17, 2))
                            },
                      "Random Forest": {'n_estimators': list(range(100, 500, 100)),
                            'min_samples_split': list(np.arange(0.01, 0.06, 0.01)),
                            'max_depth': list(range(4, 11)),
                            'max_features': list(range(4, 17, 2))
                            }
                      }


#----------------------------------------------------------------------------#
def load_features():
    """
    Load pre-processed feature matrices.
    """
    Xs = np.load(INPUT_DIR + 'X.npz')
    ys = np.load(INPUT_DIR + 'y.npz')
    X_train, X_test = Xs['train'], Xs['test']
    y_train, y_test = ys['train'], ys['test']

    return X_train, X_test, y_train, y_test


def evaluate(classifier, X_test, y_test, metric="ROC AUC"):
    """
    Evaluate the fitted classifier on the test set and calculate the
    evaluation metrics.
    """
    y_pred = classifier.predict(X_test)

    if metric == "Accuracy":
        return accuracy_score(y_test, y_pred)

    fp_rate, tp_rate, thresholds = roc_curve(y_test, y_pred)
    return auc(fp_rate, tp_rate)


def build_benchmark(X_train, y_train, X_test, y_test, metric="ROC AUC"):
    """
    """
    benchmark_classifier = DecisionTreeClassifier(random_state=123)
    benchmark_classifier.fit(X_train, y_train)

    return evaluate(benchmark_classifier, X_test, y_test, metric=metric)


def tune(model, parameters, X_train, y_train, metric="ROC AUC", n_folds=5,
         default_args={}):
    """
    Use grid search and cross validation to find the best set of hyper-
    parameters.
    """
    classifier = model(**default_args)
    if metric == "ROC AUC":
        score = "roc_auc"
    else:
        score = "accuracy"

    grid = GridSearchCV(classifier, param_grid=parameters, scoring=score,
                        n_jobs=-1, cv=n_folds, iid=True, verbose=5)
    grid.fit(X_train, y_train)
    
    return model(**grid.best_params_, **default_args), grid

In [62]:
model_index, metric_index, folds = 3, 1, 3
metric_name = ["ROC AUC", "Accuracy"][metric_index - 1]
model_name = ["KNN", "Decision Tree", "Random Forest"][model_index - 1]

In [63]:
benchmark_score = build_benchmark(X_train, y_train, X_test, y_test,
                                  metric=metric_name)
print("{} of the benchmark default decision tree model is {}.".\
      format(metric_name, round(benchmark_score, 3)))


ROC AUC of the benchmark default decision tree model is 0.659.


In [64]:
parameters = GRID_SEARCH_PARAMS[model_name]
args = [parameters, X_train, y_train]
op_args = {'metric': metric_name, 'n_folds': folds}

In [65]:
if model_name == "KNN":
    best_classifier, grid = tune(KNeighborsClassifier, *args, **op_args,
                                 default_args={'n_jobs': -1})
elif model_name == "Decision Tree":
    best_classifier, grid = tune(DecisionTreeClassifier, *args, **op_args,
                                 default_args={'random_state': 123})
else:
    best_classifier, grid = tune(RandomForestClassifier, *args, **op_args,
                                 default_args={'random_state': 123,
                                               'oob_score': True})
print("Found the best set of parameters for {} Classifier: {}".\
      format(model_name, grid.best_params_))

Fitting 3 folds for each of 980 candidates, totalling 2940 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed: 48.3min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 58.8min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 71.1min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 86.7min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed: 105.2min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 214.9min
[Parallel(n_jobs=-1)]: Done 2940 out of 2940 | elapsed: 233.5min finished


Found the best set of parameters for Random Forest Classifier: {'max_depth': 10, 'max_features': 4, 'min_samples_split': 0.01, 'n_estimators': 300}


In [66]:
best_classifier.fit(X_train, y_train)
best_score = evaluate(best_classifier, X_test, y_test, metric=metric_name)
diff = round(best_score - benchmark_score, 3)
print("{} of the tuned {} is {}, {} {} than the benchmark.".\
      format(metric_name, model_name, round(best_score, 3), diff,
             ['higher', 'lower'][int(diff <= 0)]))

ROC AUC of the tuned Random Forest is 0.683, 0.024 higher than the benchmark.


In [67]:
grid.best_score_

0.8728400433573856

In [3]:
{'n_neighbors': list(range(1, 41, 2)), 'p': list(range(1, 4))}

{'n_neighbors': [1,
  3,
  5,
  7,
  9,
  11,
  13,
  15,
  17,
  19,
  21,
  23,
  25,
  27,
  29,
  31,
  33,
  35,
  37,
  39],
 'p': [1, 2, 3]}

In [19]:
X_train.shape

(30762, 16)

In [20]:
 {'criterion': ["entropy", "gini"],
                            'min_samples_split': list(range(0.01, 0.11, 0.01)),
                            'max_features': list(range(4, 17, 2))}

TypeError: 'float' object cannot be interpreted as an integer

In [24]:
list(range(0.01, 0.11, 0.02))

TypeError: 'float' object cannot be interpreted as an integer

In [29]:
list(np.arange(0.01, 0.13, 0.02))

[0.01,
 0.03,
 0.049999999999999996,
 0.06999999999999999,
 0.08999999999999998,
 0.10999999999999997]