## Benchmark for measuring performance of engineered features and models
When engineering new features and creating new models, I will need to compare new methods with benchmark to decide if there is any improvement.

In [34]:
import os
if os.name=='nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate, cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sys import getsizeof
import time
import gc

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [7]:
import pickle

def save_pickle(x, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

def read_pickle(filename):
    with open(filename, 'rb') as handle:
        x = pickle.load(handle)
    return x

In [64]:
from numba import jit

@jit
def mcc(tp, tn, fp, fn):
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf==0:
        return 0
    else:
        return sup / np.sqrt(inf)

@jit
def eval_mcc(y_true, y_prob, show=False):
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true) # number of positive
    numn = n - nump # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    prev_proba = -1
    best_proba = -1
    mccs = np.zeros(n)
    new_mcc = 0
    for i in range(n):
        # all items with idx < i are predicted negative while others are predicted positive
        # only evaluate mcc when probability changes
        proba = y_prob[idx[i]]
        if proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_id = i
                best_proba = proba
        mccs[i] = new_mcc
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    if show:
        y_pred = (y_prob >= best_proba).astype(int)
        score = matthews_corrcoef(y_true, y_pred)
        print(score, best_mcc)
        plt.plot(mccs)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc
    
def mcc_eval(y_prob, dtrain):
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', -best_mcc

In [65]:
def cross_val_predict_rskf(clf, x_train, y_train, n_splits=3, 
                           n_repeats=2, random_state=42, verbose=False):
    '''
    Repeated stratified KFold CV, returns predictions for 
    each repeat and average score.
    n_repeats: repetitions of CV
    '''
    scores = []
    n_trees = []
    clfs = []

    rskf = RepeatedStratifiedKFold(n_repeats=n_repeats, n_splits=n_splits, 
                                   random_state=0)
    np.random.seed(random_state)
    for n, (train_index, test_index) in enumerate(rskf.split(x_train, y_train)):
        x_train_tmp, x_test_tmp = x_train.iloc[train_index], x_train.iloc[test_index]
        y_train_tmp, y_test_tmp = y_train.iloc[train_index], y_train.iloc[test_index]

        clf.random_state = np.random.randint(10000000)

        clf.fit(x_train_tmp, y_train_tmp, 
                eval_set=[(x_test_tmp, y_test_tmp)], 
                eval_metric=mcc_eval, early_stopping_rounds=10,
                verbose=verbose)
        scores.append(-clf.best_score)
        n_trees.append(clf.best_ntree_limit)
        clfs.append(clf)
        print('Split {}, score = {:.3f}, best_ntree_limit = {}'.format(n, scores[n], clf.best_ntree_limit))

    print('Score mean = {:.3f}, std = {:.3f}'.format(np.mean(scores), np.std(scores)))
    
    return clfs, scores, n_trees

In [66]:
def cross_val_predict_skf_rm(clf, x_train, y_train, n_splits=3, 
                           n_repeats=2, random_state=42, verbose=False):
    '''
    Stratified KFold CV with repeated models
    '''
    scores = []
    n_trees = []
    clfs = []

    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
    np.random.seed(random_state)

    for m in range(n_repeats):
        print('Repeat {}'.format(m))
        for n, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
            x_train_tmp, x_test_tmp = x_train.iloc[train_index], x_train.iloc[test_index]
            y_train_tmp, y_test_tmp = y_train.iloc[train_index], y_train.iloc[test_index]

            clf.random_state = np.random.randint(10000000)
            # print(clf.random_state)

            clf.fit(x_train_tmp, y_train_tmp, 
                    eval_set=[(x_test_tmp, y_test_tmp)], 
                    eval_metric=mcc_eval, early_stopping_rounds=10,
                    verbose=verbose)
            n_trees.append(clf.best_ntree_limit)
            scores.append(-clf.best_score)
            clfs.append(clf)
            print('Split {}, score = {:.3f}, n_best_trees = {}'.format(n, 
                scores[m*n_repeats+n], clf.best_ntree_limit))

    print('Score mean = {:.3f}, std = {:.3f}'.format(np.mean(scores), np.std(scores)))
    
    return clfs, scores, n_trees

In [11]:
# y_train = read_pickle('y_train.pickle')

# x_train = read_pickle('x_train_numeric_date_0.pickle')

In [12]:
important_features = pd.read_csv('important_numeric_features.csv', index_col=0, header=None)
important_features = list(important_features.values.ravel())
important_features.extend(['Id', 'Response'])

In [13]:
x_train_numeric = pd.read_csv('data/train_numeric.csv.zip', index_col=0, usecols=important_features)

  mask |= (ar1 == a)


In [14]:
y_train = x_train_numeric.Response
x_train_numeric.drop(['Response'], axis=1, inplace=True)

### Compare variance of CV score between repeated stratified KFold and stratified KFold with repeated models

In [67]:
n_estimators = 5
clf = XGBClassifier(max_depth=3, n_estimators=n_estimators, 
                    base_score=0.0058, n_jobs=-1, colsample_bytree=0.6,
                    min_child_weight=5, subsample=0.9,  
                    reg_lambda=4, silent=False, learning_rate=0.03)

In [71]:
# x_train0 = x_train[:10000]
# y_train0 = y_train[:10000]
clfs, scores, n_trees = cross_val_predict_skf_rm(clf, x_train_numeric, 
                                                   y_train, 
                                                   n_splits=3, n_repeats=2, 
                                                   random_state=42, 
                                                   verbose=True)

Repeat 0
[0]	validation_0-error:0.005811	validation_0-MCC:-0.167828
Multiple eval metrics have been passed: 'validation_0-MCC' will be used for early stopping.

Will train until validation_0-MCC hasn't improved in 10 rounds.
[1]	validation_0-error:0.005811	validation_0-MCC:-0.178757
[2]	validation_0-error:0.005811	validation_0-MCC:-0.192903
[3]	validation_0-error:0.005811	validation_0-MCC:-0.181634
[4]	validation_0-error:0.005811	validation_0-MCC:-0.196591
Split 0, score = 0.197, n_best_trees = 5
[0]	validation_0-error:0.005811	validation_0-MCC:-0.158103
Multiple eval metrics have been passed: 'validation_0-MCC' will be used for early stopping.

Will train until validation_0-MCC hasn't improved in 10 rounds.
[1]	validation_0-error:0.005811	validation_0-MCC:-0.166936
[2]	validation_0-error:0.005811	validation_0-MCC:-0.176835
[3]	validation_0-error:0.005811	validation_0-MCC:-0.181889
[4]	validation_0-error:0.005811	validation_0-MCC:-0.188277
Split 1, score = 0.188, n_best_trees = 5
[0]	v

In [78]:
clfs[0]

XGBClassifier(base_score=0.0058, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=3, min_child_weight=5, missing=None, n_estimators=5,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=9524682, reg_alpha=0, reg_lambda=4, scale_pos_weight=1,
       seed=None, silent=False, subsample=0.9)

In [59]:
clf.fit(x_train_numeric, y_train, 
        eval_set=[(x_train_numeric, y_train)], 
        eval_metric=mcc_eval, early_stopping_rounds=10,
        verbose=True)

[0]	validation_0-error:0.005811	validation_0-MCC:-0.157591
Multiple eval metrics have been passed: 'validation_0-MCC' will be used for early stopping.

Will train until validation_0-MCC hasn't improved in 10 rounds.
[1]	validation_0-error:0.005811	validation_0-MCC:-0.178742
[2]	validation_0-error:0.005811	validation_0-MCC:-0.18582
[3]	validation_0-error:0.005811	validation_0-MCC:-0.191148
[4]	validation_0-error:0.005811	validation_0-MCC:-0.192053


XGBClassifier(base_score=0.0058, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=3, min_child_weight=5, missing=None, n_estimators=5,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=4472471, reg_alpha=0, reg_lambda=4, scale_pos_weight=1,
       seed=None, silent=False, subsample=0.9)

In [62]:
clf.best_score

5

In [None]:
x_train0 = x_train[:10000]
y_train0 = y_train[:10000]
y_pred, scores, n_trees = cross_val_predict_rskf(clf, x_train0, y_train0, 
                                         roc_auc_score, n_splits=5, 
                                         n_repeats=3, random_state=42)

### Try on the real data
Too large and slow to run on desktop

In [13]:
n_estimators = 300
clf = XGBClassifier(max_depth=6, n_estimators=n_estimators, 
                    base_score=0.0058, n_jobs=-1, colsample_bytree=0.6,
                    min_child_weight=5, subsample=0.9,  
                    reg_lambda=4, silent=False, learning_rate=0.03)

In [14]:
# x_train0, x_test0, y_train0, y_test0 = train_test_split(x_train, y_train, train_size=0.2, shuffle=True, random_state=0)

In [15]:
y_pred, scores, n_trees = cross_val_predict_skf_rm(clf, x_train, y_train, 
                                         roc_auc_score, n_splits=5, 
                                         n_repeats=3, random_state=42, verbose=True)

Repeat 0
[0]	validation_0-auc:0.889378
Will train until validation_0-auc hasn't improved in 30 rounds.
[1]	validation_0-auc:0.892141
[2]	validation_0-auc:0.892879
[3]	validation_0-auc:0.894438
[4]	validation_0-auc:0.896527
[5]	validation_0-auc:0.897012
[6]	validation_0-auc:0.89684
[7]	validation_0-auc:0.897673
[8]	validation_0-auc:0.897798
[9]	validation_0-auc:0.897688
[10]	validation_0-auc:0.898355
[11]	validation_0-auc:0.898582
[12]	validation_0-auc:0.898762
[13]	validation_0-auc:0.898726
[14]	validation_0-auc:0.898214
[15]	validation_0-auc:0.898434
[16]	validation_0-auc:0.898764
[17]	validation_0-auc:0.899088
[18]	validation_0-auc:0.899042
[19]	validation_0-auc:0.899004
[20]	validation_0-auc:0.899021
[21]	validation_0-auc:0.899192
[22]	validation_0-auc:0.899297
[23]	validation_0-auc:0.899521
[24]	validation_0-auc:0.8995
[25]	validation_0-auc:0.899742
[26]	validation_0-auc:0.899698
[27]	validation_0-auc:0.89979
[28]	validation_0-auc:0.899755
[29]	validation_0-auc:0.899708
[30]	valida

[34]	validation_0-auc:0.906551
[35]	validation_0-auc:0.906434


KeyboardInterrupt: 