In [1]:
import os

# On Windows, prepend the MinGW-w64 runtime directory to PATH *before* xgboost
# is imported (presumably so its native library can locate the MinGW DLLs --
# TODO confirm). NOTE: the path below is machine-specific.
if os.name == 'nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    os.environ['PATH'] = mingw_path + os.pathsep + os.environ['PATH']

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, roc_auc_score
# cross_val_score is imported from sklearn.model_selection: the legacy
# sklearn.cross_validation module was deprecated in 0.18 and removed in
# scikit-learn 0.20, so importing from it fails on current installs.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate, cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sys import getsizeof
import time
import gc

%matplotlib inline



In [2]:
import pickle

def save_pickle(x, filename):
    '''
    Serialize object x into the file at filename (opened in binary write
    mode, so an existing file is overwritten) using the highest pickle
    protocol available in the running interpreter.
    '''
    with open(filename, 'wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(x)

def read_pickle(filename):
    '''
    Open filename in binary read mode and return the object unpickled
    from it.

    Unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    '''
    with open(filename, 'rb') as in_file:
        return pickle.Unpickler(in_file).load()

In [3]:
def cross_val_predict_rskf(clf, x_train, y_train, score_callable, n_splits=3, 
                           n_repeats=2, random_state=42, verbose=False):
    '''
    Repeated stratified KFold CV with early stopping.

    For every fold of every repeat the classifier gets a fresh random_state,
    is fitted on the training part (the held-out part is passed as the
    early-stopping eval_set, metric 'auc', patience 30 rounds) and then
    predicts the held-out part.

    NOTE: the held-out fold is used both for early stopping and for scoring,
    so the reported scores are optimistic estimates.

    clf: classifier with an XGBoost-style sklearn interface; fit() must accept
        eval_set / eval_metric / early_stopping_rounds / verbose, and the
        fitted object must expose predict_proba() and best_ntree_limit
    x_train, y_train: features and labels; both are indexed with integer
        index arrays (x_train[idx], y_train[idx])
    score_callable: callable(y_true, y_score) returning a number,
        e.g. roc_auc_score
    n_splits: number of folds per repeat
    n_repeats: repetitions of CV
    random_state: seed for the sequence of per-fit clf.random_state values;
        the fold assignment itself always uses random_state=0
    verbose: forwarded to clf.fit()

    Returns (y_pred, scores, n_trees):
    y_pred: float array of shape (n_samples, n_repeats); column i holds the
        out-of-fold positive-class probabilities of repeat i, with rows in
        the same order as x_train / y_train
    scores: list with one score per fold (n_repeats * n_splits entries)
    n_trees: list with clf.best_ntree_limit after each fit
    '''
    scores = []
    n_trees = []
    # One column per repeat. Each fold writes its predictions at the positions
    # of its held-out rows, so every column stays aligned with y_train even
    # though the folds differ from repeat to repeat.
    y_pred = np.full((len(y_train), n_repeats), np.nan)

    rskf = RepeatedStratifiedKFold(n_repeats=n_repeats, n_splits=n_splits, 
                                   random_state=0)
    # Local generator: yields the same seed sequence as np.random.seed() +
    # np.random.randint() without disturbing the global NumPy RNG state.
    rng = np.random.RandomState(random_state)
    for n, (train_index, test_index) in enumerate(rskf.split(x_train, y_train)):
        x_train_tmp, x_test_tmp = x_train[train_index], x_train[test_index]
        y_train_tmp, y_test_tmp = y_train[train_index], y_train[test_index]

        clf.random_state = rng.randint(10000000)

        clf.fit(x_train_tmp, y_train_tmp, 
                eval_set=[(x_test_tmp, y_test_tmp)], 
                eval_metric='auc', early_stopping_rounds=30,
                verbose=verbose)
        y_pred_tmp = clf.predict_proba(x_test_tmp)[:, 1]
        # Splits are produced repeat by repeat, n_splits folds at a time.
        y_pred[test_index, n // n_splits] = y_pred_tmp
        scores.append(score_callable(y_test_tmp, y_pred_tmp))
        n_trees.append(clf.best_ntree_limit)
        print('Split {}, score = {:.3f}, best_ntree_limit = {}'.format(n, scores[-1], clf.best_ntree_limit))

    print('Score mean = {:.3f}, std = {:.3f}'.format(np.mean(scores), np.std(scores)))
    
    return y_pred, scores, n_trees

In [4]:
def cross_val_predict_skf_rm(clf, x_train, y_train, score_callable, n_splits=3, 
                           n_repeats=2, random_state=42, verbose=False):
    '''
    Stratified KFold CV with repeated models.

    One shuffled StratifiedKFold splitter (fixed random_state=0) is created
    and its split() is called once per repeat; between fits only the
    classifier's random_state changes. Each fit uses the held-out fold as
    early-stopping eval_set (metric 'auc', patience 30 rounds) and then
    predicts that fold.

    NOTE: the held-out fold is used both for early stopping and for scoring,
    so the reported scores are optimistic estimates.

    clf: classifier with an XGBoost-style sklearn interface; fit() must accept
        eval_set / eval_metric / early_stopping_rounds / verbose, and the
        fitted object must expose predict_proba() and best_ntree_limit
    x_train, y_train: features and labels; both are indexed with integer
        index arrays (x_train[idx], y_train[idx])
    score_callable: callable(y_true, y_score) returning a number,
        e.g. roc_auc_score
    n_splits: number of folds
    n_repeats: number of times the whole K-fold procedure is repeated with
        different model seeds
    random_state: seed for the sequence of per-fit clf.random_state values
    verbose: forwarded to clf.fit()

    Returns (y_pred_all, scores, n_trees):
    y_pred_all: float array of shape (n_samples, n_repeats); column m holds
        the out-of-fold positive-class probabilities of repeat m, with rows
        in the same order as x_train / y_train
    scores: list with one score per fit (n_repeats * n_splits entries)
    n_trees: list with clf.best_ntree_limit after each fit
    '''
    scores = []
    n_trees = []
    # One column per repeat. Each fold writes its predictions at the positions
    # of its held-out rows, keeping every column aligned with y_train.
    y_pred_all = np.full((len(y_train), n_repeats), np.nan)

    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
    # Local generator: yields the same seed sequence as np.random.seed() +
    # np.random.randint() without disturbing the global NumPy RNG state.
    rng = np.random.RandomState(random_state)

    for m in range(n_repeats):
        print('Repeat {}'.format(m))
        for n, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
            x_train_tmp, x_test_tmp = x_train[train_index], x_train[test_index]
            y_train_tmp, y_test_tmp = y_train[train_index], y_train[test_index]

            clf.random_state = rng.randint(10000000)

            clf.fit(x_train_tmp, y_train_tmp, 
                    eval_set=[(x_test_tmp, y_test_tmp)], 
                    eval_metric='auc', early_stopping_rounds=30,
                    verbose=verbose)
            n_trees.append(clf.best_ntree_limit)
            y_pred_tmp = clf.predict_proba(x_test_tmp)[:, 1]
            y_pred_all[test_index, m] = y_pred_tmp
            scores.append(score_callable(y_test_tmp, y_pred_tmp))
            # Report the score just computed; scores[-1] is always the current
            # fit regardless of how n_splits and n_repeats relate.
            print('Split {}, score = {:.3f}, n_best_trees = {}'.format(n, 
                scores[-1], clf.best_ntree_limit))
        
    print('Score mean = {:.3f}, std = {:.3f}'.format(np.mean(scores), np.std(scores)))
    
    return y_pred_all, scores, n_trees

In [5]:
# Labels used as the CV target below, loaded from a local pickle file.
y_train = read_pickle('y_train.pickle')

# Feature matrix used for CV below, loaded from a local pickle file
# (the file name suggests numeric + date features -- TODO confirm).
x_train = read_pickle('x_train_numeric_date_0.pickle')

### Compare variance of CV score between repeated stratified KFold and stratified KFold with repeated models

In [6]:
# Classifier settings for the small-sample CV comparison; n_estimators is the
# upper bound on boosting rounds (the CV helpers stop early).
n_estimators = 200
clf = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.6,
    min_child_weight=5,
    reg_lambda=4,
    base_score=0.0058,  # presumably the positive-class rate -- TODO confirm
    n_jobs=-1,
    silent=False,
)

In [None]:
# Small-sample run on the first 10,000 rows: stratified KFold with repeated
# models (5 folds, 3 repeats).
x_train0 = x_train[:10000]
y_train0 = y_train[:10000]
y_pred, scores, n_trees = cross_val_predict_skf_rm(clf, x_train0, y_train0, 
                                         roc_auc_score, n_splits=5, 
                                         n_repeats=3, random_state=42)

In [None]:
# Same 10,000-row subset, this time with repeated stratified KFold
# (5 folds, 3 repeats) for comparison with the run above.
x_train0 = x_train[:10000]
y_train0 = y_train[:10000]
y_pred, scores, n_trees = cross_val_predict_rskf(clf, x_train0, y_train0, 
                                         roc_auc_score, n_splits=5, 
                                         n_repeats=3, random_state=42)

### Try on the real data
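The full dataset is too large and slow to run on a desktop machine; the run below was interrupted manually (see the `KeyboardInterrupt` at the end of the output).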

In [13]:
# Classifier settings for the full-data run: same hyper-parameters as the
# small-sample configuration, with a higher cap of 300 boosting rounds.
n_estimators = 300
clf = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.6,
    min_child_weight=5,
    reg_lambda=4,
    base_score=0.0058,  # presumably the positive-class rate -- TODO confirm
    n_jobs=-1,
    silent=False,
)

In [14]:
# x_train0, x_test0, y_train0, y_test0 = train_test_split(x_train, y_train, train_size=0.2, shuffle=True, random_state=0)

In [15]:
# Full-data run: stratified KFold with repeated models (5 folds, 3 repeats);
# verbose=True is forwarded to clf.fit() to print the per-round eval metric.
y_pred, scores, n_trees = cross_val_predict_skf_rm(clf, x_train, y_train, 
                                         roc_auc_score, n_splits=5, 
                                         n_repeats=3, random_state=42, verbose=True)

Repeat 0
[0]	validation_0-auc:0.889378
Will train until validation_0-auc hasn't improved in 30 rounds.
[1]	validation_0-auc:0.892141
[2]	validation_0-auc:0.892879
[3]	validation_0-auc:0.894438
[4]	validation_0-auc:0.896527
[5]	validation_0-auc:0.897012
[6]	validation_0-auc:0.89684
[7]	validation_0-auc:0.897673
[8]	validation_0-auc:0.897798
[9]	validation_0-auc:0.897688
[10]	validation_0-auc:0.898355
[11]	validation_0-auc:0.898582
[12]	validation_0-auc:0.898762
[13]	validation_0-auc:0.898726
[14]	validation_0-auc:0.898214
[15]	validation_0-auc:0.898434
[16]	validation_0-auc:0.898764
[17]	validation_0-auc:0.899088
[18]	validation_0-auc:0.899042
[19]	validation_0-auc:0.899004
[20]	validation_0-auc:0.899021
[21]	validation_0-auc:0.899192
[22]	validation_0-auc:0.899297
[23]	validation_0-auc:0.899521
[24]	validation_0-auc:0.8995
[25]	validation_0-auc:0.899742
[26]	validation_0-auc:0.899698
[27]	validation_0-auc:0.89979
[28]	validation_0-auc:0.899755
[29]	validation_0-auc:0.899708
[30]	valida

[34]	validation_0-auc:0.906551
[35]	validation_0-auc:0.906434


KeyboardInterrupt: 