In [37]:
import os
# On Windows, put the MinGW-w64 toolchain's bin directory at the front of PATH
# (presumably so the xgboost import below can locate its runtime DLLs - TODO confirm).
if os.name == 'nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # Prepend only when the entry is missing, so re-running this cell does not
    # keep growing PATH with duplicate entries.
    if mingw_path not in os.environ['PATH'].split(os.pathsep):
        os.environ['PATH'] = mingw_path + os.pathsep + os.environ['PATH']

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate, cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sys import getsizeof
import time
import gc

%matplotlib inline

In [2]:
import pickle

def save_pickle(x, filename):
    """Pickle ``x`` into ``filename`` using the highest available protocol.

    The target is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(filename, mode='wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(x)

def read_pickle(filename):
    """Return the object stored in the pickle file ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files from a
    trusted source.
    """
    with open(filename, mode='rb') as in_file:
        return pickle.load(in_file)

In [3]:
# Read only the first 50,000 rows of the zipped numeric training table and
# use the first CSV column as the row index.
x_train = pd.read_csv(
    'data/train_numeric.csv.zip',
    index_col=0,
    nrows=50000,
)
# Test-set counterpart, currently disabled:
# x_test = pd.read_csv('data/test_numeric.csv.zip', nrows=1000, index_col=0)

In [4]:
# Separate the target from the features: `pop` returns the 'Response' column
# and removes it from `x_train` in place.
y_train = x_train.pop('Response')

In [5]:
# Switch from pandas objects to plain NumPy arrays; the labels are flattened
# to one dimension.
x_train, y_train = x_train.values, y_train.values.ravel()
# Disabled together with the test-set load:
# x_test = x_test.values

In [6]:
# Splitter shared by the cross-validation cells below: 5 stratified folds,
# repeated 3 times, with a fixed seed.
rskf = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=0,
)
# Manual iteration over the splits, kept for reference:
# for train_index, test_index in rskf.split(x_train, y_train):
#     x_train_tmp, x_test_tmp = x_train[train_index], x_train[test_index]
#     y_train_tmp, y_test_tmp = y_train[train_index], y_train[test_index]

In [7]:
# Sum of the label array (recorded output: 271); presumably the number of
# positive samples if Response is encoded as 0/1 - TODO confirm.
y_train.sum()

271

In [9]:
# Small, shallow, regularised gradient-boosting model.
# NOTE(review): base_score is presumably the positive-class prior - confirm.
n_estimators = 4
clf = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=3,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.6,
    min_child_weight=5,
    reg_lambda=4,
    base_score=0.0058,
    n_jobs=8,
    silent=False,
)

In [17]:
# Score `clf` by ROC AUC on every split produced by `rskf`, using all
# available cores and also recording the training-fold scores.
cv_results = cross_validate(
    clf, x_train, y_train,
    cv=rskf,
    scoring='roc_auc',
    return_train_score=True,
    n_jobs=-1,
)

In [32]:
# Out-of-fold class probabilities from a single shuffled, seeded 5-fold
# stratified split.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_predict = cross_val_predict(
    clf, x_train, y_train,
    cv=skf,
    method='predict_proba',
    n_jobs=-1,
)

In [120]:
def cross_val_predict_rskf(clf, x_train, y_train, score_callable, n_splits=3,
                           n_repeats=2, random_state=42):
    '''
    Repeated stratified KFold CV; returns out-of-fold predictions for
    each repeat and the average fold score.

    clf: estimator exposing fit and predict_proba. It is refitted on every
        split and its random_state attribute is overwritten before each fit.
    x_train: array of samples, indexable by integer index arrays.
    y_train: 1-D label array aligned with x_train.
    score_callable: callable(y_true, y_score) -> float, evaluated per fold.
    n_splits: number of folds per repetition.
    n_repeats: repetitions of CV.
    random_state: seed for the sequence of per-fit model seeds. It reseeds
        NumPy's global RNG; the fold assignment itself uses a fixed seed of 0.

    Returns (y_pred, score):
    y_pred: float array of shape (n_samples, n_repeats). Row i holds the
        predict_proba[:, 1] value obtained for sample i when it was in the
        held-out fold of each repeat, so rows are aligned with y_train.
    score: mean of score_callable over all n_splits * n_repeats folds.
    '''
    rskf = RepeatedStratifiedKFold(n_repeats=n_repeats, n_splits=n_splits,
                                   random_state=0)
    np.random.seed(random_state)

    # Filled fold by fold; NaN would expose a sample that was never held out.
    y_pred = np.full((len(y_train), n_repeats), np.nan)
    scores = []

    for n, (train_index, test_index) in enumerate(rskf.split(x_train, y_train)):
        # Give every fit its own reproducible model seed.
        clf.random_state = np.random.randint(10000000)

        clf.fit(x_train[train_index], y_train[train_index])
        y_pred_tmp = clf.predict_proba(x_train[test_index])[:, 1]

        # The splitter yields the n_splits folds of one repeat consecutively,
        # so n // n_splits is the repeat index. Writing through test_index
        # restores the original sample order instead of fold order.
        y_pred[test_index, n // n_splits] = y_pred_tmp
        scores.append(score_callable(y_train[test_index], y_pred_tmp))

    score = np.mean(scores)

    return y_pred, score

In [121]:
# Run the repeated stratified CV helper with its default fold/repeat
# settings, scoring each fold by ROC AUC.
y_pred, score = cross_val_predict_rskf(
    clf, x_train, y_train, score_callable=roc_auc_score
)

In [131]:
def cross_val_predict_skf_rm(clf, x_train, y_train, score_callable, n_splits=3,
                             n_repeats=2, random_state=42):
    '''
    Stratified KFold CV with repeated models: one splitter is reused for
    every repeat and only the model seed changes between fits.

    clf: estimator exposing fit and predict_proba. It is refitted on every
        split and its random_state attribute is overwritten before each fit.
    x_train: array of samples, indexable by integer index arrays.
    y_train: 1-D label array aligned with x_train.
    score_callable: callable(y_true, y_score) -> float, evaluated per fold.
    n_splits: number of folds.
    n_repeats: number of times the whole K-fold pass is repeated.
    random_state: seed for the sequence of per-fit model seeds. It reseeds
        NumPy's global RNG; the splitter itself uses a fixed seed of 0.

    Returns (y_pred_all, score):
    y_pred_all: float array of shape (n_samples, n_repeats). Row i holds the
        predict_proba[:, 1] value obtained for sample i when it was held out
        in each repeat, so rows are aligned with y_train.
    score: mean of score_callable over all n_splits * n_repeats fits.
    '''
    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
    np.random.seed(random_state)

    # Filled fold by fold; NaN would expose a sample that was never held out.
    y_pred_all = np.full((len(y_train), n_repeats), np.nan)
    scores = []

    for m in range(n_repeats):
        for train_index, test_index in skf.split(x_train, y_train):
            # Give every fit its own reproducible model seed.
            clf.random_state = np.random.randint(10000000)

            clf.fit(x_train[train_index], y_train[train_index])
            y_pred_tmp = clf.predict_proba(x_train[test_index])[:, 1]

            # Writing through test_index keeps the rows in the original
            # sample order instead of fold order.
            y_pred_all[test_index, m] = y_pred_tmp
            scores.append(score_callable(y_train[test_index], y_pred_tmp))

    score = np.mean(scores)

    return y_pred_all, score

In [132]:
# Run the repeated-model stratified CV helper with its default fold/repeat
# settings, scoring each fold by ROC AUC.
y_pred, score = cross_val_predict_skf_rm(
    clf, x_train, y_train, score_callable=roc_auc_score
)

In [133]:
# Shape of the prediction matrix assigned by the preceding CV call
# (recorded output: (50000, 2)).
y_pred.shape

(50000, 2)

### What is the difference between `shuffle=True` and `shuffle=False`?

In [128]:
from sklearn.model_selection import KFold

# Use a dedicated name for this demo splitter: binding it to `rskf` would
# silently replace the RepeatedStratifiedKFold object that the cross_validate
# cell relies on.
kf = KFold(n_splits=3, random_state=0, shuffle=True)
train = np.arange(10)
test = np.arange(10)
# Call split() three times on the same seeded, shuffled splitter and print
# the index arrays so the folds of each pass can be compared.
for n in range(3):
    print('='*10)
    for train_index, test_index in kf.split(train, test):
        print(train_index, test_index)

[0 1 3 5 6 7] [2 4 8 9]
[0 2 3 4 5 8 9] [1 6 7]
[1 2 4 6 7 8 9] [0 3 5]
[0 1 3 5 6 7] [2 4 8 9]
[0 2 3 4 5 8 9] [1 6 7]
[1 2 4 6 7 8 9] [0 3 5]
[0 1 3 5 6 7] [2 4 8 9]
[0 2 3 4 5 8 9] [1 6 7]
[1 2 4 6 7 8 9] [0 3 5]
