# XGBoost starter
Train an XGBoost model on each of the three datasets (categorical, numeric and date features) to get a rough idea of baseline performance and of which features matter most.

In [71]:
import os
import time
import pickle
import datetime

from string import lstrip


import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, vstack, hstack

from sklearn.cross_validation import StratifiedKFold

import c

import xgboost as xgb
from xgboost import XGBClassifier


%matplotlib inline

In [32]:
# Cross-validation splitter: three stratified, shuffled folds with a fixed
# seed so the splits are repeatable between runs.
# NOTE(review): `ytrain` is not assigned anywhere above this cell in the
# visible notebook (its first assignment is inside the data-loading loop
# further down), so on a fresh kernel this line would raise NameError --
# confirm the intended execution order.
folds = StratifiedKFold(
    ytrain,
    n_folds=3,
    shuffle=True,
    random_state=123,
)

In [33]:
# Settings shared by every run. The whole dict is handed to xgb.train by
# run_xgboost_cv, which additionally reads 'num_round' and 'early_stopping'
# to control the boosting loop.
params = {
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 50,        # used as num_boost_round
    'early_stopping': 20,   # used as early_stopping_rounds
    'base_score': 0.005,
    'eta': 0.05,
}

# Knobs that are currently switched off, kept for reference:
#   params['nthread'] = 1
#   params['subsample'] = 0.5
#   params['colsample_bytree'] = 0.3
#   params['min_child_weight'] = 1
#   params['colsample_bylevel'] = 0.9
#   params['gamma'] = 0.005
#   params['max_depth'] = 3
#   params['booster'] = "gbtree"
#   params['seed'] = 1712

In [68]:
def run_xgboost_cv(Xtrain, ytrain, params, folds, feature_names=None):
    """Train one XGBoost model per CV fold and report scores and top features.

    For every ``(train_idx, valid_idx)`` pair in ``folds`` a booster is trained
    on the training rows while both the training and the validation rows are
    watched. The train/validation AUC at the booster's best iteration and the
    ten features with the highest f-score are printed.

    Parameters
    ----------
    Xtrain : 2-D array-like
        Feature matrix; must support ``Xtrain[rows, :]`` indexing.
    ytrain : array-like
        Labels, indexable with the same row indices as ``Xtrain``.
    params : dict
        XGBoost parameters plus two loop controls that are read here and
        stripped before the rest is passed to ``xgb.train``:
        ``'num_round'`` (passed as ``num_boost_round``) and
        ``'early_stopping'`` (passed as ``early_stopping_rounds``).
        The dict itself is not modified. Scores are read from the ``'auc'``
        entry of the evaluation history, so ``'auc'`` must be evaluated.
    folds : iterable of (train_idx, valid_idx)
        Row-index pairs, one per fold.
    feature_names : sequence of str, optional
        Column names used to translate XGBoost's ``'f<N>'`` identifiers.
        When omitted, a module-level ``fnames`` is used if one exists
        (this keeps older four-argument calls working); otherwise the raw
        XGBoost identifiers are kept.

    Returns
    -------
    list of dict
        One entry per fold with the keys ``'train_score'``, ``'cv_score'``,
        ``'best_iteration'`` and ``'feature_importance'`` (a DataFrame with a
        single ``'f_score'`` column, sorted in descending order).

    Raises
    ------
    KeyError
        If ``params`` lacks ``'num_round'`` or ``'early_stopping'``.
    """
    # These two keys steer the boosting loop; they are not booster parameters,
    # so XGBoost gets a copy of ``params`` without them.
    loop_keys = ('num_round', 'early_stopping')
    booster_params = {key: value for key, value in params.items()
                      if key not in loop_keys}
    num_round = params['num_round']
    early_stopping = params['early_stopping']

    if feature_names is None:
        # Backward compatibility with callers that set a notebook-level
        # ``fnames`` instead of passing the names explicitly.
        feature_names = globals().get('fnames')

    results = []

    for (itrain, ival) in folds:

        d_train = xgb.DMatrix(Xtrain[itrain, :], label=ytrain[itrain])
        d_val = xgb.DMatrix(Xtrain[ival, :], label=ytrain[ival])

        watchlist = [(d_train, 'train'), (d_val, 'valid')]

        # Filled in by xgb.train with the per-round metric history.
        eval_result = {}

        bst = xgb.train(booster_params,
                        d_train,
                        num_boost_round=num_round,
                        evals=watchlist,
                        evals_result=eval_result,
                        early_stopping_rounds=early_stopping,
                        verbose_eval=50)

        # Scores at the best iteration reported by the booster
        best_iteration = bst.best_iteration
        cv_score = eval_result['valid']['auc'][best_iteration]
        train_score = eval_result['train']['auc'][best_iteration]

        print('Train score: {} | CV Score: {}'.format(train_score, cv_score))

        # Feature importance. Built from explicit lists so that an empty
        # f-score dict yields an empty frame instead of a column-length error.
        fscore = bst.get_fscore()
        if feature_names is None:
            labels = list(fscore.keys())
        else:
            # Map XGBoost's 'f<N>' identifier to the N-th supplied name.
            labels = [feature_names[int(key.lstrip('f'))] for key in fscore.keys()]

        feature_importance = pd.DataFrame({'f_score': list(fscore.values())},
                                          index=labels)
        feature_importance = feature_importance.sort_values('f_score',
                                                            ascending=False)

        print(feature_importance.head(10))

        results.append({'train_score': train_score,
                        'cv_score': cv_score,
                        'best_iteration': best_iteration,
                        'feature_importance': feature_importance})

    return results

In [69]:
# Pickled training sets to evaluate, in the order they are processed.
data_files = [
    'train_categorical_to_num.pkl',
    'train_numeric.pkl',
    'train_date.pkl',
]

In [70]:
# Evaluate each pickled dataset with the same XGBoost settings.
for data_file in data_files:
    print('Testing {}'.format(data_file))

    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # only load pickles that were produced locally and are trusted.
    # The handle gets its own name so it no longer shadows the loop variable.
    with open(os.path.join(c.BASE_PATH, data_file), 'rb') as fh:
        data = pickle.load(fh)

    Xtrain = data['data']['features']
    # Look up the label rows by the ids stored with this dataset.
    ytrain = data['data']['y'].loc[data['data']['ids'].Id].Response.values
    # run_xgboost_cv looks `fnames` up at module level to label the features.
    fnames = data['data']['feature_names']

    assert Xtrain.shape[0] == len(ytrain), \
        'features and labels are misaligned in {}'.format(data_file)

    # Build the folds from this dataset's own labels so the split indices
    # always match the rows just loaded and the cell does not depend on a
    # `ytrain` left over from an earlier execution.
    folds = StratifiedKFold(ytrain, n_folds=3, shuffle=True, random_state=123)

    run_xgboost_cv(Xtrain, ytrain, params, folds)

Testing train_categorical_to_num.pkl
[0]	train-auc:0.608929	valid-auc:0.599413
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[50]	train-auc:0.629222	valid-auc:0.615628
Stopping. Best iteration:
[47]	train-auc:0.629242	valid-auc:0.616448

Train score: 0.629242 | CV Score: 0.616448
              f_score
L3_S32_F3853      306
L2_S28_F3314      121
L1_S25_F3013      113
L2_S27_F3191       83
L2_S26_F3098       82
L2_S26_F3128       72
L1_S24_F1827       61
L1_S24_F1582       60
L3_S47_F4139       59
L1_S24_F1185       56
[0]	train-auc:0.606284	valid-auc:0.605706
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[50]	train-auc:0.627935	valid-auc:0.621211
Train score: 0.629964 | CV Score: 0.622421
              f_score
L3_S32_F3853      337
L2_S26_F3098      181
L1_S25_F3013      156
L2_S27_F3191      156