In [1]:
# Standard libraries
import os
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns


# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import xgboost as xgb

from xgboost.sklearn import XGBClassifier
from sklearn.pipeline import Pipeline

from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 

# make sure you pip install sklearn_pandas (this is a very useful module)
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.feature_extraction import DictVectorizer

# UDF
import basic_application_data_cleaner as cleaner

%matplotlib inline



# Load data

In [2]:
# Location of all the unzipped Kaggle data files on the local machine.
path_to_kaggle_data = '~/kaggle_JPFGM/Data/'

# Read the raw application train/test tables, then run the project's wrangling step
# on them (both helpers live in basic_application_data_cleaner).
app_train, app_test = cleaner.read_raw_application_data(path_to_kaggle_data)
df_train, df_test = cleaner.wrangle_application_train_test_data(app_train, app_test)

Raw training data shape:  (307511, 122)
Raw testing data shape:  (48744, 121)
Cleaned training data shape:  (307511, 247)
Cleaned testing data shape:  (48744, 246)


# Model selection

In [3]:
def fit_model(model, app_data, split=None, verbose=False):
    """
    Fit ``model`` on the application data and compute accuracy / AUC.

    Parameters
    ----------
    model: estimator to be fitted to app_data. It must expose
           ``fit(X, y, eval_metric=...)``, ``predict`` and ``predict_proba``.
    app_data: dataframe, subset of the application training data set, containing columns
              'SK_ID_CURR' and 'TARGET'. All other columns are used as features.
              The frame itself is not modified.

    split: string, default None.
           - If None (or any other falsy value), train model on all of app_data.
           - If 'train_test_split', use sklearn.model_selection.train_test_split to hold out
             33% of app_data (random_state=42) and also evaluate on that hold-out set.

    verbose: boolean, default False
            whether or not to print comments during model fitting

    Returns
    -------
    model: the same object that was passed in; ``fit`` is called on it directly.

    Raises
    ------
    ValueError: if ``split`` is truthy but is not 'train_test_split'.
    """
    # Validate up front: previously an unsupported value trained on the full data and then
    # crashed with a NameError on X_test when the hold-out evaluation was attempted.
    if split and split != 'train_test_split':
        raise ValueError("split must be None or 'train_test_split', got %r" % (split,))
    has_holdout = bool(split)

    # drop() returns a new frame, so no defensive copy of app_data is needed.
    X = app_data.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    y = app_data['TARGET']

    # Create train and test data according to specification
    if has_holdout:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    else:
        X_train, y_train = X, y
        X_test = y_test = None

    if verbose:
        print('Size of training data split:', X_train.shape)
        if has_holdout:
            print('Size of test data split:', X_test.shape)
        print('Number of positive labels in training data split:', sum(y_train))
        if has_holdout:
            print('Number of positive labels in test data split:', sum(y_test))

    # Fit model on training data
    # model needs to have this kind of method and kwargs
    model.fit(X_train, y_train, eval_metric = 'auc')

    # Evaluate model on training data
    y_pred_train = model.predict(X_train)
    y_proba_train = model.predict_proba(X_train)[:,1]

    accur_train = metrics.accuracy_score(y_train, y_pred_train)
    auc_train = metrics.roc_auc_score(y_train, y_proba_train)

    # TODO: output confusion matrix, more performance metrics, and AUC curve plot

    if verbose:
        print('\nIn sample performance:')
        print ("  Accuracy on train set : %.3f" % accur_train)
        print ("  AUC on train set: %.3f" % auc_train)
        print('Number of predicted positive labels in training data split:', sum(y_pred_train))

    # Evaluate model on test data if available
    if has_holdout:
        y_pred_test = model.predict(X_test)
        y_proba_test = model.predict_proba(X_test)[:,1]

        accur_test = metrics.accuracy_score(y_test, y_pred_test)
        auc_test = metrics.roc_auc_score(y_test, y_proba_test)

        if verbose:
            print('\nOut of sample performance:')
            # Label fixed: this figure is the hold-out accuracy, not the training accuracy.
            print ("  Accuracy on test set: %.3f" % accur_test)
            print ("  AUC on test set: %.3f" % auc_test)
            print('Number of predicted positive labels in test data split:', sum(y_pred_test))

    return model

## Tune parameter: n_estimators

In [4]:
# Baseline XGBoost classifier whose booster settings are cross-validated in the next cell.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    # Fixed typo: this was `learnig_rate`, which was swallowed as an unknown extra kwarg
    # instead of setting the real `learning_rate` parameter.
    learning_rate=.1,
    min_child_weight=1,   # chosen with the high class imbalance in mind
    max_depth=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    # 1 means the positive class is not re-weighted.
    # NOTE(review): given the class imbalance, consider n_negative / n_positive here.
    scale_pos_weight=1,
    seed=27,
)  # n_estimators = 100 by default

In [5]:
# Booster-level parameters of the sklearn wrapper, in the form xgb.cv expects.
xgb_params = model.get_xgb_params()

# Feature matrix (everything except the id and the label) and labels as plain arrays.
xgb_train = xgb.DMatrix(
    data=df_train.drop(['SK_ID_CURR', 'TARGET'], axis=1).values,
    label=df_train['TARGET'].values,
)

# 5-fold CV on AUC; boosting is capped at the model's n_estimators and stops early
# after 50 rounds without improvement.
cv_result = xgb.cv(
    params=xgb_params,
    dtrain=xgb_train,
    num_boost_round=model.get_params()['n_estimators'],
    nfold=5,
    metrics='auc',
    early_stopping_rounds=50,
)
                  #show_progress=False

In [6]:
# Display the per-round cross-validation AUC table returned by xgb.cv.
cv_result

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.699312,0.010719,0.704158,0.011212
1,0.712421,0.001347,0.716961,0.001704
2,0.717529,0.004410,0.722780,0.004683
3,0.721037,0.003776,0.726051,0.004122
4,0.722524,0.002709,0.728107,0.003322
5,0.725357,0.002329,0.730992,0.001718
6,0.725601,0.001918,0.731457,0.002400
7,0.725414,0.001643,0.731520,0.002363
8,0.726785,0.001595,0.732955,0.001664
9,0.726874,0.001319,0.733250,0.001385


In [7]:
# model selection
# model.set_params(n_estimators=cv_result.shape[0])

# Predict using selected model

## Estimate OOS performance

In [8]:
# Candidate model whose out-of-sample performance is estimated on a hold-out split.
model_select = xgb.XGBClassifier(
    objective='binary:logistic',
    # Fixed typo: this was `learnig_rate`, which was swallowed as an unknown extra kwarg
    # instead of setting the real `learning_rate` parameter.
    learning_rate=.1,
    min_child_weight=1,   # chosen with the high class imbalance in mind
    max_depth=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    # 1 means the positive class is not re-weighted.
    # NOTE(review): given the class imbalance, consider n_negative / n_positive here.
    scale_pos_weight=1,
    seed=27,
)  # n_estimators = 100 by default

# print performance using train_test_split
fit_model(model_select, df_train, split = 'train_test_split', verbose=True)

Size of training data split: (206032, 245)
Size of test data split: (101479, 245)
Number of positive labels in training data split: 16708
Number of positive labels in test data split: 8117

In sample performance:
  Accuracy on train set : 0.920
  AUC on train set: 0.785
Number of predicted positive labels in training data split: 561

Out of sample performance:
  Accuracy on train set: 0.920
  AUC on test set: 0.757
Number of predicted positive labels in test data split: 241


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learnig_rate=0.1, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.8)

## Generate predictions on final test set

In [13]:
def make_predictions(model, app_test, filename_output=None, proba=False):
    """
    Build a submission frame with one TARGET prediction per row of ``app_test``.

    Parameters
    ----------
    model: fitted estimator exposing ``predict`` (and ``predict_proba`` when proba=True)
    app_test: dataframe containing an 'SK_ID_CURR' column; every other column is passed
              to the model as a feature. The frame itself is not modified.
    filename_output: string or None, default None.
              - If given, the submission is also written to this path as CSV (no index).
              - If None, nothing is written to disk.
    proba: boolean, default False.
           - If False, 'TARGET' holds the labels returned by ``model.predict``.
           - If True, 'TARGET' holds ``model.predict_proba(X)[:, 1]``, i.e. the score of
             the second class column.

    Returns
    -------
    submission: dataframe with columns 'SK_ID_CURR' and 'TARGET'
    """
    # drop() returns a new frame, so no defensive copy of app_test is needed.
    X_test = app_test.drop(['SK_ID_CURR'], axis=1)

    if proba:
        y_pred_test = model.predict_proba(X_test)[:, 1]
    else:
        y_pred_test = model.predict(X_test)

    # Make DataFrame for submission
    submission = pd.DataFrame({'SK_ID_CURR': app_test['SK_ID_CURR'], 'TARGET': y_pred_test})

    # The condition used to be inverted (`is None`), so a requested file was never written.
    if filename_output is not None:
        submission.to_csv(filename_output, index = False)
    return submission

In [14]:
# Final model, to be refit on the full training set; n_estimators = 100 by default.
model_select = xgb.XGBClassifier(
    objective='binary:logistic',
    # Fixed typo: this was `learnig_rate`, which was swallowed as an unknown extra kwarg
    # instead of setting the real `learning_rate` parameter.
    learning_rate=.1,
    min_child_weight=1,   # chosen with the high class imbalance in mind
    max_depth=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    # 1 means the positive class is not re-weighted.
    # NOTE(review): given the class imbalance, consider n_negative / n_positive here.
    scale_pos_weight=1,
    seed=27,
)

In [15]:
# Refit on the full training set (split defaults to None, so no hold-out is kept).
# fit_model returns the model object it was given.
model_select = fit_model(model_select, df_train)

In [16]:
# Predict on the cleaned test set; with filename_output=None no CSV file is written,
# the submission frame is only returned.
df_submit = make_predictions(model_select, df_test, filename_output=None)

In [18]:
# Sanity check: how many test rows received each predicted TARGET value.
df_submit['TARGET'].value_counts()

0    48682
1       62
Name: TARGET, dtype: int64