In [1]:
import os
import numpy as np
import pandas as pd 
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#make sure you pip install sklearn_pandas (this is a very useful model)
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn.feature_extraction import DictVectorizer

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')



In [2]:
%matplotlib inline

# Briefing on xgboost

XGBoost uses CART (Classification and Reggression tree)
 
     - each leaf always contains a real-valued score (logistic or classification)
     
     - can later be converted into categories for classification problems
     
Boosting - ensemble meta-algorithm used to convert many weka learners into a strong learner by decreasing variance. 

Cross validation in XGBoost - is robust method to estimate the performance of a model on unseen data. (Different from the scikit-learn's cv). xgboost generates many non-overlapping train/test splits on training data then finally reports the average test set performance across all data splits

DMatrix - makes data inot optimized data structure that xgb devs made to give the package its speedy attribute


# Tuning the model

boosting rounds (number of trees you build
    early stopping helps by automatically selecting the number of boosting rounds for you within xgb.cv()
    
how does early boosting work?
 tests the xgb model after ever boostingg round aggainst a hold out set and stopping the creation of additional rounds (finishing training of the model early) if the hold out metric (e.g. "rmse) does not improve for a given number of rounds. 
 
 note largest number of boosting rounds is 50 then stoping does not occur. 
 
### Some Hyperparmaters explanation

#### for trees
    boosting rounds
    learning rate (aka eta) larger value penalizes feature weights more strongly causing much stronger regularization
    gamma: min loss reduction to create new tree split
    lambda: L2 reg in leaf weights
    alpha: L1 reg on leaf weights
    lambda_bias: L2 reg on weights
    max_depth: max depth per tree
    subsample: % samples used per tree (underfitting if too low, overfitting if too high)
    colsample_bytree: % features used per tree (smaller provides additional regularization, bigger u run into overfitting)

# Loading data

In [3]:
path_to_kaggle_data = '~/Desktop/kaggle_competition/data/'

# Training data
app_train = pd.read_csv(path_to_kaggle_data + 'application_train.csv')
print('Training data shape: ', app_train.shape)

# Testing data features
app_test = pd.read_csv(path_to_kaggle_data + 'application_test.csv')
print('Testing data shape: ', app_test.shape)

Training data shape:  (307511, 122)
Testing data shape:  (48744, 121)


# Main functions (model_fit, grid_search, random_search)

In [9]:
def model_fit(model, dtrain, dtest, filename_output, useTrainCV=True, encoding = 'ohe', cv_folds=5, early_stopping_rounds=50):
    '''
    This preprocesses the data to hot_encodes and optimizes the data, trains the datam and 
    xgboost predicts
    
    model= xgb.XGBClassifier() ; here we use a binary classifier
    
    '''
    
    # Extract the ids
    train_ids = dtrain['SK_ID_CURR']
    test_ids = dtest['SK_ID_CURR']
    
    # Extract the labels for training
    labels = dtrain['TARGET']
    
    # Remove the ids and target
    train_features = dtrain.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = dtest.drop(columns = ['SK_ID_CURR'])
    
    # One Hot Encoding
    if encoding == 'ohe':
        train_features = pd.get_dummies(train_features)
        test_features = pd.get_dummies(test_features)
        
    # Align the dataframes by the columns
        train_features, test_features = train_features.align(test_features, join = 'inner', axis = 1)
        
    # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(train_features):
            if train_features[col].dtype == 'object':
                # Map the categorical features to integers
                train_features[col] = label_encoder.fit_transform(np.array(train_features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', train_features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    features = list(train_features.columns)
    
    # Preprocess data for xgb specifications
    if useTrainCV:
        xgb_params = model.get_xgb_params()
        xgb_train = xgb.DMatrix(train_features.values, label=labels.values)
        xgb_test = xgb.DMatrix(test_features.values)
        cv_result = xgb.cv(xgb_params, xgb_train, num_boost_round= model.get_params()['n_estimators'], 
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds) 
                          #show_progress=False)
        model.set_params(n_estimators=cv_result.shape[0])
    
    # Fit model to training set
    model.fit(train_features, labels, eval_metric = 'auc')
    
    # Predicting training set
    dtrain_pred = model.predict(train_features)
    dtrain_predprob = model.predict_proba(train_features)[:,1]
    
    # Predicting test set
    test_predictions = model.predict(test_features)
    # Uncomment if you have test_labels
    #test_predprob = model.predict_proba(test_features)[:,1]
    
    # Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_pred))
    print ("AUC Score: %f" % metrics.roc_auc_score(labels, dtrain_predprob))
    
    # Make DataFrame for submission
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    submission.to_csv(filename_output, index = False)
    '''
    if the test set had labels then we can use this
    
    # Predicting testing data:
    dtest['predprob'] = model.predict_proba(test_features)[:,1]
    results = test_results.merge(dtest[['SK_ID_CURR,'predprob']], on='SK_ID_CURR')
    print 'AUC Score (Test): %f' % metrics.roc_auc_score(test_labels, results['predprob'])
    '''   
  
    return model



def get_important_feat(model, feat_names):
    '''
    This function is to extract the feature importance
    '''
    from numpy import array
    imp_vals = model.booster().get_fscore()
    imp_dict = {feat_names[i]:float(imp_vals.get('f'+str(i),0.)) for i in range(len(feat_names))}
    total = array(imp_dict.values()).sum()
    return {k:v/total for k,v in imp_dict.items()}



# First version of xgb with data as is.

In [10]:
# Define version 1 xgb 
xgb_v1 = xgb.XGBClassifier(objective='binary:logistic',
                           learnig_rate=.1,
                           n_estimators=10,
                           min_child_weight=1,  # because high class imbalance
                           max_depth=5,
                           gamma=0,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           nthread=4,
                           scale_pos_weight=1,   # because high class imbalance
                           seed=27)

# Call the model_fit
model_fit(xgb_v1, app_train, app_test, filename_output='baseline_xgb.csv')

Training Data Shape:  (307511, 241)
Testing Data Shape:  (48744, 241)

Model Report
Accuracy : 0.9193
AUC Score: 0.736448


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learnig_rate=0.1, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=10, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.8)

In [12]:
## We got a problem

df = pd.read_csv('baseline_xgb.csv')
df['TARGET'].value_counts()

0    48744
Name: TARGET, dtype: int64

# NOTE: Imbalance of data

So we have defaults represented by 1 accounting for only 8% of the training data. We need to resample

In [None]:
print(np.sum(app_train["TARGET"] == 1)/len(app_train["TARGET"]) *100 )

# Geovani's attempt to impute lol

In [None]:
# Create a boolean mask for categorical columns
categorical_feature_mask = features.dtypes == object

# Get list of categorical column names
categorical_columns = features.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = features.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature],Imputer(strategy="median")) for numeric_feature in non_categorical_columns],
                                            input_df=True,
                                            df_out=True
                                           )

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )