In [1]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.cross_validation import KFold
from sklearn import linear_model, datasets
from keras import models
from keras import layers
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [2]:
os.chdir('/home/luke/Desktop/braviant_datachallenge/')
x_train = pd.read_csv('x_train.csv')
y_train = pd.read_csv('y_train.csv',header=None)
x_test = pd.read_csv('x_test.csv')
y_test = pd.read_csv('y_test.csv',header= None)

In [3]:
x_train = x_train.drop(['Unnamed: 0'],axis = 1)
x_test = x_test.drop(['Unnamed: 0'],axis = 1)

In [4]:
x_origin_colnames = list(x_train.columns)

In [5]:
y_train = y_train.iloc[:,1]

Basic hyperparameter tunning

In [39]:
#ada_params = {
#    'n_estimators': [800,1000,1200],
#    'learning_rate' : [0.5,0.8]
#}
#%%time

#mdl = AdaBoostClassifier()

#grid = GridSearchCV(mdl, ada_params,
#                    verbose=1,
#                    cv=5,
#                    n_jobs=2,
#                   scoring='roc_auc')
#grid.fit(x_train, y_train)

#print(grid.best_params_)
#print(grid.best_score_)

#rf_params = {
#    'n_jobs': [-1],
#    'n_estimators': [500,700],
#     'warm_start': [True], 
     #'max_features': [0.2,0.5],
#    'max_depth': [2,6,8],
#    'min_samples_leaf': [2,6,10],
#    'max_features' : ['sqrt'],
#    'verbose': [0]
#}
#%%time
#mdl = RandomForestClassifier()
#grid = GridSearchCV(mdl, rf_params,
#                    verbose=1,
#                    cv=5,
#                    n_jobs=2,
#                   scoring='roc_auc')
#grid.fit(x_train, y_train)

#print(grid.best_params_)
#print(grid.best_score_)

#gb_params = {
#    'n_estimators': [1000],
     #'max_features': 0.2,
#    'max_depth': [10],
#    'min_samples_leaf': [2],
#    'verbose': [0]
#}

#%%time
#mdl = GradientBoostingClassifier()
#grid = GridSearchCV(mdl, gb_params,
#                    verbose=1,
#                    cv=5,
#                    n_jobs=2,
#                   scoring='roc_auc')
#grid.fit(x_train, y_train)
#print(grid.best_params_)
#print(grid.best_score_)

In [6]:
x_train = x_train.values
x_test = x_test.values
y_train = y_train.ravel()

Here we use OOP to make modeling,stacking pipline handy 

In [7]:
ntrain = x_train.shape[0]
ntest = x_test.shape[0]
SEED = 0 
NFOLDS = 5 
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

class modelHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    ''' we use proba to scoring each case for dense evaluation(from categorical to continious)'''
    def predict_proba(self, x):
        return self.clf.predict_proba(x)
    
    def predict(self, x):
        return self.clf.predict(x)
    
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [8]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        #oof_train[test_index] = clf.predict(x_te)
        #oof_test_skf[i, :] = clf.predict(x_test)
        
        
        oof_train[test_index] = clf.predict_proba(x_te)[:,1]
        oof_test_skf[i, :] = clf.predict_proba(x_test)[:,1]

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [9]:

# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 700,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}


# AdaBoost parameters
ada_params = {
    'n_estimators': 1200,
    'learning_rate' : 0.8
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}


In [10]:
rf = modelHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
ada = modelHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = modelHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)

Round1: scoring using different methods

In [11]:
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

  warn("Warm-start fitting without increasing n_estimators does not "


In [12]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()

Unnamed: 0,RandomForest,AdaBoost,GradientBoost
0,0.82523,0.501969,0.999603
1,0.939479,0.501963,0.994406
2,0.474789,0.500101,0.311726
3,0.323611,0.500234,0.908021
4,0.861045,0.501301,0.995707


In [13]:
#Reserve a full variable dataframe for lr
x_train_use = x_train
x_test_use = x_test

In [14]:
x_train = np.concatenate(( rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate(( rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

In [15]:
x_train_use = np.concatenate((x_train_use,x_train),axis =1)
x_test_use = np.concatenate((x_test_use,x_test),axis =1)

Round2: Either lgbm(performance base) or LR(business base) then optional(NN)

In [16]:
# Create parameters to search
lgbm_params = {
    'learning_rate': [0.3],
    'n_estimators': [400],
    'num_leaves': [2],
    'boosting_type' : ['dart'],
    'objective' : ['binary'],
    'random_state' : [666], # Updated from 'seed'
    'colsample_bytree' : [0.5],
    'subsample' : [0.3],
    'reg_alpha' : [0.35],
    'reg_lambda' : [0.5],
    'silent' : ['True']
    }

In [17]:
%%time

mdl = lgb.LGBMClassifier()

grid= GridSearchCV(mdl, lgbm_params,
                    verbose=1,
                    cv=5,
                    n_jobs=4,
                   scoring='roc_auc')
grid.fit(x_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'boosting_type': 'dart', 'colsample_bytree': 0.5, 'learning_rate': 0.3, 'n_estimators': 400, 'num_leaves': 2, 'objective': 'binary', 'random_state': 666, 'reg_alpha': 0.35, 'reg_lambda': 0.5, 'silent': 'True', 'subsample': 0.3}
0.9750595128597904
CPU times: user 1.49 s, sys: 18.9 ms, total: 1.51 s
Wall time: 5 s


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    4.8s finished


In [18]:
y_test_lgbm = grid.predict(x_test)

  if diff:


In [19]:
roc_auc_score(y_test.iloc[:,1], y_test_lgbm)

0.9362627632206008

In [40]:
# Create parameters to search
lr_params = {
    'penalty': ['l1','l2'],
    'C': np.logspace(0,4,10),
    }

In [41]:
%%time

mdl = linear_model.LogisticRegression()

grid = GridSearchCV(mdl, lr_params,
                    verbose=1,
                    cv=5,
                    n_jobs=4,
                   scoring='roc_auc')
grid.fit(x_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'C': 7.742636826811269, 'penalty': 'l1'}
0.9733913357526235
CPU times: user 111 ms, sys: 73.4 ms, total: 184 ms
Wall time: 1.95 s


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.8s finished


In [42]:
y_test_lr = grid.predict(x_test)

In [43]:
roc_auc_score(y_test.iloc[:,1], y_test_lr)

0.9340675434657613

In [None]:
#Create a original variable dataframe with different algorithm scoring

In [44]:
lr_use_x_train = np.concatenate((x_train_use,x_train),axis = 1)
lr_use_x_test = np.concatenate((x_test_use,x_test),axis = 1)
lr_use_x_train = pd.DataFrame(lr_use_x_train)
lr_use_x_test = pd.DataFrame(lr_use_x_test)
x_origin_colnames.extend(('sc1','sc2','sc3'))
lr_use_x_train.columns = [x_origin_colnames]
lr_use_x_test.columns = [x_origin_colnames]

In [45]:
lr_explain = linear_model.LogisticRegression()
lr_explain.fit(lr_use_x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
lr_explain.coef_[0]

array([-7.30600451e-04,  1.78333076e-01, -3.13424073e-03, -8.80246200e-01,
       -1.26461457e-01,  1.20681348e-02, -1.03453592e-02,  9.35449023e-02,
       -4.52074466e-02,  2.40393043e+00, -9.02180780e-01,  3.27031468e+00,
        2.40393043e+00, -9.02180780e-01,  3.27031468e+00])

In [47]:
Parameter_tbl = pd.DataFrame( {'Feature':x_origin_colnames ,
     'Parameter': lr_explain.coef_[0]
    })


In [48]:
Parameter_tbl.sort_values(by=['Parameter'])

Unnamed: 0,Feature,Parameter
10,sc2,-0.902181
13,sc2,-0.902181
3,x24,-0.880246
4,x25,-0.126461
8,bin_positive,-0.045207
6,x27,-0.010345
2,x2,-0.003134
0,x0,-0.000731
5,x26,0.012068
7,bin_negative,0.093545


In [49]:
y_test_lr_explain = lr_explain.predict(lr_use_x_test)

In [50]:
roc_auc_score(y_test.iloc[:,1], y_test_lr_explain)

0.933734650377313

In [51]:
%%time
# Function to create model, required for KerasClassifier
def create_model(optimizer, init,reg):
    model = models.Sequential()
    model.add(layers.Dense(64, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg),input_shape=(3,)))
    model.add(layers.Dense(64, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg),activation='relu'))
    model.add(layers.Dense(1, kernel_initializer=init, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer=optimizer, metrics=['accuracy'])
    return(model)
model = KerasClassifier(build_fn=create_model, verbose=0)

# grid search epochs, batch size and optimizer
optimizers = [ 'adam']
init = ['normal']
reg = [0.01]
epochs = [20]
batches = [500]

param_grid = dict(optimizer = optimizers, init = init,epochs = epochs,batch_size = batches,reg =reg)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc')
grid_result = grid.fit(x_train, y_train)

CPU times: user 4.94 s, sys: 353 ms, total: 5.29 s
Wall time: 3.7 s


In [52]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
	print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.971674 using {'batch_size': 500, 'epochs': 20, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.01}
0.971674 (0.001948) with: {'batch_size': 500, 'epochs': 20, 'init': 'normal', 'optimizer': 'adam', 'reg': 0.01}


In [53]:
y_test_nn = grid.predict(x_test)

In [54]:
roc_auc_score(y_test.iloc[:,1], y_test_nn)

0.9327408041396494