In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import LogisticRegression
from math import sqrt
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

In [2]:
NFOLDS = 3
SEED = 0
NROWS = None

In [3]:
data = pd.read_csv('./input/application_train.csv')
test = pd.read_csv('./input/application_test.csv')
prev = pd.read_csv('./input/previous_application.csv')

In [4]:
categorical_feats = [f for f in data.columns if data[f].dtype == 'object']

In [5]:
for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])
    test[f_] = indexer.get_indexer(test[f_])

In [6]:
gc.enable()

In [7]:
y_train = data['TARGET']
del data['TARGET']

In [8]:
prev_cat_features = [f_ for f_ in prev.columns if prev[f_].dtype == 'object']
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

In [9]:
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [10]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

In [11]:
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

In [12]:
ntrain = x_train.shape[0]
ntest = x_test.shape[0]

In [13]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

In [14]:
x_train = x_train[features]
x_test = x_test[features]

In [15]:
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

In [16]:
class SKlearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
    
class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
    
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
    
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)
        
    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)
        
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [17]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]
        
        clf.train(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
        
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [21]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'num_leaves': 123,
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'max_depth': 15,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_split_gain': 0.01,
    'min_child_weight': 2
}

In [22]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SKlearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SKlearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = SKlearnWrapper(clf=CatBoostClassifier, seed=SEED, params=catboost_params)
lg = SKlearnWrapper(clf=LGBMClassifier, seed=SEED, params=lightgbm_params)

In [23]:
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0:	total: 183ms	remaining: 36.5s
1:	total: 206ms	remaining: 20.4s
2:	total: 226ms	remaining: 14.8s
3:	total: 248ms	remaining: 12.1s
4:	total: 270ms	remain

184:	total: 4.37s	remaining: 354ms
185:	total: 4.39s	remaining: 330ms
186:	total: 4.41s	remaining: 307ms
187:	total: 4.44s	remaining: 283ms
188:	total: 4.46s	remaining: 259ms
189:	total: 4.48s	remaining: 236ms
190:	total: 4.5s	remaining: 212ms
191:	total: 4.53s	remaining: 189ms
192:	total: 4.55s	remaining: 165ms
193:	total: 4.57s	remaining: 141ms
194:	total: 4.6s	remaining: 118ms
195:	total: 4.62s	remaining: 94.3ms
196:	total: 4.64s	remaining: 70.7ms
197:	total: 4.67s	remaining: 47.1ms
198:	total: 4.69s	remaining: 23.6ms
199:	total: 4.71s	remaining: 0us
0:	total: 23.9ms	remaining: 4.75s
1:	total: 45.2ms	remaining: 4.47s
2:	total: 66.3ms	remaining: 4.36s
3:	total: 88.4ms	remaining: 4.33s
4:	total: 112ms	remaining: 4.35s
5:	total: 134ms	remaining: 4.33s
6:	total: 156ms	remaining: 4.31s
7:	total: 182ms	remaining: 4.36s
8:	total: 203ms	remaining: 4.32s
9:	total: 224ms	remaining: 4.26s
10:	total: 245ms	remaining: 4.21s
11:	total: 266ms	remaining: 4.17s
12:	total: 289ms	remaining: 4.16s
13:	

26:	total: 611ms	remaining: 3.91s
27:	total: 639ms	remaining: 3.93s
28:	total: 660ms	remaining: 3.89s
29:	total: 683ms	remaining: 3.87s
30:	total: 706ms	remaining: 3.85s
31:	total: 725ms	remaining: 3.81s
32:	total: 747ms	remaining: 3.78s
33:	total: 769ms	remaining: 3.75s
34:	total: 791ms	remaining: 3.73s
35:	total: 812ms	remaining: 3.7s
36:	total: 835ms	remaining: 3.68s
37:	total: 858ms	remaining: 3.66s
38:	total: 882ms	remaining: 3.64s
39:	total: 902ms	remaining: 3.61s
40:	total: 926ms	remaining: 3.59s
41:	total: 947ms	remaining: 3.56s
42:	total: 968ms	remaining: 3.54s
43:	total: 993ms	remaining: 3.52s
44:	total: 1.01s	remaining: 3.5s
45:	total: 1.04s	remaining: 3.47s
46:	total: 1.06s	remaining: 3.45s
47:	total: 1.08s	remaining: 3.42s
48:	total: 1.1s	remaining: 3.4s
49:	total: 1.13s	remaining: 3.39s
50:	total: 1.15s	remaining: 3.37s
51:	total: 1.17s	remaining: 3.34s
52:	total: 1.19s	remaining: 3.31s
53:	total: 1.22s	remaining: 3.29s
54:	total: 1.24s	remaining: 3.27s
55:	total: 1.26s	r

In [24]:
print('XG-CV: {}'.format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print('XG-CV: {}'.format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print('XG-CV: {}'.format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print('XG-CV: {}'.format(sqrt(mean_squared_error(y_train, xg_oof_train))))

XG-CV: 0.25969546149690725
XG-CV: 0.25969546149690725
XG-CV: 0.25969546149690725
XG-CV: 0.25969546149690725


In [25]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

In [26]:
print('{}, {}'.format(x_train.shape, x_test.shape))

(307511, 4), (48744, 4)


In [27]:
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

LogisticRegression()

In [28]:
test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]

In [29]:
test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv', index=False, float_format='%.8f')