In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
# from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

import warnings 
warnings.filterwarnings('ignore')

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
NFOLDS = 3
SEED = 0
NROWS = None

In [3]:
data = pd.read_csv('application_train.csv')
test = pd.read_csv('application_test.csv')
prev = pd.read_csv('previous_application.csv')

In [4]:
categorical_feats = [f for f in data.columns if data[f].dtype == 'object']

for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])  # pd.factorize() 는 2가지 값을 반환. 첫번째 값은 정수로 인코딩된 값, 두번째 값은 인코딩된 범주
    test[f_] = indexer.get_indexer(test[f_])  # 반환된 인덱서로 test 데이터에 동일한 기준으로 적용하여 값만 반환
    
gc.enable()

y_train = data['TARGET']
del data['TARGET']

In [5]:
prev_cat_features = [f_ for f_ in prev.columns if prev[f_].dtype == 'object']
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])  # _는 값을 무시하고 싶은 경우 사용
    
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()  # 그룹별 SK_ID_PREV의 count
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [6]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test= x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [7]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
        
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [8]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))  # X_train 행 개수 만큼 numpy 배열 생성
    oof_test = np.zeros((ntest,))  # X_test 행 개수 만큼 numpy 배열 생성
    oof_test_skf = np.empty((NFOLDS, ntest))  # 인덱스 번호와 컬렉션의 원소를 tuple 형태로 반환

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [9]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators':200,
    'learning_rate':0.1,
    'num_leaves':123,
    'colsample_bytree':0.8,
    'subsample':0.9,
    'max_depth':15,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':2    
}

In [10]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)

In [11]:
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

0:	total: 225ms	remaining: 44.8s
1:	total: 280ms	remaining: 27.7s
2:	total: 329ms	remaining: 21.6s
3:	total: 385ms	remaining: 18.8s
4:	total: 440ms	remaining: 17.2s
5:	total: 490ms	remaining: 15.8s
6:	total: 537ms	remaining: 14.8s
7:	total: 579ms	remaining: 13.9s
8:	total: 627ms	remaining: 13.3s
9:	total: 678ms	remaining: 12.9s
10:	total: 728ms	remaining: 12.5s
11:	total: 787ms	remaining: 12.3s
12:	total: 851ms	remaining: 12.2s
13:	total: 920ms	remaining: 12.2s
14:	total: 987ms	remaining: 12.2s
15:	total: 1.04s	remaining: 12s
16:	total: 1.09s	remaining: 11.7s
17:	total: 1.15s	remaining: 11.6s
18:	total: 1.21s	remaining: 11.5s
19:	total: 1.26s	remaining: 11.3s
20:	total: 1.32s	remaining: 11.2s
21:	total: 1.38s	remaining: 11.2s
22:	total: 1.44s	remaining: 11.1s
23:	total: 1.5s	remaining: 11s
24:	total: 1.56s	remaining: 10.9s
25:	total: 1.62s	remaining: 10.8s
26:	total: 

28:	total: 1.73s	remaining: 10.2s
29:	total: 1.79s	remaining: 10.1s
30:	total: 1.84s	remaining: 10s
31:	total: 1.89s	remaining: 9.95s
32:	total: 1.94s	remaining: 9.83s
33:	total: 1.99s	remaining: 9.73s
34:	total: 2.04s	remaining: 9.61s
35:	total: 2.09s	remaining: 9.52s
36:	total: 2.14s	remaining: 9.45s
37:	total: 2.19s	remaining: 9.34s
38:	total: 2.24s	remaining: 9.26s
39:	total: 2.29s	remaining: 9.18s
40:	total: 2.36s	remaining: 9.16s
41:	total: 2.42s	remaining: 9.12s
42:	total: 2.49s	remaining: 9.09s
43:	total: 2.55s	remaining: 9.04s
44:	total: 2.6s	remaining: 8.95s
45:	total: 2.65s	remaining: 8.88s
46:	total: 2.7s	remaining: 8.79s
47:	total: 2.76s	remaining: 8.74s
48:	total: 2.82s	remaining: 8.7s
49:	total: 2.88s	remaining: 8.64s
50:	total: 2.94s	remaining: 8.59s
51:	total: 3s	remaining: 8.53s
52:	total: 3.07s	remaining: 8.52s
53:	total: 3.13s	remaining: 8.45s
54:	total: 3.18s	remaining: 8.39s
55:	total: 3.24s	remaining: 8.33s
56:	total: 3.3s	remaining: 8.27s
57:	total: 3.36s	remain

69:	total: 5.63s	remaining: 10.5s
70:	total: 5.7s	remaining: 10.4s
71:	total: 5.76s	remaining: 10.2s
72:	total: 5.83s	remaining: 10.1s
73:	total: 5.91s	remaining: 10.1s
74:	total: 5.98s	remaining: 9.97s
75:	total: 6.06s	remaining: 9.88s
76:	total: 6.15s	remaining: 9.82s
77:	total: 6.21s	remaining: 9.72s
78:	total: 6.27s	remaining: 9.61s
79:	total: 6.33s	remaining: 9.49s
80:	total: 6.39s	remaining: 9.38s
81:	total: 6.45s	remaining: 9.28s
82:	total: 6.5s	remaining: 9.17s
83:	total: 6.56s	remaining: 9.06s
84:	total: 6.63s	remaining: 8.97s
85:	total: 6.71s	remaining: 8.9s
86:	total: 6.78s	remaining: 8.81s
87:	total: 6.86s	remaining: 8.72s
88:	total: 6.93s	remaining: 8.64s
89:	total: 7s	remaining: 8.56s
90:	total: 7.09s	remaining: 8.49s
91:	total: 7.19s	remaining: 8.44s
92:	total: 7.27s	remaining: 8.37s
93:	total: 7.35s	remaining: 8.29s
94:	total: 7.43s	remaining: 8.21s
95:	total: 7.5s	remaining: 8.13s
96:	total: 7.58s	remaining: 8.05s
97:	total: 7.65s	remaining: 7.96s
98:	total: 7.72s	rema

In [12]:
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))

XG-CV: 0.2596954424083137
ET-CV: 0.26296502347137407
RF-CV: 0.2629856213006646
RF-CV: 0.331012765614397


In [13]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

(307511, 4),(48744, 4)


In [14]:
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train,y_train)

test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]

test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv', index=False, float_format='%.8f')