# Stacking Test-Sklearn, XGBoost, CatBoost, LightGBM

참고 : https://www.kaggle.com/eliotbarr/stacking-test-sklearn-xgboost-catboost-lightgbm/code

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

# Number of K-fold splits; also the number of per-fold test predictions averaged in get_oof.
NFOLDS = 3
# Seed passed to KFold and to every model wrapper.
SEED = 0
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
NROWS = None

In [2]:
# Load the three input tables from ../input.
data = pd.read_csv('../input/application_train.csv')
test = pd.read_csv('../input/application_test.csv')
prev = pd.read_csv('../input/previous_application.csv')

In [3]:
# Label-encode every object-typed column of the training frame and reuse the
# resulting code table for the matching test column.
categorical_feats = [col for col in data.columns if data[col].dtype == 'object']

for col in categorical_feats:
    codes, uniques = pd.factorize(data[col])
    data[col] = codes
    # Look each test value up in the training categories; unseen values get -1.
    test[col] = uniques.get_indexer(test[col])

In [4]:
gc.enable()

# Detach the label column from the training frame in one step.
y_train = data.pop('TARGET')

In [5]:
# Integer-encode the object-typed columns of `prev` so that every column is
# numeric before the per-id averaging that follows.
prev_cat_features = [col for col in prev.columns if prev[col].dtype == 'object']

for col in prev_cat_features:
    # Only the integer codes are kept; the table of unique values is discarded.
    prev[col] = pd.factorize(prev[col])[0]

In [6]:
# Per-SK_ID_CURR mean of every column in `prev`.
avg_prev = prev.groupby('SK_ID_CURR').mean()
# Per-SK_ID_CURR count of previous-application ids.
cnt_prev = prev.groupby('SK_ID_CURR')[['SK_ID_PREV']].count()
# Store the count as `nb_app` and drop the averaged SK_ID_PREV column.
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
avg_prev = avg_prev.drop(columns=['SK_ID_PREV'])

In [7]:
# Left-join the aggregated previous-application features onto the train and
# test frames, keyed on SK_ID_CURR.
x_train = pd.merge(data, avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = pd.merge(test, avg_prev.reset_index(), how='left', on='SK_ID_CURR')

In [8]:
# Replace missing values with 0 in both frames.
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

In [9]:
# Row counts of the train and test frames; get_oof uses them to size its arrays.
ntrain = len(x_train)
ntest = len(x_test)

In [10]:
# Columns that must not be used as model inputs.
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]  # every column except the id

x_train = x_train[features]  # keep only the non-id columns
x_test = x_test[features]

In [11]:
# Shuffled K-fold splitter shared by every get_oof call; seeded with SEED.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

In [12]:
# Adapter exposing a classifier through a train/predict interface.

class SklearnWrapper(object):
    """Wrap a classifier class behind ``train``/``predict``.

    Args:
        clf: Classifier class (not an instance); instantiated as
            ``clf(**params)``.
        seed: Value stored under ``params['random_state']``.
        params: Keyword arguments for ``clf``. The mapping is copied, so the
            caller's dict is never modified. ``None`` means no extra
            arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy so the caller's dict is not mutated, and so the default
        # ``params=None`` no longer fails on item assignment.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return column 1 of the wrapped classifier's ``predict_proba(x)``."""
        return self.clf.predict_proba(x)[:, 1]

In [13]:
# Adapter exposing a CatBoost-style classifier through a train/predict interface.

class CatboostWrapper(object):
    """Wrap a CatBoost-style classifier class behind ``train``/``predict``.

    Args:
        clf: Classifier class (not an instance); instantiated as
            ``clf(**params)``.
        seed: Value stored under ``params['random_seed']``.
        params: Keyword arguments for ``clf``. The mapping is copied, so the
            caller's dict is never modified. ``None`` means no extra
            arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy so the caller's dict is not mutated, and so the default
        # ``params=None`` no longer fails on item assignment.
        params = dict(params) if params is not None else {}
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return column 1 of the wrapped classifier's ``predict_proba(x)``."""
        return self.clf.predict_proba(x)[:, 1]

In [14]:
# Adapter exposing a LightGBM-style classifier through a train/predict interface.

class LightGBMWrapper(object):
    """Wrap a LightGBM-style classifier class behind ``train``/``predict``.

    Args:
        clf: Classifier class (not an instance); instantiated as
            ``clf(**params)``.
        seed: Value stored under both ``params['feature_fraction_seed']`` and
            ``params['bagging_seed']``.
        params: Keyword arguments for ``clf``. The mapping is copied, so the
            caller's dict is never modified. ``None`` means no extra
            arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy so the caller's dict is not mutated, and so the default
        # ``params=None`` no longer fails on item assignment.
        params = dict(params) if params is not None else {}
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return column 1 of the wrapped classifier's ``predict_proba(x)``."""
        return self.clf.predict_proba(x)[:, 1]

In [15]:
# Adapter exposing xgboost's native training API through a train/predict interface.

class XgbWrapper(object):
    """Train and predict with ``xgb.train`` behind ``train``/``predict``.

    Args:
        seed: Value stored under ``param['seed']``.
        params: Booster parameters. An optional ``'nrounds'`` entry gives the
            number of boosting rounds (default 250) and is not forwarded to
            ``xgb.train`` as a parameter. The mapping is copied, so the
            caller's dict is never modified. ``None`` means no extra
            parameters.
    """

    def __init__(self, seed=0, params=None):
        # Work on a copy: popping 'nrounds' from the caller's dict would make
        # a second wrapper built from the same dict silently fall back to the
        # 250-round default, and ``params=None`` would fail outright.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        # Remove 'nrounds' from the booster parameters; 250 is used when the
        # key is absent.
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [16]:
# Out-of-fold (OOF) prediction helper: trains the given model once per fold,
# predicting the held-out rows and the full test set each time.
def get_oof(clf):
    """Build out-of-fold train predictions and fold-averaged test predictions.

    Reads the module-level ``x_train``, ``y_train``, ``x_test``, ``kf``,
    ``ntrain``, ``ntest`` and ``NFOLDS``.

    Args:
        clf: Object exposing ``train(x, y)`` and ``predict(x)``.

    Returns:
        Tuple ``(oof_train, oof_test)`` of arrays reshaped to ``(ntrain, 1)``
        and ``(ntest, 1)``. ``oof_train`` holds, for each training row, the
        prediction from the fold in which that row was held out;
        ``oof_test`` is the mean of the per-fold test predictions.
    """
    oof_train = np.zeros((ntrain,))
    # One row of test predictions per fold; averaged after the loop.
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        # KFold.split yields positional indices, so rows are selected with
        # .iloc; label-based .loc only matches when the index is 0..n-1.
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)

    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [17]:
# Hyper-parameters for each base model, one dict per model.
et_params = dict(
    n_estimators=200,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
)
rf_params = dict(
    n_estimators=200,
    max_features=0.2,
    max_depth=12,
    min_samples_leaf=2,
)
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    subsample=0.7,
    learning_rate=0.075,
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=4,
    num_parallel_tree=1,
    min_child_weight=1,
    nrounds=200,
)
catboost_params = dict(
    iterations=200,
    learning_rate=0.5,
    depth=3,
    l2_leaf_reg=40,
    bootstrap_type='Bernoulli',
    subsample=0.7,
    scale_pos_weight=5,
    eval_metric='AUC',
    od_type='Iter',
    allow_writing_files=False,
)
lightgbm_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    num_leaves=123,
    colsample_bytree=0.8,
    subsample=0.9,
    max_depth=15,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_split_gain=0.01,
    min_child_weight=2,
)

In [18]:
# Build one wrapper per base model, each seeded with SEED and configured from
# its parameter dict.
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf=CatBoostClassifier, seed=SEED, params=catboost_params)
lg = LightGBMWrapper(clf=LGBMClassifier, seed=SEED, params=lightgbm_params)

In [19]:
# Out-of-fold train predictions and fold-averaged test predictions per base model.
# NOTE(review): the `lg` (LightGBM) wrapper is constructed but never passed to
# get_oof — confirm whether leaving it out of the stack is intentional.
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

0:	total: 287ms	remaining: 57.2s
1:	total: 410ms	remaining: 40.6s
2:	total: 517ms	remaining: 33.9s
3:	total: 684ms	remaining: 33.5s
4:	total: 849ms	remaining: 33.1s
5:	total: 1.05s	remaining: 34s
6:	total: 1.24s	remaining: 34.1s
7:	total: 1.37s	remaining: 32.9s
8:	total: 1.52s	remaining: 32.3s
9:	total: 1.66s	remaining: 31.5s
10:	total: 1.79s	remaining: 30.8s
11:	total: 1.93s	remaining: 30.2s
12:	total: 2.08s	remaining: 30s
13:	total: 2.24s	remaining: 29.8s
14:	total: 2.42s	remaining: 29.9s
15:	total: 2.58s	remaining: 29.7s
16:	total: 2.73s	remaining: 29.4s
17:	total: 2.9s	remaining: 29.3s
18:	total: 3.03s	remaining: 28.9s
19:	total: 3.19s	remaining: 28.7s
20:	total: 3.33s	remaining: 28.4s
21:	total: 3.47s	remaining: 28.1s
22:	total: 3.6s	remaining: 27.7s
23:	total: 3.75s	remaining: 27.5s
24:	total: 3.88s	remaining: 27.2s
25:	total: 4.02s	remaining: 26.9s
26:	total: 4.17s	remaining: 26.7s
27:	total: 4.39s	remaining: 27s
28:	total: 4.6s	remaining: 27.1s
29:	total: 4.74s	remaining: 26.9s

43:	total: 5.04s	remaining: 17.9s
44:	total: 5.14s	remaining: 17.7s
45:	total: 5.23s	remaining: 17.5s
46:	total: 5.35s	remaining: 17.4s
47:	total: 5.45s	remaining: 17.3s
48:	total: 5.55s	remaining: 17.1s
49:	total: 5.64s	remaining: 16.9s
50:	total: 5.76s	remaining: 16.8s
51:	total: 5.87s	remaining: 16.7s
52:	total: 6.02s	remaining: 16.7s
53:	total: 6.17s	remaining: 16.7s
54:	total: 6.3s	remaining: 16.6s
55:	total: 6.42s	remaining: 16.5s
56:	total: 6.54s	remaining: 16.4s
57:	total: 6.64s	remaining: 16.3s
58:	total: 6.75s	remaining: 16.1s
59:	total: 6.86s	remaining: 16s
60:	total: 6.97s	remaining: 15.9s
61:	total: 7.06s	remaining: 15.7s
62:	total: 7.15s	remaining: 15.5s
63:	total: 7.24s	remaining: 15.4s
64:	total: 7.32s	remaining: 15.2s
65:	total: 7.42s	remaining: 15.1s
66:	total: 7.53s	remaining: 14.9s
67:	total: 7.64s	remaining: 14.8s
68:	total: 7.73s	remaining: 14.7s
69:	total: 7.84s	remaining: 14.6s
70:	total: 7.95s	remaining: 14.4s
71:	total: 8.04s	remaining: 14.3s
72:	total: 8.14s	

85:	total: 8.35s	remaining: 11.1s
86:	total: 8.48s	remaining: 11s
87:	total: 8.6s	remaining: 10.9s
88:	total: 8.69s	remaining: 10.8s
89:	total: 8.8s	remaining: 10.8s
90:	total: 8.89s	remaining: 10.7s
91:	total: 8.97s	remaining: 10.5s
92:	total: 9.04s	remaining: 10.4s
93:	total: 9.15s	remaining: 10.3s
94:	total: 9.23s	remaining: 10.2s
95:	total: 9.3s	remaining: 10.1s
96:	total: 9.39s	remaining: 9.97s
97:	total: 9.46s	remaining: 9.85s
98:	total: 9.55s	remaining: 9.74s
99:	total: 9.64s	remaining: 9.64s
100:	total: 9.73s	remaining: 9.54s
101:	total: 9.83s	remaining: 9.45s
102:	total: 9.93s	remaining: 9.35s
103:	total: 10s	remaining: 9.24s
104:	total: 10.1s	remaining: 9.13s
105:	total: 10.2s	remaining: 9.02s
106:	total: 10.3s	remaining: 8.91s
107:	total: 10.3s	remaining: 8.81s
108:	total: 10.4s	remaining: 8.7s
109:	total: 10.5s	remaining: 8.59s
110:	total: 10.6s	remaining: 8.5s
111:	total: 10.7s	remaining: 8.39s
112:	total: 10.8s	remaining: 8.31s
113:	total: 10.9s	remaining: 8.19s
114:	tota

In [21]:
# Print the RMSE between the labels and each base model's out-of-fold
# predictions, one line per model.
for label, oof in (
    ('XG', xg_oof_train),
    ('ET', et_oof_train),
    ('RF', rf_oof_train),
    ('CB', cb_oof_train),  # CatBoost score; previously printed under the 'RF-CV' label
):
    print('{}-CV: {}'.format(label, sqrt(mean_squared_error(y_train, oof))))

XG-CV: 0.25969546149690725
ET-CV: 0.26296502347137407
RF-CV: 0.2629856213006646
RF-CV: 0.33101423487560705


In [22]:
# Second-level features: the four base models' out-of-fold columns placed
# side by side, one column per model. This rebinds x_train / x_test.
x_train = np.hstack([xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train])
x_test = np.hstack([xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test])

print('{},{}'.format(x_train.shape, x_test.shape))

(307511, 4),(48744, 4)


In [23]:
# Meta-model: logistic regression with default settings, fitted on the stacked
# out-of-fold predictions.
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

LogisticRegression()

In [24]:
# Score the stacked test features; column 1 of predict_proba is stored as TARGET.
test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]

# Write the id and prediction columns to CSV, without the index and with
# floats formatted to 8 decimal places.
test[['SK_ID_CURR','TARGET']].to_csv('home_3_sion.csv',index=False,float_format='%.8f')