In [1]:
## Import packages

import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold # cross_validation is not available -> module_selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

In [20]:
NFOLDS = 3
SEED = 0
NROWS = None

In [21]:
data = pd.read_csv('/Users/hayeon.c/24-1-adv-study/w2/application_train.csv')
test = pd.read_csv('/Users/hayeon.c/24-1-adv-study/w2/application_test.csv')
prev = pd.read_csv('/Users/hayeon.c/24-1-adv-study/w2/previous_application.csv')

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB


In [23]:
prev.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [24]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object' #data type이 'object'면 categorial feature
]

for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_]) # 범주형 데이터를 수치형 데이터로 변환한다. 반환값은 변환된 숫자로 구성된 넘파이 어레이와 변환에 사용된 순서대로의 범주형 데이터로 구성된 인덱스로 구성된 튜플
    test[f_] = indexer.get_indexer(test[f_]) # 컬럼 이름을 인자로 받으면 거기에 해당하는 인덱스 번호를 반환 -> training 데이터에 사용된 인코딩이 test 데이터에 적용되도록

gc.enable() # 자동 가비지 수거 활성화 - 메모리 관리

y_train = data['TARGET']
del data['TARGET'] # y_train의 feature와 data에 있는 라벨을 별개로 구성해야 하기 때문에

In [25]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

avg_prev = prev.groupby('SK_ID_CURR').mean() # customer ID - avg values of previous applications features
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count() # count previous applications
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV'] 
del avg_prev['SK_ID_PREV']

In [34]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

excluded_feats = ['SK_ID_CURR'] # identifier이고 예측에 유의하지 않으므로
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED) # KFold cross-validator 교차검증 셋업

In [35]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [37]:
class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [38]:
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [39]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train) # DMatrix: np 입력 파라미터를 받아서 만들어지는 XGBoost의 전용 데이터 세트. 주요 입력 파라미터는 data와 label입니다. 여기서 data는 피처 데이터 세트, label은 분류의 경우 레이블 데이터, 회귀의 경우 종속값 데이터입니다.
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [40]:
def get_oof(clf): # out-of-fold
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

        oof_test[:] = oof_test_skf.mean(axis=0)
        return oof_train.reshape(-1,1), oof_test.reshape(-1, 1)

In [None]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators':200,
    'learning_rate':0.1,
    'num_leaves':123,
    'colsample_bytree':0.8,
    'subsample':0.9,
    'max_depth':15,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':2    
}

In [42]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)

xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

Parameters: { "silent" } are not used.



0:	total: 84.4ms	remaining: 16.8s
1:	total: 106ms	remaining: 10.5s
2:	total: 124ms	remaining: 8.12s
3:	total: 149ms	remaining: 7.3s
4:	total: 171ms	remaining: 6.67s
5:	total: 194ms	remaining: 6.28s
6:	total: 216ms	remaining: 5.94s
7:	total: 238ms	remaining: 5.72s
8:	total: 260ms	remaining: 5.51s
9:	total: 278ms	remaining: 5.28s
10:	total: 298ms	remaining: 5.11s
11:	total: 332ms	remaining: 5.2s
12:	total: 355ms	remaining: 5.11s
13:	total: 375ms	remaining: 4.98s
14:	total: 394ms	remaining: 4.86s
15:	total: 413ms	remaining: 4.75s
16:	total: 436ms	remaining: 4.7s
17:	total: 456ms	remaining: 4.61s
18:	total: 475ms	remaining: 4.53s
19:	total: 498ms	remaining: 4.48s
20:	total: 523ms	remaining: 4.46s
21:	total: 540ms	remaining: 4.37s
22:	total: 560ms	remaining: 4.31s
23:	total: 579ms	remaining: 4.24s
24:	total: 598ms	remaining: 4.19s
25:	total: 618ms	remaining: 4.13s
26:	total: 638ms	remaining: 4.09s
27:	total: 659ms	remaining: 4.05s
28:	total: 677ms	remaining: 3.99s
29:	total: 698ms	remaining

In [43]:
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))

XG-CV: 0.27633787146735905
ET-CV: 0.2773447940364592
RF-CV: 0.2773165455933669
RF-CV: 0.3006819990248219


In [44]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

logistic_regression = LogisticRegression()
logistic_regression.fit(x_train,y_train)

test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]

test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv', index=False, float_format='%.8f')

(307511, 4),(48744, 4)
