In [1]:
# Number of boosting rounds; passed to XGBClassifier as n_estimators.
MAX_ROUNDS = 400
# When True, every fold is fit with early stopping against its validation
# split and the best round count / score are printed; when False the model is
# simply fit for MAX_ROUNDS rounds.
OPTIMIZE_ROUNDS = False
# Passed to XGBClassifier as learning_rate.
LEARNING_RATE = 0.07
# Early-stopping patience; only read when OPTIMIZE_ROUNDS is True.
EARLY_STOPPING_ROUNDS = 50  

- 처음에는 MAX_ROUNDS를 충분히 높게 설정하고 OPTIMIZE_ROUNDS를 True로 두어, early stopping 결과로 적절한 round 수를 가늠하는 것을 추천합니다. 그런 다음 OPTIMIZE_ROUNDS를 False로 되돌리고 MAX_ROUNDS를 그 round 수로 설정합니다.

In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc

In [3]:
# Gini computation
def eval_gini(y_true, y_prob):
    """Return the normalized Gini coefficient of scores against labels.

    Rows are ranked by ``y_prob``; for every positive label the number of
    negatives ranked above it is counted, and the total is normalized by
    ``n_pos * n_neg``.  A perfect ranking gives 1, a perfectly inverted
    ranking gives -1.

    Parameters
    ----------
    y_true : array-like
        Labels (0/1).  Converted to float64 so that float32 labels do not
        lose precision in the running sums.
    y_prob : array-like
        Predicted scores, same length as ``y_true``.  Only their ordering
        matters.

    Returns
    -------
    float
        The Gini coefficient, or NaN when it is undefined (empty input or
        only one class present).
    """
    labels = np.asarray(y_true, dtype=np.float64)
    n = len(labels)
    if n == 0:
        return float('nan')

    # Labels ordered from the highest score to the lowest.
    ranked = labels[np.argsort(np.asarray(y_prob))][::-1]
    ntrue = ranked.sum()
    nfalse = n - ntrue
    if ntrue == 0 or nfalse == 0:
        # Normalizer would be zero: the coefficient is undefined.
        return float('nan')

    # For each row, how many negatives are ranked strictly above it
    # (exclusive cumulative sum of the negative indicator).
    negatives = 1.0 - ranked
    negatives_above = np.cumsum(negatives) - negatives
    misordered = np.dot(ranked, negatives_above)
    return float(1 - 2 * misordered / (ntrue * nfalse))

In [18]:
def gini_xgb(preds, dtrain):
    """Evaluation-metric callback returning the negated Gini of ``preds``.

    ``dtrain`` must expose ``get_label()``; the labels it returns are scored
    against ``preds`` with ``eval_gini``.  The sign is flipped, presumably so
    that a lower metric value means a better model (confirm against the
    early-stopping direction of the installed xgboost).

    Returns a one-element list holding the pair ``('gini', value)``.
    """
    negated_gini = -eval_gini(dtrain.get_label(), preds)
    return [('gini', negated_gini)]


def add_noise(series, noise_level):
    """Scale every value of ``series`` by ``1 + noise_level * N(0, 1)``.

    The noise is drawn from NumPy's global RNG (``np.random.randn``), one
    draw per element, even when ``noise_level`` is 0.
    """
    return series * (1 + noise_level * np.random.randn(len(series)))


def _lookup_encoding(series, lookup, encoded_name, prior):
    """Map each value of ``series`` to its smoothed target average.

    ``lookup`` is a frame with the category column (named like ``series``)
    and an ``'average'`` column.  Values missing from ``lookup`` get
    ``prior``.  The result keeps the index of ``series``.
    """
    encoded = pd.merge(
        series.to_frame(series.name),
        lookup,
        on=series.name,
        how='left')['average'].rename(encoded_name).fillna(prior)
    # pd.merge does not keep the left index, so restore it.
    encoded.index = series.index
    return encoded


def target_encode(trn_series = None,
                  val_series = None,
                  tst_series = None,
                  target = None,
                  min_samples_leaf = 1,
                  smoothing = 1,
                  noise_level = 0):
    """Smoothed target (mean) encoding of one categorical column.

    The per-category target mean is computed on the training series only and
    blended with the global target mean (the prior):

        weight  = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoded = prior * (1 - weight) + category_mean * weight

    so the larger a category's count, the less the prior contributes.

    Parameters
    ----------
    trn_series, val_series, tst_series : pd.Series
        Categorical column for the training, validation and test rows.  All
        are merged on their own ``name`` against statistics keyed by
        ``trn_series.name``.
    target : pd.Series
        Target values aligned with ``trn_series`` (same length).
    min_samples_leaf : int
        Category count at which the blend weight is 0.5.
    smoothing : int or float
        Controls how quickly the weight moves from prior to category mean.
    noise_level : float
        Passed to ``add_noise`` for each of the three outputs.

    Returns
    -------
    tuple of pd.Series
        Encoded training, validation and test series, each named
        ``trn_series.name + '_mean'`` and indexed like its input.

    Raises
    ------
    AssertionError
        If ``trn_series`` and ``target`` differ in length, or the train and
        test series have different names.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    paired = pd.concat([trn_series, target], axis=1)

    # Per-category target mean and row count.
    stats = paired.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])

    # Blend weight in (0, 1); kept in its own name so the `smoothing`
    # parameter is not overwritten.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))

    # Global target mean, also used for categories unseen in training.
    prior = target.mean()

    # Lookup table with columns [trn_series.name, 'average'].
    lookup = (prior * (1 - weight) + stats["mean"] * weight).rename('average').reset_index()

    encoded_name = trn_series.name + '_mean'
    ft_trn_series = _lookup_encoding(trn_series, lookup, encoded_name, prior)
    ft_val_series = _lookup_encoding(val_series, lookup, encoded_name, prior)
    ft_tst_series = _lookup_encoding(tst_series, lookup, encoded_name, prior)

    # Noise is applied in train, validation, test order (fixed RNG consumption order).
    return (add_noise(ft_trn_series, noise_level),
            add_noise(ft_val_series, noise_level),
            add_noise(ft_tst_series, noise_level))

In [19]:
# Load the train and test CSVs; entries equal to '-1' are parsed as missing (NaN).
train_df, test_df = (
    pd.read_csv(csv_path, na_values='-1')
    for csv_path in ('Porto_train.csv', 'Porto_test.csv')
)

In [20]:
# Columns selected from train_df / test_df as model inputs.  The order is the
# column order of the feature matrix, so keep it stable.
train_features = [
    "ps_car_13",
    "ps_reg_03",
    "ps_ind_05_cat",
    "ps_ind_03", 
    "ps_ind_15", 
    "ps_reg_02", 
    "ps_car_14", 
    "ps_car_12", 
    "ps_car_01_cat",  
    "ps_car_07_cat",
    "ps_ind_17_bin",
    "ps_car_03_cat", 
    "ps_reg_01", 
    "ps_car_15", 
    "ps_ind_01", 
    "ps_ind_16_bin",
    "ps_ind_07_bin",
    "ps_car_06_cat",
    "ps_car_04_cat", 
    "ps_ind_06_bin", 
    "ps_car_09_cat", 
    "ps_car_02_cat",  
    "ps_ind_02_cat",
    "ps_car_11",
    "ps_car_05_cat",
    "ps_calc_09",
    "ps_calc_05",
    "ps_ind_08_bin",
    "ps_car_08_cat",
    "ps_ind_09_bin", 
    "ps_ind_04_cat", 
    "ps_ind_18_bin",
    "ps_ind_12_bin",
    "ps_ind_14"]

# Column pairs that are string-concatenated and label-encoded into one
# combined feature named "<first>_plus_<second>".
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat')]

In [21]:
# Keep ids and the target aside before the frames are narrowed to feature columns.
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

# Build one combined categorical feature per pair in `combs`.
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f' % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    train_df[name1] = train_df[f1].apply(str) + "_" + train_df[f2].apply(str)
    test_df[name1] = test_df[f1].apply(str) + "_" + test_df[f2].apply(str)
    # Label-encode; fit on train + test so both share one integer mapping.
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    # `train_features` is defined in another cell; only append once so that
    # re-running this cell does not produce duplicate columns.
    if name1 not in train_features:
        train_features.append(name1)

X = train_df[train_features]
test_df = test_df[train_features]

# Columns to target-encode: every feature whose name contains '_cat'
# (this also matches the combined features built from a '_cat' column).
f_cats = [f for f in X.columns if '_cat' in f]

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1

In [22]:
# Out-of-fold predictions, indexed like y.  Start from a float Series
# (0 * y would inherit the integer dtype of the target, and writing float
# probabilities into an integer Series is deprecated/rejected by recent pandas).
y_valid_pred = 0.0 * y
# Running sum of test-set predictions across folds.
y_test_pred = 0

In [23]:
# Fold setup: K shuffled folds with a fixed split seed
K = 5
kf = KFold(n_splits=K, random_state=1, shuffle=True)
# Seed NumPy's global RNG (add_noise draws from np.random.randn)
np.random.seed(0)

In [24]:
# Classifier setup.  This single estimator object is refit in every fold;
# the round count and learning rate come from the config constants.
model = XGBClassifier(n_estimators = MAX_ROUNDS,
                      max_depth = 4,
                      objective = 'binary:logistic',
                      learning_rate = LEARNING_RATE,
                      subsample = .8,
                      min_child_weight = 6,
                      colsample_bytree = .8,
                      scale_pos_weight = 1.6, 
                      gamma = 10,
                      reg_alpha = 8, 
                      reg_lambda = 1.3)

In [25]:
# K-fold loop: for each fold, target-encode the categorical columns, fit the
# model, store out-of-fold predictions and accumulate test-set predictions.
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    # Build this fold's data (copies, so adding columns does not touch X / test_df)
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
    X_test = test_df.copy()
    print('\n Fold', i)
    
    # Target-encode each categorical column; the statistics come from this
    # fold's training rows only (target = y_train)
    for f in f_cats:
        X_train[f + '_avg'], X_valid[f + '_avg'], X_test[f + '_avg'] = target_encode(
                                                        trn_series = X_train[f],
                                                        val_series = X_valid[f],
                                                        tst_series = X_test[f],
                                                        target = y_train,
                                                        min_samples_leaf = 200,
                                                        smoothing = 10,
                                                        noise_level = 0)
        
    # Fit the model for this fold
    if OPTIMIZE_ROUNDS:
        # NOTE(review): eval_metric / early_stopping_rounds as fit() arguments
        # and best_ntree_limit depend on the installed xgboost version — confirm.
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set = eval_set,
                               eval_metric = gini_xgb,
                               early_stopping_rounds = EARLY_STOPPING_ROUNDS,
                               verbose = False)
        print('Best N trees = ', model.best_ntree_limit)
        print('Best gini = ', model.best_score)
    else:
        fit_model = model.fit(X_train, y_train)
        
    # Predict this fold's validation rows and store them at their positions
    pred = fit_model.predict_proba(X_valid)[:, 1]
    print('Gini = ', eval_gini(y_valid, pred))
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate the test-set predictions of this fold's model
    y_test_pred += fit_model.predict_proba(X_test)[:, 1]
    
    del X_test, X_train, X_valid, y_train
    
y_test_pred /= K  # average the test-set predictions over the K folds

# Gini of the out-of-fold predictions over the whole training set
print('\n Gini for full training set:')
eval_gini(y, y_valid_pred)


 Fold 0
Gini =  0.2851065280442412

 Fold 1
Gini =  0.28185495483845957

 Fold 2
Gini =  0.2742993177535197

 Fold 3
Gini =  0.29892986639933017

 Fold 4
Gini =  0.2857903122299573

 Gini for full training set:


0.28496053325938575

In [26]:
# Save the out-of-fold validation predictions for later stacking/ensembling
val = pd.DataFrame({'id': id_train, 'target': y_valid_pred.values})
val.to_csv('xgb_valid.csv', float_format='%.6f', index=False)

In [27]:
# Write the submission file: test ids with the fold-averaged predictions
sub = pd.DataFrame({'id': id_test, 'target': y_test_pred})
sub.to_csv('xgb_submit.csv', float_format='%.6f', index=False)