# Stacking Ensemble 

## XGB + LGBM + Catboost + RandomForest

In [25]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
import catboost
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split,GridSearchCV
from tqdm import tqdm
import matplotlib.pyplot as plt

from pathlib import Path
import warnings

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool
warnings.filterwarnings(action = 'ignore')

## 데이터 불러오기, 전처리 및 Feature Engineering
: Catboost.ipynb 파일과 동일하게 진행하였기에 간단하게 코드만 첨부하도록 하겠습니다. 

In [3]:
path = 'C:/Users/JiwonMoon/Desktop/Data/open/'
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')

In [4]:
train.fillna('NaN',inplace=True)
test.fillna('NaN', inplace=True)

In [5]:
train = train[(train['family_size'] <= 7)]
train = train.reset_index(drop=True)

In [6]:
train.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)
test.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)

In [7]:
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].map(lambda x: 0 if x > 0 else x)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].map(lambda x: 0 if x > 0 else x)

In [8]:
feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
for feat in feats:
    train[feat]=np.abs(train[feat])
    test[feat]=np.abs(test[feat])

In [9]:
for df in [train,test]:
    # before_EMPLOYED: 고용되기 전까지의 일수
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_befofeEMP_ratio'] = df['income_total'] / df['before_EMPLOYED']
    df['before_EMPLOYED_m'] = np.floor(df['before_EMPLOYED'] / 30) - ((np.floor(df['before_EMPLOYED'] / 30) / 12).astype(int) * 12)
    df['before_EMPLOYED_w'] = np.floor(df['before_EMPLOYED'] / 7) - ((np.floor(df['before_EMPLOYED'] / 7) / 4).astype(int) * 4)

     #DAYS_BIRTH 파생변수- Age(나이), 태어난 월, 태어난 주(출생연도의 n주차)
    df['Age'] = df['DAYS_BIRTH'] // 365
    df['DAYS_BIRTH_m'] = np.floor(df['DAYS_BIRTH'] / 30) - ((np.floor(df['DAYS_BIRTH'] / 30) / 12).astype(int) * 12)
    df['DAYS_BIRTH_w'] = np.floor(df['DAYS_BIRTH'] / 7) - ((np.floor(df['DAYS_BIRTH'] / 7) / 4).astype(int) * 4)

     #DAYS_EMPLOYED_m 파생변수- EMPLOYED(근속연수), DAYS_EMPLOYED_m(고용된 달) ,DAYS_EMPLOYED_w(고용된 주(고용연도의 n주차))  
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365
    df['DAYS_EMPLOYED_m'] = np.floor(df['DAYS_EMPLOYED'] / 30) - ((np.floor(df['DAYS_EMPLOYED'] / 30) / 12).astype(int) * 12)
    df['DAYS_EMPLOYED_w'] = np.floor(df['DAYS_EMPLOYED'] / 7) - ((np.floor(df['DAYS_EMPLOYED'] / 7) / 4).astype(int) * 4)

    #ability: 소득/(살아온 일수+ 근무일수)
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'])
    
    #income_mean: 소득/ 가족 수
    df['income_mean'] = df['income_total'] / df['family_size']

    #ID 생성: 각 컬럼의 값들을 더해서 고유한 사람을 파악(*한 사람이 여러 개 카드를 만들 가능성을 고려해 begin_month는 제외함)
    df['ID'] = \
    df['child_num'].astype(str) + '_' + df['income_total'].astype(str) + '_' +\
    df['DAYS_BIRTH'].astype(str) + '_' + df['DAYS_EMPLOYED'].astype(str) + '_' +\
    df['work_phone'].astype(str) + '_' + df['phone'].astype(str) + '_' +\
    df['email'].astype(str) + '_' + df['family_size'].astype(str) + '_' +\
    df['gender'].astype(str) + '_' + df['car'].astype(str) + '_' +\
    df['reality'].astype(str) + '_' + df['income_type'].astype(str) + '_' +\
    df['edu_type'].astype(str) + '_' + df['family_type'].astype(str) + '_' +\
    df['house_type'].astype(str) + '_' + df['occyp_type'].astype(str)

In [10]:
cols = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED',]
train.drop(cols, axis=1, inplace=True)
test.drop(cols, axis=1, inplace=True)

In [11]:
numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  18
Number of Categorical features:  9


In [12]:
for df in [train,test]:
    df['income_total'] = np.log1p(1+df['income_total'])

In [13]:
encoder = OrdinalEncoder(categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
test[categorical_feats] = encoder.transform(test[categorical_feats])

train['ID'] = train['ID'].astype('int64')
test['ID'] = test['ID'].astype('int64')

In [14]:
kmeans_train = train.drop(['credit'], axis=1)
kmeans = KMeans(n_clusters=36, random_state=42).fit(kmeans_train)
train['cluster'] = kmeans.predict(kmeans_train)
test['cluster'] = kmeans.predict(test)

In [15]:
numerical_feats.remove('income_total')
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])


In [16]:
n_est = 2000
seed = 42
n_fold = 5
n_class = 3

target = 'credit'
X = train.drop(target, axis=1)
y = train[target]
X_test = test

## 첫번째 LightGBM

In [20]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = []

for train_idx,valid_idx in skfold.split(X,y):
    folds.append((train_idx,valid_idx))

lgb_pred = np.zeros((X.shape[0], n_class))
lgb_pred_test = np.zeros((X_test.shape[0], n_class))

for fold in range(n_fold):
    print(f'\n----------------- Fold {fold} -----------------\n')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]

    lgb_clf = LGBMClassifier(max_depth=24,
                       num_leaves=110,
                       colsample_bytree=0.3,
                       n_estimators=230, 
                       min_child_samples=2,
                       subsample=0.9,
                       subsample_freq=2,
                       learning_rate=0.09,
                       random_state=2021,
                       verbose = 50)

    lgb_clf.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],early_stopping_rounds=100, verbose=100)

    lgb_pred[valid_idx] = lgb_clf.predict_proba(X_valid)
    lgb_pred_test += lgb_clf.predict_proba(X_test)/n_fold

    print(f'CV Log Loss Score: {log_loss(y_valid, lgb_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, lgb_pred):.6f}')


----------------- Fold 0 -----------------

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.821420
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.118628
[LightGBM] [Debug] init for col-wise cost 0.000703 seconds, init for row-wise cost 0.000751 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1731
[LightGBM] [Info] Number of data points in the train set: 21160, number of used features: 28
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score -2.105487
[LightGBM] [Info] Start training from score -1.440078
[LightGBM] [Info] Start training from score -0.444251
[LightGBM] [Debug] Re-bagging, using 19151 data to train
[LightGBM] [Debug] Trained a tree with leaves = 110 and depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 110 and depth = 19
[LightGBM

In [21]:
from pathlib import Path
tst_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')
val_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')

algo_name = 'lgb'
feature_name = '2021_j'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, lgb_pred, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, lgb_pred_test, fmt='%.6f', delimiter=',')

## 두번째 LightGBM

In [22]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = []

for train_idx,valid_idx in skfold.split(X,y):
    folds.append((train_idx,valid_idx))

lgb_pred_2 = np.zeros((X.shape[0], n_class))
lgb_pred_test_2 = np.zeros((X_test.shape[0], n_class))

for fold in range(n_fold):
    print(f'\n----------------- Fold {fold} -----------------\n')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]

    lgb_clf = LGBMClassifier(max_depth=24,
                       num_leaves=110,
                       colsample_bytree=0.3,
                       n_estimators=230, 
                       min_child_samples=2,
                       subsample=0.9,
                       subsample_freq=2,
                       learning_rate=0.09,
                       random_state=2021,
                       verbose = 50)

    lgb_clf.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],early_stopping_rounds=100, verbose=100)

    lgb_pred_2[valid_idx] = lgb_clf.predict_proba(X_valid)
    lgb_pred_test_2 += lgb_clf.predict_proba(X_test)/n_fold

    print(f'CV Log Loss Score: {log_loss(y_valid, lgb_pred_2[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, lgb_pred_2):.6f}')


----------------- Fold 0 -----------------

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.821420
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.118628
[LightGBM] [Debug] init for col-wise cost 0.000584 seconds, init for row-wise cost 0.000676 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1731
[LightGBM] [Info] Number of data points in the train set: 21160, number of used features: 28
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score -2.105487
[LightGBM] [Info] Start training from score -1.440078
[LightGBM] [Info] Start training from score -0.444251
[LightGBM] [Debug] Re-bagging, using 19151 data to train
[LightGBM] [Debug] Trained a tree with leaves = 110 and depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 110 and depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 110 and depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 110

In [23]:
from pathlib import Path
tst_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')
val_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')

algo_name = 'lgb2'
feature_name = '2021'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, lgb_pred_2, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, lgb_pred_test_2, fmt='%.6f', delimiter=',')

## XGBoost

In [27]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = []

for train_idx,valid_idx in skfold.split(X,y):
    folds.append((train_idx,valid_idx))

xgb_pred = np.zeros((X.shape[0], n_class))
xgb_pred_test = np.zeros((X_test.shape[0], n_class))

for fold in range(n_fold):
    print(f'\n----------------- Fold {fold} -----------------\n')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]

    xgb_clf = xgb.XGBClassifier(max_depth=10,
                                colsample_bytree=0.5,
                                colsample_bylevel=0.8,
                                colsample_bynode=0.5,
                                min_child_weight=0.5,
                                gamma=0,
                                max_delta_step=0,
                                subsample=0.9,
                                reg_alpha=0.09,
                                reg_lambda=0.6,
                                n_estimators=50,
                               random_state = 2021)

    xgb_clf.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],early_stopping_rounds=100, verbose=100)

    xgb_pred[valid_idx] = xgb_clf.predict_proba(X_valid)
    xgb_pred_test += xgb_clf.predict_proba(X_test)/n_fold

    print(f'CV Log Loss Score: {log_loss(y_valid, xgb_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, xgb_pred):.6f}')
print('----------------------------------------------------------------------')
print(confusion_matrix(y, np.argmax(xgb_pred, axis=1)))


----------------- Fold 0 -----------------

[0]	validation_0-mlogloss:0.98868
[49]	validation_0-mlogloss:0.69742
CV Log Loss Score: 0.691841

----------------- Fold 1 -----------------

[0]	validation_0-mlogloss:0.97537
[49]	validation_0-mlogloss:0.71471
CV Log Loss Score: 0.709301

----------------- Fold 2 -----------------

[0]	validation_0-mlogloss:0.97353
[49]	validation_0-mlogloss:0.72052
CV Log Loss Score: 0.712054

----------------- Fold 3 -----------------

[0]	validation_0-mlogloss:0.97359
[49]	validation_0-mlogloss:0.70689
CV Log Loss Score: 0.700293

----------------- Fold 4 -----------------

[0]	validation_0-mlogloss:0.97456
[49]	validation_0-mlogloss:0.71555
CV Log Loss Score: 0.704937
	Log Loss: 0.703685
----------------------------------------------------------------------
[[  697   459  2066]
 [  183  2657  3427]
 [  316   760 15886]]


In [28]:
from pathlib import Path
tst_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')
val_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')

algo_name = 'xgb'
feature_name = '2021'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, xgb_pred, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, xgb_pred_test, fmt='%.6f', delimiter=',')

## CatBoost

In [29]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = []

for train_idx,valid_idx in skfold.split(X,y):
    folds.append((train_idx,valid_idx))

cat_pred = np.zeros((X.shape[0], n_class))
cat_pred_test = np.zeros((X_test.shape[0], n_class))

cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']
for fold in range(n_fold):
  print(f'\n----------------- Fold {fold} -----------------\n')
  train_idx, valid_idx = folds[fold]
  X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]
  train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
  valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

  model_cat = CatBoostClassifier()
  model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)
  
  cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
  cat_pred_test += model_cat.predict_proba(X_test) / n_fold
  print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')
print('----------------------------------------------------------------------')
print(confusion_matrix(y, np.argmax(cat_pred, axis=1)))


----------------- Fold 0 -----------------

Learning rate set to 0.114773
0:	learn: 1.0357892	test: 1.0352724	best: 1.0352724 (0)	total: 29.6ms	remaining: 29.6s
100:	learn: 0.7112470	test: 0.6762806	best: 0.6762806 (100)	total: 2.83s	remaining: 25.2s
200:	learn: 0.6880928	test: 0.6753674	best: 0.6749146 (176)	total: 5.85s	remaining: 23.2s
300:	learn: 0.6681244	test: 0.6760540	best: 0.6746916 (221)	total: 8.86s	remaining: 20.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6746915822
bestIteration = 221

Shrink model to first 222 iterations.
CV Log Loss Score: 0.674692

----------------- Fold 1 -----------------

Learning rate set to 0.114773
0:	learn: 1.0349608	test: 1.0353669	best: 1.0353669 (0)	total: 34.2ms	remaining: 34.1s
100:	learn: 0.7109164	test: 0.6795991	best: 0.6795991 (100)	total: 2.98s	remaining: 26.6s
200:	learn: 0.6892807	test: 0.6777558	best: 0.6776499 (171)	total: 5.96s	remaining: 23.7s
Stopped by overfitting detector  (100 iterations wait)

bes

In [30]:
from pathlib import Path
tst_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')
val_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')

algo_name = 'cat'
feature_name = '2021'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, cat_pred, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, cat_pred_test, fmt='%.6f', delimiter=',')

## RandomForest

In [32]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = []

for train_idx,valid_idx in skfold.split(X,y):
    folds.append((train_idx,valid_idx))

rf_pred = np.zeros((X.shape[0], n_class))
rf_pred_test = np.zeros((X_test.shape[0], n_class))

for fold in range(n_fold):
    print(f'\n----------------- Fold {fold} -----------------\n')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]

    rf_clf = RandomForestClassifier(n_estimators = 7000, 
                                random_state=2021,
                                max_features = 3,
                                max_depth = 28,
                                min_samples_split = 8,
                                n_jobs=4)


    rf_clf.fit(X_train,y_train)

    rf_pred[valid_idx] = rf_clf.predict_proba(X_valid)
    rf_pred_test += rf_clf.predict_proba(X_test)/n_fold

    print(f'CV Log Loss Score: {log_loss(y_valid, rf_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, rf_pred):.6f}')
print('----------------------------------------------------------------------')
print(confusion_matrix(y, np.argmax(rf_pred, axis=1)))


----------------- Fold 0 -----------------

CV Log Loss Score: 0.685930

----------------- Fold 1 -----------------

CV Log Loss Score: 0.690647

----------------- Fold 2 -----------------

CV Log Loss Score: 0.698988

----------------- Fold 3 -----------------

CV Log Loss Score: 0.686927

----------------- Fold 4 -----------------

CV Log Loss Score: 0.699525
	Log Loss: 0.692403
----------------------------------------------------------------------
[[  728   527  1967]
 [  175  2949  3143]
 [  364   902 15696]]


In [33]:
from pathlib import Path
tst_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')
val_dir = Path('C:/Users/JiwonMoon/Desktop/대학원수업/비즈니스 어낼리틱스/')

algo_name = 'rf'
feature_name = '2021'
model_name = f'{algo_name}_{feature_name}'

p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'

np.savetxt(p_val_file, rf_pred, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, rf_pred_test, fmt='%.6f', delimiter=',')

## Stacking

In [35]:
model_names = ['rf_2021',
               'xgb_2021',
               'lgb_2021_j',
                'cat_2021',
               'lgb2_2021'
              ]


stk_trn = []
stk_tst = []
feature_names = []
for model in model_names:
    stk_trn.append(np.loadtxt(val_dir / f'{model}.val.csv', delimiter=','))
    stk_tst.append(np.loadtxt(tst_dir / f'{model}.tst.csv', delimiter=','))
    feature_names += [f'{model}_credit0', f'{model}_credit1', f'{model}_credit2']
    
stk_trn = np.hstack(stk_trn)
stk_tst = np.hstack(stk_tst)
feature_names 

['rf_2021_credit0',
 'rf_2021_credit1',
 'rf_2021_credit2',
 'xgb_2021_credit0',
 'xgb_2021_credit1',
 'xgb_2021_credit2',
 'lgb_2021_j_credit0',
 'lgb_2021_j_credit1',
 'lgb_2021_j_credit2',
 'cat_2021_credit0',
 'cat_2021_credit1',
 'cat_2021_credit2',
 'lgb2_2021_credit0',
 'lgb2_2021_credit1',
 'lgb2_2021_credit2']

In [36]:
n_fold = 5
seed = 2021
n_class = 3

target=train['credit'].values
y=target.copy()

In [37]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = []

for train_idx,valid_idx in skfold.split(X,y):
    folds.append((train_idx,valid_idx))

clf_pred = np.zeros((X.shape[0], n_class))
clf_pred_test = np.zeros((X_test.shape[0], n_class))

for fold in range(n_fold):
    print(f'\n----------------- Fold {fold} -----------------\n')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]

    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             max_depth=2,
                             num_leaves=10,
                             colsample_bytree=0.9,
                             n_estimators=110, 
                             min_child_samples=24,
                             subsample=0.5,
                             subsample_freq=1,
                             learning_rate=0.1,
                             random_state=1557,
                             verbose = 50)

    clf.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],
            eval_metric='logloss',early_stopping_rounds=100, verbose=100)

    clf_pred[valid_idx] = clf.predict_proba(X_valid)
    clf_pred_test += clf.predict_proba(X_test)/n_fold

    print(f'CV Log Loss Score: {log_loss(y_valid, clf_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, clf_pred):.6f}')
print('----------------------------------------------------------------------')
print(confusion_matrix(y, np.argmax(clf_pred, axis=1)))


----------------- Fold 0 -----------------

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.820652
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.118517
[LightGBM] [Debug] init for col-wise cost 0.000773 seconds, init for row-wise cost 0.002014 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1744
[LightGBM] [Info] Number of data points in the train set: 21160, number of used features: 28
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score -2.105487
[LightGBM] [Info] Start training from score -1.440078
[LightGBM] [Info] Start training from score -0.444251
[LightGBM] [Debug] Re-bagging, using 10571 data to train
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 2
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 2
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 2
[LightGBM] [Debug] Re-bagging, using 10537 data to train
[Li

## 레퍼런스

https://dacon.io/competitions/official/235713/codeshare/2788?page=1&dtype=recent