## Import

In [None]:
# for reading data
import os
import numpy as np
import pandas as pd

# model
from lightgbm import LGBMClassifier     

# tuning
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization

# Out of Fold
from sklearn.metrics import log_loss  
from sklearn.model_selection import StratifiedKFold

# for saving
import joblib

### Read data

In [None]:
# Resolve the input directory to an absolute path once and reuse it below.
path = os.path.abspath("./input")

# Train / test feature tables; every file is read with the cp949 codec.
X_train = pd.read_csv(path + '/CAT_train.csv', encoding='cp949')
X_test = pd.read_csv(path + '/CAT_test.csv', encoding='cp949')

# Only the `group` column of the label file is kept as the target.
y_train = pd.read_csv(path + '/y_train.csv', encoding='cp949')['group']

In [None]:
# Set the customer ids aside and remove the column from both feature tables
# in a single step, so the id is not passed to the model as a feature.
train_ID = X_train.pop('custid')
test_ID = X_test.pop('custid')

### ▶ Bayesian Optimization

In [None]:
# One shared stratified 5-fold splitter; shuffling with a fixed seed means
# every use of SKF.split(...) on the same data yields the same folds.
SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
def lgbm_bayesian(n_estimators, min_child_samples):
    """Return the mean cross-validated log loss of one LGBM configuration.

    Both arguments may arrive as floats; they are rounded to the nearest
    integer before being handed to LightGBM. The learning rate is fixed
    at 0.03.

    Parameters
    ----------
    n_estimators : float or int
        Number of boosting rounds (rounded to int).
    min_child_samples : float or int
        Minimum number of samples per leaf (rounded to int).

    Returns
    -------
    float
        Mean of the per-fold log losses over the splits produced by the
        module-level ``SKF`` on ``X_train`` / ``y_train``. This is a loss:
        lower values are better.
    """
    clf = LGBMClassifier(
        learning_rate=0.03,
        n_estimators=int(round(n_estimators)),
        min_child_samples=int(round(min_child_samples)),
        objective='multiclass', metrics='multi_logloss',
        gpu_use_dp=True, gpu_device_id=0, num_gpu=1, random_state=0,
    )

    def fold_loss(fit_idx, holdout_idx):
        # Refit the same estimator on this fold's training part, then score
        # its class probabilities on the held-out part.
        clf.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
        holdout_proba = clf.predict_proba(X_train.iloc[holdout_idx])
        return log_loss(y_train.iloc[holdout_idx], holdout_proba)

    return np.mean([fold_loss(fit_idx, holdout_idx)
                    for fit_idx, holdout_idx in SKF.split(X_train, y_train)])

In [None]:
# Search space handed to the optimiser: (lower bound, upper bound) for each
# hyper-parameter.
bayesian_params = dict(
    n_estimators=(900, 1100),
    min_child_samples=(60, 120),
)

In [None]:
lgbm_bay = BayesianOptimization(f=lgbm_bayesian, pbounds=bayesian_params, random_state=0)
lgbm_bay.maximize(init_points=5, n_iter=100)

In [None]:
# dictionary에 있는 target값을 모두 추출
target_list = []
for result in lgbm_bay.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmin(np.array(target_list)))

In [None]:
# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbm_bay.res[np.argmin(np.array(target_list))]['params']
max_dict['max_depth'] = int(max_dict['max_depth'])
max_dict['min_child_samples'] = int(max_dict['min_child_samples'])
max_dict['n_estimators'] = int(max_dict['n_estimators'])
max_dict['num_leaves'] = int(max_dict['num_leaves'])
max_dict

In [None]:
# score: 1.5545960092945523
# Build the final classifier from the tuned parameters. The objective function
# evaluated every candidate with learning_rate=0.03, so the same value is set
# here; without it LightGBM would fall back to its default learning rate and
# the tuned n_estimators would no longer correspond to the evaluated model.
# NOTE: this rebinds the name `lgbm_bayesian` from the objective function to
# the classifier instance.
lgbm_bayesian = LGBMClassifier(**max_dict, learning_rate=0.03,
                               objective='multiclass', metrics='multi_logloss',
                               gpu_use_dp=True, gpu_device_id=0, num_gpu=1, random_state=0)

### ▶ Out of Fold

In [None]:
# Alias the classifier under a generic name; the cells below refer to `model`.
model = lgbm_bayesian

In [None]:
# Out-of-fold evaluation combined with fold-averaged test predictions.
# The fold count and class count are derived from SKF and y_train rather than
# hard-coded, so the averaging stays correct if either is changed.
n_folds = SKF.get_n_splits()
n_classes = y_train.nunique()

lgbm_pred = np.zeros((X_test.shape[0], n_classes))
loss_list = []
for tr_idx, val_idx in SKF.split(X_train, y_train):
    tr_x, tr_y = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Refit on this fold's training part and score the held-out part.
    model.fit(tr_x, tr_y)
    loss_list.append(log_loss(val_y, model.predict_proba(val_x)))

    # Each fold contributes 1/n_folds of the final test probabilities.
    lgbm_pred += model.predict_proba(X_test) / n_folds
print(f'{model.__class__.__name__}의 {n_folds}fold 평균 Log Loss는 {np.mean(loss_list)}')

### ▶ Deploy Model&Submission data

In [None]:
# Wrap the averaged probabilities in a frame with one named column per class.
# NOTE(review): the column order presumably has to match the class order used
# by predict_proba; confirm against the labels in y_train.
pred = pd.DataFrame(
    lgbm_pred,
    columns=['F20', 'F30', 'F40', 'F50', 'M20', 'M30', 'M40', 'M50'],
)
# Put the test ids in front as an "ID" column.
submissions = pd.concat([test_ID.rename("ID"), pred], axis=1)

In [None]:
sub_path = os.path.abspath("./submission")
# to_csv does not create missing directories; make sure the target folder
# exists so the finished predictions are not lost to a write error.
os.makedirs(sub_path, exist_ok=True)

fname = '/MLGBM_FCAT.csv'  # Model_FeatureSet
submissions.to_csv(sub_path + fname, index=False)