In [None]:
!pip install optuna

In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets, ensemble
from lightgbm import LGBMRegressor
from tqdm import tqdm

import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# path used by the loading cell lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that holds the competition files; the final
# submission file is written back into the same folder.
path = '/content/drive/MyDrive/DACON/자율주행 안테나 성능 예측 경진대회(LG 주최)/LG_outputs/'

# Training rows, test rows and the submission template.
train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')
submission = pd.read_csv(f'{path}sample_submission.csv')

# Per-target spec sheet (read below for its 'Feature', '최소' and '최대' columns).
y_feature_spec_info = pd.read_csv(f'{path}meta/y_feature_spec_info.csv')

In [4]:
# Pull three columns of the spec sheet out as plain Python lists, one entry
# per row and in row order: the target name, its '최소' (minimum) value and
# its '최대' (maximum) value.
target_list = list(y_feature_spec_info['Feature'].values)
min_list = list(y_feature_spec_info['최소'].values)
max_list = list(y_feature_spec_info['최대'].values)

In [5]:
# Input columns fed to the models. This is a subset of the X_* columns
# (for example X_02 and X_04 are not included).
feature_columns = [
    'X_01', 'X_03', 'X_05', 'X_06', 'X_07', 'X_08', 'X_09',
    'X_12', 'X_14', 'X_15', 'X_16', 'X_17', 'X_19', 'X_20',
    'X_21', 'X_22', 'X_24', 'X_25', 'X_26', 'X_27', 'X_28', 'X_29', 'X_30',
    'X_31', 'X_32', 'X_33', 'X_35', 'X_37', 'X_38', 'X_39', 'X_40',
    'X_41', 'X_42', 'X_43', 'X_44', 'X_46', 'X_49', 'X_50',
    'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56',
]

# The fourteen regression targets, Y_01 through Y_14.
target_columns = [
    'Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07',
    'Y_08', 'Y_09', 'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14',
]

X = train[feature_columns]
y = train[target_columns]

In [6]:
# Test-set feature matrix: the same columns, in the same order, as the
# training features X. (Despite its name this holds inputs, not labels.)
target = test.loc[:, X.columns]

In [7]:
# Scoring function that implements the LG competition metric (weighted NRMSE).
def lg_nrmse(gt, preds):
    """Return the weighted sum of per-target normalised RMSEs.

    For each of the first 14 target columns, the RMSE between ``gt`` and
    ``preds`` is divided by the mean absolute value of the ground truth in
    that column. The first 8 normalised errors are weighted 1.2 and the
    remaining 6 are weighted 1.0; the weighted values are summed.

    RMSE is computed directly with NumPy instead of
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated and later removed from scikit-learn, so the old call fails on
    current installations.

    Args:
        gt: Ground-truth targets, 2-D with at least 14 columns. May be a
            pandas DataFrame or any array-like.
        preds: Predictions with the same row count and column layout as
            ``gt`` (DataFrame or array-like).

    Returns:
        The score as a NumPy float; smaller values mean smaller errors.

    Raises:
        IndexError: If either input is not 2-D or has fewer than 14 columns.
        ValueError: If the two inputs have different numbers of rows.
    """
    n_targets = 14   # number of target columns that enter the score
    n_weighted = 8   # the first 8 targets carry the 1.2 weight

    truth = np.asarray(gt, dtype=float)
    guess = np.asarray(preds, dtype=float)

    if truth.ndim != 2 or guess.ndim != 2:
        raise IndexError("gt and preds must both be 2-D (rows x targets)")
    if min(truth.shape[1], guess.shape[1]) < n_targets:
        raise IndexError(f"gt and preds need at least {n_targets} target columns")
    if truth.shape[0] != guess.shape[0]:
        raise ValueError("gt and preds must have the same number of rows")

    # Only the first 14 columns take part, exactly like the original loop.
    truth = truth[:, :n_targets]
    guess = guess[:, :n_targets]

    rmse = np.sqrt(np.mean((truth - guess) ** 2, axis=0))
    nrmse = rmse / np.mean(np.abs(truth), axis=0)

    return 1.2 * np.sum(nrmse[:n_weighted]) + 1.0 * np.sum(nrmse[n_weighted:])

In [8]:
# 10-fold splitter with shuffling; the fixed seed keeps fold membership the
# same on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=2022)

In [None]:
def objective_cat(trial):
    """Optuna objective: train a multi-output LightGBM model and score it.

    The data (module-level ``X`` and ``y``) is split 80/20 with a fixed seed,
    one ``LGBMRegressor`` per target is fitted on the 80% part through
    ``MultiOutputRegressor``, and the 20% part is scored with ``lg_nrmse``.

    The deprecated ``suggest_uniform`` / ``suggest_loguniform`` calls are
    replaced by ``suggest_float``; parameter names and ranges are unchanged,
    so ``study.best_params`` keeps the same keys.

    Args:
        trial: The ``optuna`` trial used to draw hyperparameter values.

    Returns:
        The ``lg_nrmse`` score on the hold-out split (the study minimises it).
    """
    # Fixed random_state: every trial is evaluated on the same split.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

    params = {
        # Fixed settings (not tuned).
        'objective': 'regression',
        'verbose': -1,
        'metric': 'rmse',
        # Tuned settings.
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM's docs say row subsampling only takes effect
        # when subsample_freq (bagging_freq) is non-zero, and none is set
        # here - confirm whether tuning `subsample` actually does anything.
        'subsample': trial.suggest_float('subsample', 0.4, 1, log=True),
        # NOTE(review): the model seed is searched like a hyperparameter;
        # confirm this is intended rather than a fixed seed.
        'random_state': trial.suggest_int('random_state', 0, 100),
    }

    # Fit one LGBMRegressor per target column and predict the hold-out rows.
    model = MultiOutputRegressor(LGBMRegressor(**params))
    model.fit(x_train, y_train)
    holdout_pred = model.predict(x_test)

    # Score with the competition metric and hand the value back to Optuna.
    return lg_nrmse(y_test, holdout_pred)

In [None]:
# Search for the best hyperparameters with Optuna. TPE is Optuna's default
# single-objective sampler; it is created explicitly here only to give it a
# seed, so the sampler's proposals are repeatable after a kernel restart.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=2022),
)
study.optimize(objective_cat, n_trials=300)

# Build the parameter set for the final models. best_params contains only the
# values drawn through `trial`, so the fixed settings that objective_cat used
# (objective, metric, verbose) are added back to keep both fits consistent.
params = dict(study.best_params)
params['metric'] = 'rmse'
params['objective'] = 'regression'
params['verbose'] = -1

In [None]:
# K-fold training: fit one model per fold, score it on that fold's held-out
# rows, and accumulate the average of all fold models' test-set predictions.
NRMSES = []
pred = np.zeros((target.shape[0], y.shape[1]))
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), start=1):

    # kf.split yields positional row numbers, so rows are selected with .iloc
    # (.loc would only be correct while the index is the default 0..n-1).
    tr_x, tr_y = X.iloc[train_idx], y.iloc[train_idx]
    val_x, val_y = X.iloc[valid_idx], y.iloc[valid_idx]

    model = MultiOutputRegressor(LGBMRegressor(**params))
    model.fit(tr_x, tr_y)

    val_pred = model.predict(val_x)
    NRMSE = lg_nrmse(val_y, val_pred)
    print(f"{fold} Fold NRMSE = {NRMSE}")
    NRMSES.append(NRMSE)

    # Simple-average ensemble: each fold model contributes 1/n_splits of the
    # final test-set prediction.
    pred += model.predict(target) / kf.n_splits

In [None]:
# Mean of the per-fold validation scores collected in NRMSES.
np.mean(NRMSES) 

In [None]:
# Write the predictions into every column after the first one of the
# submission frame; the first column is left untouched.
# NOTE(review): if the template's value columns were read as integers, newer
# pandas may warn about (or reject) storing floats in place - confirm dtypes.
prediction_slice = slice(1, None)
submission.iloc[:, prediction_slice] = pred

In [None]:
# Save the filled-in submission into the data folder, without the row index.
output_file = path + 'lgbm_kfold_optuna_fold10_rm21318343645_best.csv'
submission.to_csv(output_file, index=False)