### Import

In [1]:
import numpy as np
import pandas as pd
import warnings ; warnings.filterwarnings('ignore')

# Tuning & Model
from lightgbm import LGBMRegressor   
from sklearn import model_selection
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold

### Read Data

In [2]:
# Load the training table, the test table and the sample-submission file,
# in that order, from the data directory one level above the notebook.
X_train, X_test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/FS1_train.csv',
        '../data/FS1_test.csv',
        '../data/submission/sample_submission.csv',
    )
)

In [3]:
# Remove the 'id' column from both frames in place.
X_train.drop(columns=['id'], inplace=True)
X_test.drop(columns=['id'], inplace=True)

# pop() returns the 'target' column and removes it from X_train in one step,
# leaving X_train with the remaining columns only.
y_train = X_train.pop('target')

### Set Hyper Parameter
- Optuna와 BayesianOptimization을 사용해 Tuning하였지만 과적합이 일어났다.
- 적정 수준의 Parameter Value에서 변경해 나가며 일반화할 수 있게 조정한다.

In [None]:
# Hyper-parameter candidate #1 (same keys as the set passed to LGBMRegressor).
# Recorded run notes (fields left blank in the original notes stay blank):
# file_name: 20221110_0916.csv
# val_mae: , public score: 3.1142610395, oof score: 0.4965870291789201
trial1 = dict(
    learning_rate=0.0517,
    n_estimators=2913,
    max_depth=35,
    num_leaves=144,
)

In [None]:
# Hyper-parameter candidate #2 — the set unpacked into LGBMRegressor in this notebook.
# Recorded run notes (fields left blank in the original notes stay blank):
# file_name:
# val_mae:, public score: - , oof score: 0.4530869192696806
trial2 = dict(
    learning_rate=0.05,
    n_estimators=2400,
    max_depth=35,
    num_leaves=144,
)

In [None]:
# Hyper-parameter candidate #3 (same keys as the set passed to LGBMRegressor).
# Recorded run notes (fields left blank in the original notes stay blank):
# file_name:
# val_mae:, public score: - , oof score: 0.44595349754134034
trial3 = dict(
    learning_rate=0.045,
    n_estimators=2400,
    max_depth=35,
    num_leaves=155,
)

In [None]:
# Hyper-parameter candidate #4 (same keys as the set passed to LGBMRegressor).
# Recorded run notes:
# file_name: 20221113_0944_1.csv
# val_mae: 2.8113549463504324, public score: -, oof score: 0.48441562405644567
trial4 = dict(
    learning_rate=0.05,
    n_estimators=2400,
    max_depth=35,
    num_leaves=170,
)

In [None]:
# Hyper-parameter candidate #5 (same keys as the set passed to LGBMRegressor).
# Recorded run notes:
# file_name: 20221113_2151.csv
# val_mae: 2.8138894943748114, public score: - , oof score: 0.4651534994968449
trial5 = dict(
    learning_rate=0.05,
    n_estimators=2400,
    max_depth=35,
    num_leaves=155,
)

### Out of Fold
- 과적합을 방지하기 위해 Out of Fold로 submission을 생성한다.

In [None]:
# Build the LightGBM regressor from the trial2 settings with a fixed seed.
# NOTE(review): 'metrics' looks like it only selects LightGBM's evaluation
# metric, and fit() in this notebook is called without an eval_set — confirm
# whether an MAE training objective (e.g. objective='mae') was intended.
model = LGBMRegressor(
    metrics='mae',
    random_state=2022,
    **trial2,
)

In [16]:
# Shuffled, seeded 10-fold splitter used by the cross-validation loop.
# NOTE(review): StratifiedKFold stratifies on y as class labels, while y_train
# feeds a regressor here — verify the target is discrete; for a continuous
# target KFold is the usual choice.
SKF = StratifiedKFold(
    random_state=2022,
    shuffle=True,
    n_splits=10,
)

In [17]:
# Cross-validated training: for every fold, fit on the training rows, score
# MAE on the held-out rows, and add that fold's share of the test-set
# prediction to lgbm_pred (so lgbm_pred ends up as the mean over folds).

# Take the fold count from the splitter itself so the averaging divisor and
# the printed message cannot drift from the n_splits the splitter was built with.
n_splits = SKF.get_n_splits()

lgbm_pred = np.zeros(X_test.shape[0])
mae_list = []
for tr_idx, val_idx in SKF.split(X_train, y_train):
    tr_x, tr_y = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

    model.fit(tr_x, tr_y)
    pred = model.predict(val_x)
    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)

    # Each fold contributes 1/n_splits of the final test prediction.
    sub_pred = model.predict(X_test) / n_splits
    lgbm_pred += sub_pred
print(f'{model.__class__.__name__}의 {n_splits}fold 평균 MAE는 {np.mean(mae_list)}')

LGBMRegressor의 10fold 평균 MAE는 2.80771437557556


### Submit

In [18]:
# Store the predictions of the chosen model (the fold-averaged lgbm_pred)
# in the 'target' column of the submission frame, ready to be saved.
sample['target'] = lgbm_pred

In [None]:
# Save the submission as a CSV named after the current local time
# (YYYYMMDD_HHMM), without the DataFrame index.
date = pd.Timestamp.now().strftime('%Y%m%d_%H%M')

# Every input in this notebook is read from '../data/...', and the sample
# submission lives in '../data/submission/'. The previous './data/submission/'
# prefix pointed at a different directory, so write next to the sample file.
filename = f'../data/submission/{date}'
sample.to_csv(f'{filename}.csv', index=False)
print(f'{filename} is saved.')