### __데이터 불러오기__

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import warnings
# Use matplotlib's built-in 'fivethirtyeight' style for every plot in this notebook.
plt.style.use('fivethirtyeight')
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides warnings that may be useful while debugging;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import pickle

# Restore the (train, test) pair that was serialized to a pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files that come
# from a trusted source.
with open('train_test_data.pkl', 'rb') as pkl_file:
    train, test = pickle.load(pkl_file)

# Submission template; the CSV is read with the cp949 encoding.
submission = pd.read_csv(
    '../data/sample_submission.csv',
    encoding='cp949',
)

In [3]:
# Preview the first rows of the training set (includes the 'power' target).
train.head()

Unnamed: 0,num,date_time,power,temp,wind,hum,prec,sun,non_elec,solar,hour,month,day,weekday,holiday,heat_index,hour_by_3,cluster
0,1,2020-06-01 00:00:00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,6,1,0,0,63.43008,0,0
1,1,2020-06-01 01:00:00,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,6,1,0,0,63.56993,0,0
2,1,2020-06-01 02:00:00,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,6,1,0,0,63.22775,0,0
3,1,2020-06-01 03:00:00,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,6,1,0,0,62.54339,1,0
4,1,2020-06-01 04:00:00,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4,6,1,0,0,62.3976,1,0


In [4]:
# Preview the first rows of the test set (same columns as train, minus 'power').
test.head()

Unnamed: 0,num,date_time,temp,wind,hum,prec,sun,non_elec,solar,hour,month,day,weekday,holiday,heat_index,hour_by_3,cluster
0,1,2020-08-25 00:00:00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,0,8,25,1,0,78.60228,0,0
1,1,2020-08-25 01:00:00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,1,8,25,1,0,78.60228,0,0
2,1,2020-08-25 02:00:00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,2,8,25,1,0,78.60228,0,0
3,1,2020-08-25 03:00:00,27.3,1.1,78.0,0.0,0.0,0.0,0.0,3,8,25,1,0,78.34006,1,0
4,1,2020-08-25 04:00:00,27.3,1.1,78.0,0.0,0.0,0.0,0.0,4,8,25,1,0,78.34006,1,0


### __변수 선택__

In [5]:
# Columns removed from both feature matrices.
feature_drop_cols = ['date_time', 'hour_by_3']

# Training features: drop() returns a new frame, so `train` itself is left
# untouched. The target column 'power' is removed here and kept separately.
x_train = train.drop(columns=feature_drop_cols + ['power'])
y_train = train['power'].copy()

# Test features: same columns dropped (the test set has no 'power' column).
x_test = test.drop(columns=feature_drop_cols)

In [6]:
# Check the feature matrix after dropping 'date_time', 'power' and 'hour_by_3'.
x_train.head()

Unnamed: 0,num,temp,wind,hum,prec,sun,non_elec,solar,hour,month,day,weekday,holiday,heat_index,cluster
0,1,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,6,1,0,0,63.43008,0
1,1,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,6,1,0,0,63.56993,0
2,1,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,6,1,0,0,63.22775,0
3,1,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,6,1,0,0,62.54339,0
4,1,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4,6,1,0,0,62.3976,0


### __평가지표__

시계열 데이터임을 고려하여, SMAPE(대칭 평균 절대 백분율 오차)를 평가지표로 사용하였습니다.
SMAPE는 0에서 200 사이의 값을 가지며, 0에 가까울수록 예측이 정확한 모델임을 의미합니다.

$$
SMAPE = \frac{1}{n} \sum_{i=1}^{n} \frac{|y_i - \hat{y}_i|}{\frac{|y_i| + |\hat{y}_i|}{2}} \times 100 
$$


SMAPE는 예측값과 실제값의 상대적인 차이를 백분율로 계산하므로, 예측값이 크거나 작은 경우에도 일관되게 성능을 평가할 수 있습니다. 이는 시계열 데이터가 시간에 따라 변동성이 크고, 각 시점에서의 예측 오차가 중요한 특성을 갖기 때문에 시계열 데이터에서 평가지표로 사용하기에 적합합니다.

또한, SMAPE는 예측값이 실제값보다 클 때와 작을 때 동일한 방식으로 오차를 계산하여, 과대 예측과 과소 예측을 균등하게 다룹니다. 이는 시계열 예측에서 예측값이 실제값을 초과하거나 미달할 수 있기 때문에, 대칭적인 평가 방식이 예측 성능을 공정하게 평가하는데 도움이 됩니다.

이러한 이유로 SMAPE는 시계열 예측 문제에서 모델의 성능을 공정하고 안정적으로 평가할 수 있는 평가지표로 사용됩니다. 이를 통해 예측 모델의 정확도를 개선하고, 보다 신뢰성 있는 예측 결과를 도출할 수 있습니다.

In [7]:
def SMAPE(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in the range [0, 200].

    Computes ``200 * mean(|y_true - y_pred| / (|y_true| + |y_pred|))``.

    Args:
        y_true: Actual values. Any array-like (list, tuple, ndarray, Series)
            or a scalar.
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The SMAPE as a float. Positions where both the actual and the
        predicted value are zero contribute an error of 0 to the mean (they
        are still counted in ``n``). Empty input yields ``nan``.
    """
    # Work on plain float arrays so lists/scalars are accepted and pandas
    # index alignment cannot silently reorder or NaN-fill the operands.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = np.abs(y_true) + np.abs(y_pred)
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 from `out`, which avoids 0/0 -> NaN and the associated warning.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 200 * np.mean(ratio)

### __모델 선택 및 학습__

일반 KFold를 사용할 경우, 데이터가 건물번호 순으로 정렬되어 있기 때문에 예를 들어 학습 데이터셋은 건물번호 0에서 46번까지의 건물을 포함하고, 검증 데이터셋은 47에서 60번까지의 건물을 포함하게 되어 분석 결과에 문제가 발생할 수 있습니다. 이를 해결하기 위해, __건물번호를 기준으로 데이터셋을 구분__ 하고, 학습셋과 검증셋이 __건물번호를 균등하게 포함하도록 Stratified K-Fold 교차검증 방식을 사용__ 하였습니다.

모델을 함수로 구현하여 학습을 진행하였습니다.

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV

In [9]:
def num_sfkfold_split(clf, x, y, n_split=5, smape=True):
    """Cross-validate ``clf`` on folds stratified by building number and print the scores.

    The folds come from ``StratifiedKFold(n_split)`` using the ``'num'``
    column of ``x`` as the stratification label, so every fold contains rows
    from every building. For each fold the model is fit on the training part
    and scored on the held-out part.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit
            on every fold, so after the call it holds the fit of the last fold.
        x: Feature DataFrame; must contain a ``'num'`` column.
        y: Target Series, positionally aligned with ``x``.
        n_split: Number of folds.
        smape: When truthy, SMAPE is computed and printed in addition to RMSE.

    Returns:
        None. Per-fold and mean scores are printed to stdout.
    """
    rmse_scores = []
    smape_scores = []

    sfk = StratifiedKFold(n_split)

    # Convert to arrays once; calling `.values` inside the loop would repeat
    # the same conversion for every fold.
    x_values = x.values
    y_values = y.values

    for train_index, test_index in sfk.split(x, x['num']):
        x_fold_train, x_fold_valid = x_values[train_index], x_values[test_index]
        y_fold_train, y_fold_valid = y_values[train_index], y_values[test_index]

        clf.fit(x_fold_train, y_fold_train)
        pred = clf.predict(x_fold_valid)

        # RMSE is the square root of the fold's mean squared error.
        rmse_scores.append(np.sqrt(mean_squared_error(y_fold_valid, pred)))

        if smape:
            smape_scores.append(SMAPE(y_fold_valid, pred))

    print('교차검증 RMSE: ', rmse_scores)
    print('평균 RMSE: ', np.mean(rmse_scores))

    if smape:
        print('교차검증 SMAPE: ', smape_scores)
        print('평균 SMAPE: ', np.mean(smape_scores))

#### __LGBM__

In [10]:
# Hyper-parameters for the LightGBM regressor.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is non-zero — confirm
# whether row subsampling was intended here.
lgbm_params = {
    'n_estimators': 1000,
    'max_depth': 30,
    'min_child_samples': 100,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'random_state': 10,
    'verbose': 0,
}
lgbm = LGBMRegressor(**lgbm_params)

# 5-fold cross-validation stratified by building number; prints RMSE and SMAPE.
num_sfkfold_split(lgbm, x_train, y_train, n_split=5)

교차검증 RMSE:  [266.67149500816794, 220.80908530618083, 229.59069210147408, 245.53738351298205, 373.19017218117654]
평균 RMSE:  267.1597656219963
교차검증 SMAPE:  [8.305231283671297, 7.58543196892391, 7.425890115131906, 8.05345920022577, 10.317295120149264]
평균 SMAPE:  8.337461537620431


#### __Grid Search__

데이터의 크기가 크고 학습량이 많아 XGBoost 모델을 사용하기엔 시간이 오래 걸려 부적합하다고 판단하였습니다.

따라서 LightGBM 모델을 사용하였으며, 모델 학습 시간을 단축시키고 성능을 향상시키기 위해 그리드서치를 진행하였으나, 해당 파트에서는 실행을 생략하고 코드만 주석으로 남겨 두었습니다.

In [24]:
# param_grid = {
#     'num_leaves': [31, 50, 100],  # 트리의 잎사귀 수
#     'learning_rate': [0.01, 0.1, 0.3],  # 학습률
#     'n_estimators': [100],  # 트리의 개수
#     'subsample': [0.7, 0.8, 1.0],  # 데이터 샘플링 비율
#     'colsample_bytree': [0.7, 0.8, 1.0],  # 특성 샘플링 비율
#     'min_child_samples': [5, 10, 20]  # 리프 노드의 최소 샘플 수
# }

# lgbm = LGBMRegressor(random_state = 10, verbose = -1)

# grid_search = GridSearchCV(
#     estimator = lgbm,
#     param_grid = param_grid,
#     cv = 5,
#     scoring='neg_mean_squared_error',
#     n_jobs=-1,  
#     verbose=2,  
#     refit=True
# )
# grid_search.fit(x_train, y_train)

# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

# with open('best_params.pkl', 'wb') as f:
#     pickle.dump(best_params, f)

# with open('best_model.pkl', 'wb') as f:
#     pickle.dump( grid_search.best_estimator_, f)

# with open('best_model.pkl', 'rb') as f:
#     best_model = pickle.load(f)

# num_sfkfold_split(best_model, x_train, y_train, n_split=5, smape=True)

### __제출__

In [11]:
# Refit on the full training set and predict the test set in one chain
# (fit() returns the fitted estimator itself).
y_pred = lgbm.fit(x_train, y_train).predict(x_test)

# Store the predictions in the submission template and write it to disk
# without the DataFrame index column.
submission['answer'] = y_pred
submission.to_csv('submission.csv', index=False)