In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# 한글폰트 설정
import platform

from matplotlib import font_manager, rc
# Render the minus sign as an ASCII hyphen; many Hangul fonts lack the
# Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Font files loaded by path, keyed by the value of platform.system().
# macOS is handled separately because it selects the font by family name.
font_files = {
    'Windows': "c:/Windows/Fonts/malgun.ttf",
    'Linux': "/usr/share/fonts/NanumFont/NanumGothicBold.ttf",
}

system_name = platform.system()
if system_name == 'Darwin':
    rc('font', family='AppleGothic')
    print('Mac version')
elif system_name in font_files:
    path = font_files[system_name]
    try:
        font_name = font_manager.FontProperties(fname=path).get_name()
    except (OSError, RuntimeError) as exc:
        # A missing or unreadable font file should not abort the notebook;
        # report it and keep matplotlib's default font.
        print('Could not load font file {}: {}'.format(path, exc))
    else:
        rc('font', family=font_name)
        print('{} version'.format(system_name))
else:
    print('Unknown system... sorry~~~~')

In [1]:
# Packages used in this notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
from datetime import date

In [2]:
# Load the train and test CSVs from the datasets folder, decoding them as CP949.
root = "./"
train_df, test_df = (
    pd.read_csv("{}datasets/{}.csv".format(root, split_name), encoding='CP949')
    for split_name in ("train", "test")
)

In [59]:
from sklearn.metrics import make_scorer  # loss function 커스터마이징
# SMAPE definition
def smape(true, pred) :
    """Return the mean of |true - pred| / (|true| + |pred|).

    This is SMAPE without the constant factors (*2 and *100), which do not
    change the ranking of models. Terms where both the true and predicted
    value are zero are counted as 0 (a perfect prediction) instead of the
    NaN that 0/0 would produce.

    Parameters
    ----------
    true : array-like of numbers
        Observed values.
    pred : array-like of numbers
        Predicted values, broadcastable against `true`.

    Returns
    -------
    float
        The mean ratio; lower is better.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    abs_error = np.abs(true - pred)
    scale = np.abs(true) + np.abs(pred)
    # Divide only where the denominator is non-zero; other entries keep the
    # 0.0 they were initialised with.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio)
# Wrap smape as a scorer; greater_is_better=False because a smaller smape is better.
SMAPE = make_scorer(smape, greater_is_better=False)

In [60]:
# Splitter based on StratifiedKFold
from sklearn.model_selection import StratifiedKFold
# Five folds; the labels to stratify on are supplied when split() is called.
splits = StratifiedKFold(n_splits = 5)

In [61]:
# Walk through the folds, stratified on the `num` column, and print for each
# one the number of distinct rows in X_train and distinct values in y_train.
# After the loop, X_train / y_train / X_val / y_val hold the LAST fold's split.
for n_iter, (train_idx, valid_idx) in enumerate(splits.split(X, X['num']), start=1):
    X_train = X.iloc[train_idx, :]
    # split() yields positional indices, so y is indexed by position too.
    # Plain y[train_idx] is label-based on a Series and only matches when
    # the index is the default 0..n-1 range.
    y_train = y.iloc[train_idx]

    X_val = X.iloc[valid_idx, :]
    y_val = y.iloc[valid_idx]
    print("-----{}번째 교차검증-----".format(n_iter))
    print("X_train 교차검증 개수: \n{}".format(len(X_train.value_counts())))
    print("y_train 교차검증 개수: \n{}".format(len(y_train.value_counts())), end = '\n\n')

-----1번째 교차검증-----
X_train 교차검증 개수: 
97918
y_train 교차검증 개수: 
46610

-----2번째 교차검증-----
X_train 교차검증 개수: 
97919
y_train 교차검증 개수: 
46674

-----3번째 교차검증-----
X_train 교차검증 개수: 
97920
y_train 교차검증 개수: 
46339

-----4번째 교차검증-----
X_train 교차검증 개수: 
97919
y_train 교차검증 개수: 
46046

-----5번째 교차검증-----
X_train 교차검증 개수: 
97918
y_train 교차검증 개수: 
45497



In [160]:
# 하이퍼 파라미터 설정
# 그리드 서치로 가장 좋은 하이퍼 파라미터 선정
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values explored by the grid search (3 x 3 x 3 = 27 combinations).
# NOTE(review): with LightGBM's default leaf limit, max_depth of 1000 and
# 10000 presumably behave like -1 (no limit) — confirm before keeping all three.
param_grid = {
    'n_estimators': [100, 1000, 10000],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [-1, 1000, 10000],
}

In [161]:
# Base regressor with library defaults; the grid search overrides the tuned parameters.
lgbm = LGBMRegressor()

In [162]:
# Exhaustive search over param_grid with 5-fold CV, scored with the custom
# SMAPE scorer. GridSearchCV has no `early_stopping_round` argument (passing
# it raises TypeError), so it is not given here.
# NOTE(review): X_train / y_train are whatever the preceding fold loop
# assigned last, i.e. the final fold's training split.
grid_lgbm = GridSearchCV(lgbm, param_grid, cv = 5, verbose = 2, scoring = SMAPE)
grid_lgbm.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] learning_rate=0.1, max_depth=-1, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  learning_rate=0.1, max_depth=-1, n_estimators=100, total=   0.5s
[CV] learning_rate=0.1, max_depth=-1, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  learning_rate=0.1, max_depth=-1, n_estimators=100, total=   0.5s
[CV] learning_rate=0.1, max_depth=-1, n_estimators=100 ...............
[CV]  learning_rate=0.1, max_depth=-1, n_estimators=100, total=   0.5s
[CV] learning_rate=0.1, max_depth=-1, n_estimators=100 ...............
[CV]  learning_rate=0.1, max_depth=-1, n_estimators=100, total=   0.4s
[CV] learning_rate=0.1, max_depth=-1, n_estimators=100 ...............
[CV]  learning_rate=0.1, max_depth=-1, n_estimators=100, total=   0.4s
[CV] learning_rate=0.1, max_depth=-1, n_estimators=1000 ..............
[CV]  learning_rate=0.1, max_depth=-1, n_estimators=1000, total=   2.6s
[CV] learning_rate=0.1, max_depth=-1, n_estimators=1000 ..............
[CV]  learning_rate=0.1, max_depth=-1, n_estimators=1000, total=   2.7s
[CV] learning_rate=0.1, max_depth=-1, n_estimators=1000 ..............
[CV]  learning_rate=0.1, max_depth=-1, n_estimators=1000, total=   2.5s
[CV] learning_rate=0.1, max_depth=-1, n_estimators=1000 ..............
[CV

[CV]  learning_rate=0.01, max_depth=-1, n_estimators=10000, total=  29.6s
[CV] learning_rate=0.01, max_depth=-1, n_estimators=10000 ............
[CV]  learning_rate=0.01, max_depth=-1, n_estimators=10000, total=  30.4s
[CV] learning_rate=0.01, max_depth=1000, n_estimators=100 ............
[CV]  learning_rate=0.01, max_depth=1000, n_estimators=100, total=   0.6s
[CV] learning_rate=0.01, max_depth=1000, n_estimators=100 ............
[CV]  learning_rate=0.01, max_depth=1000, n_estimators=100, total=   0.5s
[CV] learning_rate=0.01, max_depth=1000, n_estimators=100 ............
[CV]  learning_rate=0.01, max_depth=1000, n_estimators=100, total=   0.6s
[CV] learning_rate=0.01, max_depth=1000, n_estimators=100 ............
[CV]  learning_rate=0.01, max_depth=1000, n_estimators=100, total=   0.5s
[CV] learning_rate=0.01, max_depth=1000, n_estimators=100 ............
[CV]  learning_rate=0.01, max_depth=1000, n_estimators=100, total=   0.5s
[CV] learning_rate=0.01, max_depth=1000, n_estimators=10

[CV]  learning_rate=0.001, max_depth=1000, n_estimators=10000, total=  37.0s
[CV] learning_rate=0.001, max_depth=1000, n_estimators=10000 .........
[CV]  learning_rate=0.001, max_depth=1000, n_estimators=10000, total=  38.9s
[CV] learning_rate=0.001, max_depth=1000, n_estimators=10000 .........
[CV]  learning_rate=0.001, max_depth=1000, n_estimators=10000, total=  35.9s
[CV] learning_rate=0.001, max_depth=1000, n_estimators=10000 .........
[CV]  learning_rate=0.001, max_depth=1000, n_estimators=10000, total=  35.1s
[CV] learning_rate=0.001, max_depth=1000, n_estimators=10000 .........
[CV]  learning_rate=0.001, max_depth=1000, n_estimators=10000, total=  35.6s
[CV] learning_rate=0.001, max_depth=10000, n_estimators=100 ..........
[CV]  learning_rate=0.001, max_depth=10000, n_estimators=100, total=   0.5s
[CV] learning_rate=0.001, max_depth=10000, n_estimators=100 ..........
[CV]  learning_rate=0.001, max_depth=10000, n_estimators=100, total=   0.5s
[CV] learning_rate=0.001, max_depth=1

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 27.8min finished


GridSearchCV(cv=5, estimator=LGBMRegressor(),
             param_grid={'learning_rate': [0.1, 0.01, 0.001],
                         'max_depth': [-1, 1000, 10000],
                         'n_estimators': [100, 1000, 10000]},
             scoring=make_scorer(smape, greater_is_better=False), verbose=2)

In [163]:
# Best hyperparameters, then their cross-validated score (negative because
# the scorer was built with greater_is_better=False).
print(grid_lgbm.best_params_, grid_lgbm.best_score_, sep='\n')

{'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 1000}
-0.2548294163788317


In [164]:
# Performance on X_val / y_val.
# score() uses the scorer passed to GridSearchCV (scoring = SMAPE), so the
# value shown is the sign-flipped smape.
grid_lgbm.score(X_val,y_val)

-0.07978504095978617

In [168]:
# Keep a handle on the estimator fitted with the best parameter combination.
best_estimator = grid_lgbm.best_estimator_

In [169]:
# Display the selected estimator; the repr lists only non-default parameters.
best_estimator

LGBMRegressor(learning_rate=0.01, n_estimators=1000)

학습률(learning_rate) 0.01, 트리 개수(n_estimators) 1000개일 때 가장 점수가 높게 나타났다.

In [170]:
# Check other performance metrics as well.
# NOTE(review): predictions are made on X_train, the data passed to fit(),
# so these are in-sample figures.
# r2_score is imported under an alias so that assigning the result to the
# name `r2_score` below does not rebind the imported function.
from sklearn.metrics import mean_squared_error, r2_score as compute_r2
y_pred = best_estimator.predict(X_train)


mse_score = mean_squared_error(y_train, y_pred)
# The result keeps the name `r2_score` because the next cell prints it.
r2_score = compute_r2(y_train, y_pred)

In [171]:
# Print each metric on its own line as "<label> <value>".
for metric_label, metric_value in (('MSE:', mse_score), ('R2 :', r2_score)):
    print(metric_label, metric_value)

MSE: 84554.92261442683
R2 : 0.9793812480214279


In [172]:
# Data to predict on.
# NOTE(review): the displayed frame includes `Unnamed: 0` and `pred` columns —
# confirm they match the features the model was trained on.
X_test.head()

Unnamed: 0,num,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,hour,week,weekend,pred
0,1,27.8,1.5,74.0,0.0,0.0,0.0,0.0,0,1,0,1
1,1,27.633333,1.366667,75.333333,0.0,0.0,0.0,0.0,1,1,0,1
2,1,27.466667,1.233333,76.666667,0.0,0.0,0.0,0.0,2,1,0,1
3,1,27.3,1.1,78.0,0.0,0.0,0.0,0.0,3,1,0,1
4,1,26.9,1.166667,79.666667,0.0,0.0,0.0,0.0,4,1,0,1


In [173]:
# Predict on the test features and wrap the result in a DataFrame.
test_predictions = best_estimator.predict(X_test)
y_submission = pd.DataFrame(test_predictions)

In [174]:
# Write the predictions to disk; to_csv's defaults also write the index and header row.
y_submission.to_csv('./y_submission.csv')