In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression # 선형회귀
from sklearn.neighbors import KNeighborsRegressor # KNN 회귀
from sklearn.tree import DecisionTreeRegressor # 결정트리회귀
from sklearn.ensemble import RandomForestRegressor # 랜덤포레스트 회귀
from xgboost import XGBRegressor # GBT 병행학습
from lightgbm import LGBMRegressor # GBT
from sklearn.model_selection import cross_val_score # 교차검증
from sklearn.model_selection import KFold # KFold 교차검증

In [2]:
def nmae(true_df, pred_df):
    target_idx = true_df.iloc[:,0]
    pred_df = pred_df[pred_df.iloc[:,0].isin(target_idx)]
    pred_df = pred_df.sort_values(by=[pred_df.columns[0]], ascending=[True])
    true_df = true_df.sort_values(by=[true_df.columns[0]], ascending=[True])
    
    true = true_df.iloc[:,1].to_numpy()
    pred = pred_df.iloc[:,1].to_numpy()
    
    score = np.mean((np.abs(true-pred))/true)
    
    return score

In [3]:
total = pd.read_csv('./data/2013-2018년_가스공급량과_기온2.csv')

In [4]:
sub = pd.read_csv('./data/sample_submission.csv')

In [5]:
total = total.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', '연월일'])

In [6]:
total

Unnamed: 0,시간,구분,공급량,year,month,day,weekday,기온
0,1,0,2497.129,2013,1,1,1,-6.4
1,1,1,2169.093,2013,1,1,1,-6.4
2,1,2,226.178,2013,1,1,1,-6.4
3,1,3,1434.516,2013,1,1,1,-6.4
4,1,4,3272.837,2013,1,1,1,-6.4
...,...,...,...,...,...,...,...,...
368083,24,2,237.911,2018,12,31,0,-2.9
368084,24,3,1422.478,2018,12,31,0,-2.9
368085,24,4,3534.260,2018,12,31,0,-2.9
368086,24,5,3982.757,2018,12,31,0,-2.9


In [7]:
sub['일자'] = sub['일자|시간|구분'].str.split().str[0]
sub['시간'] = sub['일자|시간|구분'].str.split().str[1]
sub['구분'] = sub['일자|시간|구분'].str.split().str[2]
d_map = {}
for i, d in enumerate(sub['구분'].unique()):
    d_map[d] = i
sub['구분'] = sub['구분'].map(d_map)
sub['일자'] = pd.to_datetime(sub['일자'])
sub['year'] = sub['일자'].dt.year
sub['month'] = sub['일자'].dt.month
sub['day'] = sub['일자'].dt.day
sub['weekday'] = sub['일자'].dt.weekday

In [8]:
sub

Unnamed: 0,일자|시간|구분,공급량,일자,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,0,2019-01-01,01,0,2019,1,1,1
1,2019-01-01 02 A,0,2019-01-01,02,0,2019,1,1,1
2,2019-01-01 03 A,0,2019-01-01,03,0,2019,1,1,1
3,2019-01-01 04 A,0,2019-01-01,04,0,2019,1,1,1
4,2019-01-01 05 A,0,2019-01-01,05,0,2019,1,1,1
...,...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,0,2019-03-31,20,6,2019,3,31,6
15116,2019-03-31 21 H,0,2019-03-31,21,6,2019,3,31,6
15117,2019-03-31 22 H,0,2019-03-31,22,6,2019,3,31,6
15118,2019-03-31 23 H,0,2019-03-31,23,6,2019,3,31,6


In [9]:
X = total.loc[:, ['시간', 'day', 'month']]
y = total['기온']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [11]:
X_train

Unnamed: 0,시간,day,month
111594,7,27,10
140874,13,19,4
278684,21,17,7
338641,18,9,7
80117,22,22,4
...,...,...,...
359783,14,12,11
358083,11,2,11
152315,16,26,6
117952,3,4,12


In [12]:
y_train

111594     8.7
140874    17.1
278684    24.1
338641    25.0
80117     10.0
          ... 
359783    13.7
358083    13.7
152315    23.4
117952     2.4
305711     1.4
Name: 기온, Length: 276066, dtype: float64

In [13]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
kfold = KFold(shuffle=True, random_state=0)
score1 = cross_val_score(xgb, X_train, y_train, cv=kfold).mean()
score2 = cross_val_score(xgb, X_test, y_test, cv=kfold).mean()

In [14]:
print(f"훈련 셋 결정계수 = {score1}")
print(f"테스트 셋 결정계수 = {score2}")

훈련 셋 결정계수 = 0.9069062941975332
테스트 셋 결정계수 = 0.9060699390352909


In [15]:
col = ['month', 'day', '시간']
temp_X = sub[col]

In [16]:
temp_pred = xgb.predict(temp_X)
temp_pred

array([-1.2902024 ,  0.16236527,  4.2840543 , ..., -1.4141917 ,
       -1.4141917 , -1.4141917 ], dtype=float32)

In [17]:
sub['기온'] = temp_pred
sub

Unnamed: 0,일자|시간|구분,공급량,일자,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,0,2019-01-01,01,0,2019,1,1,1,-1.290202
1,2019-01-01 02 A,0,2019-01-01,02,0,2019,1,1,1,0.162365
2,2019-01-01 03 A,0,2019-01-01,03,0,2019,1,1,1,4.284054
3,2019-01-01 04 A,0,2019-01-01,04,0,2019,1,1,1,7.582557
4,2019-01-01 05 A,0,2019-01-01,05,0,2019,1,1,1,10.436240
...,...,...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,0,2019-03-31,20,6,2019,3,31,6,-1.414192
15116,2019-03-31 21 H,0,2019-03-31,21,6,2019,3,31,6,-1.414192
15117,2019-03-31 22 H,0,2019-03-31,22,6,2019,3,31,6,-1.414192
15118,2019-03-31 23 H,0,2019-03-31,23,6,2019,3,31,6,-1.414192


In [18]:
X_sub = sub[["시간", 'month', '기온']]

In [19]:
X_sub

Unnamed: 0,시간,month,기온
0,01,1,-1.290202
1,02,1,0.162365
2,03,1,4.284054
3,04,1,7.582557
4,05,1,10.436240
...,...,...,...
15115,20,3,-1.414192
15116,21,3,-1.414192
15117,22,3,-1.414192
15118,23,3,-1.414192


In [20]:
X = total[["시간", 'month', '기온']]
y = total['공급량']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [21]:
X_train

Unnamed: 0,시간,month,기온
111594,7,10,8.7
140874,13,4,17.1
278684,21,7,24.1
338641,18,7,25.0
80117,22,4,10.0
...,...,...,...
359783,14,11,13.7
358083,11,11,13.7
152315,16,6,23.4
117952,3,12,2.4


In [23]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)
model_xgb.score(X_test, y_test)

0.35648751785055277

In [24]:
pred = model_xgb.predict(X_sub)

In [25]:
sub['공급량'] = pred
sub

Unnamed: 0,일자|시간|구분,공급량,일자,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,1507.125854,2019-01-01,01,0,2019,1,1,1,-1.290202
1,2019-01-01 02 A,1337.939697,2019-01-01,02,0,2019,1,1,1,0.162365
2,2019-01-01 03 A,1159.983887,2019-01-01,03,0,2019,1,1,1,4.284054
3,2019-01-01 04 A,1067.802979,2019-01-01,04,0,2019,1,1,1,7.582557
4,2019-01-01 05 A,1122.794067,2019-01-01,05,0,2019,1,1,1,10.436240
...,...,...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,2025.036011,2019-03-31,20,6,2019,3,31,6,-1.414192
15116,2019-03-31 21 H,2130.130615,2019-03-31,21,6,2019,3,31,6,-1.414192
15117,2019-03-31 22 H,1905.174194,2019-03-31,22,6,2019,3,31,6,-1.414192
15118,2019-03-31 23 H,1742.011841,2019-03-31,23,6,2019,3,31,6,-1.414192


In [27]:
sub = sub.loc[:, ['일자|시간|구분', '공급량']]

In [28]:
sub.to_csv('./data/xgboost_3.csv', index=False)

In [23]:
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [43]:
from sklearn.metrics import mean_squared_error

def RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [44]:
space={'max_depth': hp.quniform("max_depth", 3, 12, 3),
       'subsample': hp.quniform('subsample', 0.6, 0.8, 0.1),
       'colsample_bytree' : hp.quniform('colsample_bytree', 0.3, 0.7, 0.1),
       'n_estimators': hp.quniform('n_estimators', 100, 250, 50)
      }

In [47]:
def hyperparameter_tuning(space):
    model=XGBRegressor(n_estimators =int(space['n_estimators']), 
                       max_depth = int(space['max_depth']), 
                       subsample = space['subsample'],
                       colsample_bytree = space['colsample_bytree'],
                       random_state=777
                      )
    
    evaluation = [(X_train, y_train), (X_test, y_test)]
    
    model.fit(X_train, y_train,
              eval_set=evaluation, 
              eval_metric="rmse",
              early_stopping_rounds=20,
              verbose=0)

    pred = model.predict(X_test)
    rmse= RMSE(y_test, pred)   
    # 평가 방식 선정
    return {'loss':rmse, 'status': STATUS_OK, 'model': model}

In [48]:
# Trials 객체 선언합니다.
trials = Trials()
# best에 최적의 하이퍼 파라미터를 return 받습니다.
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=50, # 최대 반복 횟수를 지정합니다.
            trials=trials)

# 최적화된 결과를 int로 변환해야하는 파라미터는 타입 변환을 수행합니다.
best['max_depth'] = int(best['max_depth'])
best['n_estimators'] = int(best['n_estimators'])
print (best)

100%|██████████████████████████████████████████████████| 50/50 [08:26<00:00, 10.14s/trial, best loss: 181.059582062392]
{'colsample_bytree': 0.7000000000000001, 'max_depth': 12, 'n_estimators': 250, 'subsample': 0.8}


In [50]:
model = XGBRegressor(n_estimators=250,
                    max_depth=12,
                    subsample=0.8,
                    colsample_bytree=0.7,
                    learning_rate = 0.05)

In [51]:
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=12, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=250, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [56]:
pred = model.predict(X_sub)

In [55]:
model.score(X_test, y_test)

0.961029328472341

In [57]:
pred

array([1977.262  , 2083.3193 , 1388.9834 , ...,  582.4141 ,  513.29126,
        428.29764], dtype=float32)

In [58]:
sub

Unnamed: 0,일자|시간|구분,공급량,일자,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,0,2019-01-01,01,0,2019,1,1,1,-1.290202
1,2019-01-01 02 A,0,2019-01-01,02,0,2019,1,1,1,0.162365
2,2019-01-01 03 A,0,2019-01-01,03,0,2019,1,1,1,4.284054
3,2019-01-01 04 A,0,2019-01-01,04,0,2019,1,1,1,7.582557
4,2019-01-01 05 A,0,2019-01-01,05,0,2019,1,1,1,10.436240
...,...,...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,0,2019-03-31,20,6,2019,3,31,6,-1.414192
15116,2019-03-31 21 H,0,2019-03-31,21,6,2019,3,31,6,-1.414192
15117,2019-03-31 22 H,0,2019-03-31,22,6,2019,3,31,6,-1.414192
15118,2019-03-31 23 H,0,2019-03-31,23,6,2019,3,31,6,-1.414192


In [59]:
sub['공급량'] = pred

In [60]:
sub

Unnamed: 0,일자|시간|구분,공급량,일자,시간,구분,year,month,day,weekday,기온
0,2019-01-01 01 A,1977.261963,2019-01-01,01,0,2019,1,1,1,-1.290202
1,2019-01-01 02 A,2083.319336,2019-01-01,02,0,2019,1,1,1,0.162365
2,2019-01-01 03 A,1388.983398,2019-01-01,03,0,2019,1,1,1,4.284054
3,2019-01-01 04 A,1192.166626,2019-01-01,04,0,2019,1,1,1,7.582557
4,2019-01-01 05 A,1207.223877,2019-01-01,05,0,2019,1,1,1,10.436240
...,...,...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,671.220032,2019-03-31,20,6,2019,3,31,6,-1.414192
15116,2019-03-31 21 H,710.589661,2019-03-31,21,6,2019,3,31,6,-1.414192
15117,2019-03-31 22 H,582.414124,2019-03-31,22,6,2019,3,31,6,-1.414192
15118,2019-03-31 23 H,513.291260,2019-03-31,23,6,2019,3,31,6,-1.414192


In [61]:
sub = sub[['일자|시간|구분', '공급량']]

In [62]:
sub

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,1977.261963
1,2019-01-01 02 A,2083.319336
2,2019-01-01 03 A,1388.983398
3,2019-01-01 04 A,1192.166626
4,2019-01-01 05 A,1207.223877
...,...,...
15115,2019-03-31 20 H,671.220032
15116,2019-03-31 21 H,710.589661
15117,2019-03-31 22 H,582.414124
15118,2019-03-31 23 H,513.291260


In [63]:
sub.to_csv('./data/xgboost_hyperopt.csv', index=False)

In [68]:
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error

# regularization candiate 정의
reg_candidate = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 100]

# space 정의, Hyperparameter의 이름을 key 값으로 입력
space={'max_depth': hp.quniform("max_depth", 5, 15, 1),
       'learning_rate': hp.quniform ('learning_rate', 0.01, 0.05, 0.005),
       'reg_alpha' : hp.choice('reg_alpha', reg_candidate),
       'reg_lambda' : hp.choice('reg_lambda', reg_candidate),
       'subsample': hp.quniform('subsample', 0.6, 1, 0.05),
       'colsample_bytree' : hp.quniform('colsample_bytree', 0.6, 1, 0.05),
       'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
       'n_estimators': hp.quniform('n_estimators', 200, 1500, 100)
      }

# 목적 함수 정의
# n_estimators, max_depth와 같은 반드시 int 타입을 가져야 하는 hyperparamter는 int로 타입 캐스팅 합니다.
def hyperparameter_tuning(space):
    model=XGBRegressor(n_estimators =int(space['n_estimators']), 
                       max_depth = int(space['max_depth']), 
                       learning_rate = space['learning_rate'],
                       reg_alpha = space['reg_alpha'],
                       reg_lambda = space['reg_lambda'],
                       subsample = space['subsample'],
                       colsample_bytree = space['colsample_bytree'], 
                       min_child_weight = int(space['min_child_weight']),
                      )
    
    evaluation = [(X_train, y_train), (X_test, y_test)]
    
    model.fit(X_train, y_train,
              eval_set=evaluation, 
              eval_metric="rmse",
              early_stopping_rounds=20,
              verbose=0)

    pred = model.predict(X_test)
    rmse= RMSE(y_test, pred)    
    # 평가 방식 선정
    return {'loss':rmse, 'status': STATUS_OK, 'model': model}

In [69]:
# Trials 객체 선언합니다.
trials = Trials()
# best에 최적의 하이퍼 파라미터를 return 받습니다.
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=50, # 최대 반복 횟수를 지정합니다.
            trials=trials)

# 최적화된 결과를 int로 변환해야하는 파라미터는 타입 변환을 수행합니다.
best['max_depth'] = int(best['max_depth'])
best['min_child_weight'] = int(best['min_child_weight'])
best['n_estimators'] = int(best['n_estimators'])
best['reg_alpha'] = reg_candidate[int(best['reg_alpha'])]
best['reg_lambda'] = reg_candidate[int(best['reg_lambda'])]
print (best)

100%|████████████████████████████████████████████████| 50/50 [42:43<00:00, 51.27s/trial, best loss: 170.75220489674362]
{'colsample_bytree': 0.75, 'learning_rate': 0.035, 'max_depth': 13, 'min_child_weight': 2, 'n_estimators': 1300, 'reg_alpha': 5, 'reg_lambda': 5, 'subsample': 1.0}


In [70]:
model=XGBRegressor(n_estimators = 1300, 
                       max_depth = 13, 
                       learning_rate = 0.035,
                       reg_alpha = 5,
                       reg_lambda = 5,
                       subsample = 1.0,
                       colsample_bytree = 0.75, 
                       min_child_weight = 2
                      )

In [71]:
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.75,
             enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.035, max_delta_step=0,
             max_depth=13, min_child_weight=2, missing=nan,
             monotone_constraints='()', n_estimators=1300, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=5,
             reg_lambda=5, scale_pos_weight=1, subsample=1.0,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [72]:
model.score(X_test, y_test)

0.9664478961459128

In [75]:
X_sub

Unnamed: 0,시간,구분,month,기온
0,01,0,1,-1.290202
1,02,0,1,0.162365
2,03,0,1,4.284054
3,04,0,1,7.582557
4,05,0,1,10.436240
...,...,...,...,...
15115,20,6,3,-1.414192
15116,21,6,3,-1.414192
15117,22,6,3,-1.414192
15118,23,6,3,-1.414192


In [73]:
pred = model.predict(X_sub)

In [74]:
sub['공급량'] = pred
sub

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['공급량'] = pred


Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2046.595581
1,2019-01-01 02 A,1764.973511
2,2019-01-01 03 A,1366.938721
3,2019-01-01 04 A,1253.901733
4,2019-01-01 05 A,1308.986572
...,...,...
15115,2019-03-31 20 H,648.791992
15116,2019-03-31 21 H,684.039551
15117,2019-03-31 22 H,544.907288
15118,2019-03-31 23 H,538.100159


In [76]:
sub.to_csv('./data/xgboost_hyperopt2.csv', index=False)