In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression # 선형회귀
from sklearn.neighbors import KNeighborsRegressor # KNN 회귀
from sklearn.tree import DecisionTreeRegressor # 결정트리회귀
from sklearn.ensemble import RandomForestRegressor # 랜덤포레스트 회귀
from xgboost import XGBRegressor # GBT 병행학습
from lightgbm import LGBMRegressor # GBT
from sklearn.model_selection import cross_val_score # 교차검증
from sklearn.model_selection import KFold # KFold 교차검증

In [5]:
total = pd.read_csv('./data/한국가스공사_시간별 공급량_20181231.csv', encoding='cp949')

In [6]:
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   연월일     368088 non-null  object 
 1   시간      368088 non-null  int64  
 2   구분      368088 non-null  object 
 3   공급량     368088 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 11.2+ MB


In [7]:
d_map = {}
for i, d in enumerate(total['구분'].unique()):
    d_map[d] = i
total['구분'] = total['구분'].map(d_map)

In [8]:
total['연월일'] = pd.to_datetime(total['연월일'])

In [9]:
total['year'] = total['연월일'].dt.year
total['month'] = total['연월일'].dt.month
total['day'] = total['연월일'].dt.day
total['weekday'] = total['연월일'].dt.weekday

In [10]:
total.head()

Unnamed: 0,연월일,시간,구분,공급량,year,month,day,weekday
0,2013-01-01,1,0,2497.129,2013,1,1,1
1,2013-01-01,2,0,2363.265,2013,1,1,1
2,2013-01-01,3,0,2258.505,2013,1,1,1
3,2013-01-01,4,0,2243.969,2013,1,1,1
4,2013-01-01,5,0,2344.105,2013,1,1,1


In [22]:
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   연월일      368088 non-null  datetime64[ns]
 1   시간       368088 non-null  int64         
 2   구분       368088 non-null  int64         
 3   공급량      368088 non-null  float64       
 4   year     368088 non-null  int64         
 5   month    368088 non-null  int64         
 6   day      368088 non-null  int64         
 7   weekday  368088 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(6)
memory usage: 22.5 MB


In [16]:
X = total.drop(columns=['공급량', '연월일'])
y = total['공급량']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [18]:
X_train

Unnamed: 0,시간,구분,year,month,day,weekday
111594,19,5,2014,9,27,5
140874,19,2,2015,1,30,4
278684,21,3,2017,10,17,1
338641,2,3,2018,8,22,2
80117,6,2,2014,2,23,6
...,...,...,...,...,...,...
359783,24,6,2018,1,19,4
358083,4,5,2018,11,10,5
152315,12,3,2015,5,22,4
117952,17,6,2014,6,19,3


In [19]:
X_test

Unnamed: 0,시간,구분,year,month,day,weekday
341261,6,3,2018,12,9,6
209561,18,2,2016,11,30,2
254423,24,1,2017,1,9,0
25367,24,2,2013,11,23,5
271447,8,2,2017,12,20,2
...,...,...,...,...,...,...
145472,9,2,2015,8,10,0
12603,4,1,2013,6,10,0
202648,17,2,2016,2,16,1
148460,21,2,2015,12,12,5


In [20]:
y_train

111594    1280.501
140874     235.778
278684     565.001
338641     119.918
80117      174.178
            ...   
359783     463.290
358083    1458.782
152315     441.413
117952     119.316
305711     418.361
Name: 공급량, Length: 276066, dtype: float64

In [21]:
y_test

341261    1393.327
209561     170.978
254423    1778.549
25367      162.178
271447     285.617
            ...   
145472      98.978
12603       95.333
202648     179.778
148460     139.778
37239     1420.221
Name: 공급량, Length: 92022, dtype: float64

In [23]:
model_linear = LinearRegression()
model_knn = KNeighborsRegressor()
model_tree = DecisionTreeRegressor()
model_random = RandomForestRegressor()
model_xgb = XGBRegressor()
model_lgb = LGBMRegressor()

In [24]:
model_linear.fit(X_train, y_train)
model_knn.fit(X_train, y_train)
model_tree.fit(X_train, y_train)
model_random.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_lgb.fit(X_train, y_train)

LGBMRegressor()

In [25]:
print(model_linear.score(X_test, y_test))
print(model_knn.score(X_test, y_test))
print(model_tree.score(X_test, y_test))
print(model_random.score(X_test, y_test))
print(model_xgb.score(X_test, y_test))
print(model_lgb.score(X_test, y_test))

0.03789955959459701
0.7736978870476603
0.9837829942914313
0.9908948790843859
0.9832708309238951
0.9716443088489706


In [27]:
scores = cross_val_score(model_linear, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_linear,scores))
scores = cross_val_score(model_knn, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_knn,scores))
scores = cross_val_score(model_tree, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_tree,scores))
scores = cross_val_score(model_random, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_random,scores))
scores = cross_val_score(model_xgb, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_xgb,scores))
scores = cross_val_score(model_lgb, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_lgb,scores))

LinearRegression() 교차 검증 점수 : [0.04162186 0.03716397 0.03736707 0.03972713 0.03370719]
KNeighborsRegressor() 교차 검증 점수 : [0.67675394 0.65513748 0.66452592 0.6593895  0.66572428]
DecisionTreeRegressor() 교차 검증 점수 : [0.95409252 0.95726286 0.9637787  0.95900953 0.955233  ]
RandomForestRegressor() 교차 검증 점수 : [0.9766035  0.97423775 0.97918187 0.97912476 0.97674989]
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 교차 검증 점수 