In [93]:
import numpy as np
import pandas as pd
import seaborn as sns
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression # 선형회귀
from sklearn.neighbors import KNeighborsRegressor # KNN 회귀
from sklearn.tree import DecisionTreeRegressor # 결정트리회귀
from sklearn.ensemble import RandomForestRegressor # 랜덤포레스트 회귀
from xgboost import XGBRegressor # GBT 병행학습
from lightgbm import LGBMRegressor # GBT
from sklearn.model_selection import cross_val_score # 교차검증
from sklearn.model_selection import KFold # KFold 교차검증

In [2]:
total = pd.read_csv('./data/한국가스공사_시간별 공급량_20181231.csv', encoding='cp949')

In [3]:
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   연월일     368088 non-null  object 
 1   시간      368088 non-null  int64  
 2   구분      368088 non-null  object 
 3   공급량     368088 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 11.2+ MB


In [4]:
d_map = {}
for i, d in enumerate(total['구분'].unique()):
    d_map[d] = i
total['구분'] = total['구분'].map(d_map)

In [5]:
total['연월일'] = pd.to_datetime(total['연월일'])

In [6]:
total['year'] = total['연월일'].dt.year
total['month'] = total['연월일'].dt.month
total['day'] = total['연월일'].dt.day
total['weekday'] = total['연월일'].dt.weekday

In [7]:
total.head()

Unnamed: 0,연월일,시간,구분,공급량,year,month,day,weekday
0,2013-01-01,1,0,2497.129,2013,1,1,1
1,2013-01-01,2,0,2363.265,2013,1,1,1
2,2013-01-01,3,0,2258.505,2013,1,1,1
3,2013-01-01,4,0,2243.969,2013,1,1,1
4,2013-01-01,5,0,2344.105,2013,1,1,1


In [8]:
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   연월일      368088 non-null  datetime64[ns]
 1   시간       368088 non-null  int64         
 2   구분       368088 non-null  int64         
 3   공급량      368088 non-null  float64       
 4   year     368088 non-null  int64         
 5   month    368088 non-null  int64         
 6   day      368088 non-null  int64         
 7   weekday  368088 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(6)
memory usage: 22.5 MB


In [9]:
X = total.drop(columns=['공급량', '연월일'])
y = total['공급량']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [11]:
X_train

Unnamed: 0,시간,구분,year,month,day,weekday
111594,19,5,2014,9,27,5
140874,19,2,2015,1,30,4
278684,21,3,2017,10,17,1
338641,2,3,2018,8,22,2
80117,6,2,2014,2,23,6
...,...,...,...,...,...,...
359783,24,6,2018,1,19,4
358083,4,5,2018,11,10,5
152315,12,3,2015,5,22,4
117952,17,6,2014,6,19,3


In [12]:
X_test

Unnamed: 0,시간,구분,year,month,day,weekday
341261,6,3,2018,12,9,6
209561,18,2,2016,11,30,2
254423,24,1,2017,1,9,0
25367,24,2,2013,11,23,5
271447,8,2,2017,12,20,2
...,...,...,...,...,...,...
145472,9,2,2015,8,10,0
12603,4,1,2013,6,10,0
202648,17,2,2016,2,16,1
148460,21,2,2015,12,12,5


In [13]:
y_train

111594    1280.501
140874     235.778
278684     565.001
338641     119.918
80117      174.178
            ...   
359783     463.290
358083    1458.782
152315     441.413
117952     119.316
305711     418.361
Name: 공급량, Length: 276066, dtype: float64

In [14]:
y_test

341261    1393.327
209561     170.978
254423    1778.549
25367      162.178
271447     285.617
            ...   
145472      98.978
12603       95.333
202648     179.778
148460     139.778
37239     1420.221
Name: 공급량, Length: 92022, dtype: float64

In [15]:
model_linear = LinearRegression()
model_knn = KNeighborsRegressor()
model_tree = DecisionTreeRegressor()
model_random = RandomForestRegressor()
model_xgb = XGBRegressor()
model_lgb = LGBMRegressor()

In [16]:
model_linear.fit(X_train, y_train)
model_knn.fit(X_train, y_train)
model_tree.fit(X_train, y_train)
model_random.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_lgb.fit(X_train, y_train)

LGBMRegressor()

In [17]:
print(model_linear.score(X_test, y_test))
print(model_knn.score(X_test, y_test))
print(model_tree.score(X_test, y_test))
print(model_random.score(X_test, y_test))
print(model_xgb.score(X_test, y_test))
print(model_lgb.score(X_test, y_test))

0.037899559594597454
0.7736978870476603
0.9837068376336889
0.9909273114625843
0.9832708309238951
0.9716443088489706


In [18]:
scores = cross_val_score(model_linear, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_linear,scores))
scores = cross_val_score(model_knn, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_knn,scores))
scores = cross_val_score(model_tree, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_tree,scores))
scores = cross_val_score(model_random, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_random,scores))
scores = cross_val_score(model_xgb, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_xgb,scores))
scores = cross_val_score(model_lgb, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_lgb,scores))

LinearRegression() 교차 검증 점수 : [0.04162186 0.03716397 0.03736707 0.03972713 0.03370719]
KNeighborsRegressor() 교차 검증 점수 : [0.67675394 0.65513748 0.66452592 0.6593895  0.66572428]
DecisionTreeRegressor() 교차 검증 점수 : [0.95402424 0.95747597 0.96327034 0.95964736 0.9558149 ]
RandomForestRegressor() 교차 검증 점수 : [0.97625736 0.97412779 0.97919697 0.97922324 0.97568182]
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 교차 검증 점수 

In [79]:
sub = pd.read_csv('./data/sample_submission.csv')

In [80]:
sub

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0
...,...,...
15115,2019-03-31 20 H,0
15116,2019-03-31 21 H,0
15117,2019-03-31 22 H,0
15118,2019-03-31 23 H,0


In [81]:
sub['일자'] = sub['일자|시간|구분'].str.split().str[0]

In [82]:
sub['일자|시간|구분'].str.split()

0        [2019-01-01, 01, A]
1        [2019-01-01, 02, A]
2        [2019-01-01, 03, A]
3        [2019-01-01, 04, A]
4        [2019-01-01, 05, A]
                ...         
15115    [2019-03-31, 20, H]
15116    [2019-03-31, 21, H]
15117    [2019-03-31, 22, H]
15118    [2019-03-31, 23, H]
15119    [2019-03-31, 24, H]
Name: 일자|시간|구분, Length: 15120, dtype: object

In [83]:
sub['시간'] = sub['일자|시간|구분'].str.split().str[1]

In [84]:
sub['구분'] = sub['일자|시간|구분'].str.split().str[2]

In [85]:
d_map = {}
for i, d in enumerate(sub['구분'].unique()):
    d_map[d] = i
sub['구분'] = sub['구분'].map(d_map)

In [86]:
sub['일자'] = pd.to_datetime(sub['일자'])

In [87]:
sub['year'] = sub['일자'].dt.year
sub['month'] = sub['일자'].dt.month
sub['day'] = sub['일자'].dt.day
sub['weekday'] = sub['일자'].dt.weekday

In [88]:
X_sub = sub[["시간", '구분', 'year', 'month', 'day', 'weekday']]

In [98]:
X_sub

Unnamed: 0,시간,구분,year,month,day,weekday
0,01,0,2019,1,1,1
1,02,0,2019,1,1,1
2,03,0,2019,1,1,1
3,04,0,2019,1,1,1
4,05,0,2019,1,1,1
...,...,...,...,...,...,...
15115,20,6,2019,3,31,6
15116,21,6,2019,3,31,6
15117,22,6,2019,3,31,6
15118,23,6,2019,3,31,6


In [90]:
sub

Unnamed: 0,일자|시간|구분,공급량,일자,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,0,2019-01-01,01,0,2019,1,1,1
1,2019-01-01 02 A,0,2019-01-01,02,0,2019,1,1,1
2,2019-01-01 03 A,0,2019-01-01,03,0,2019,1,1,1
3,2019-01-01 04 A,0,2019-01-01,04,0,2019,1,1,1
4,2019-01-01 05 A,0,2019-01-01,05,0,2019,1,1,1
...,...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,0,2019-03-31,20,6,2019,3,31,6
15116,2019-03-31 21 H,0,2019-03-31,21,6,2019,3,31,6
15117,2019-03-31 22 H,0,2019-03-31,22,6,2019,3,31,6
15118,2019-03-31 23 H,0,2019-03-31,23,6,2019,3,31,6


In [94]:
model_random = RandomForestRegressor(max_depth= 12, max_features= 'auto', min_samples_split= 0.01, n_estimators= 200)

In [96]:
model_random

RandomForestRegressor(max_depth=12, min_samples_split=0.01, n_estimators=200)

In [100]:
model_random.fit(X_train, y_train)
print(model_random.score(X_test, y_test))
scores = cross_val_score(model_random, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_random,scores))

0.8998167343977403
RandomForestRegressor(max_depth=12, min_samples_split=0.01, n_estimators=200) 교차 검증 점수 : [0.90164452 0.89785026 0.90258528 0.90089884 0.90227773]


In [103]:
pred = model_random.predict(X_sub)

In [104]:
pred

array([1682.18485848, 1682.18485848, 1682.18485848, ...,  331.79239567,
        331.79239567,  331.79239567])

In [105]:
sub['공급량'] = pred

In [106]:
sub['공급량'].unique()

array([1682.18485848, 2544.39045135, 2270.90220621, 2216.34778678,
       2230.94011078, 1559.61512354, 1524.40159128, 1417.35730523,
       1170.99464393, 2244.8263917 , 1925.64693773, 1940.23926174,
        218.64716166,  233.78324178,  156.83246612, 1067.38262382,
       1070.74311873, 1092.69497673, 1275.82825397, 1470.07689172,
        872.54353441, 2322.14946362, 2455.21584977, 2778.3975713 ,
       3175.70682842, 1820.74159227, 2834.10105523, 3920.60303082,
       3551.13017432, 2571.36204049,  406.34400996,  495.92733735,
        500.88660568,  494.82995469,  561.8224196 ,  331.79239567])

In [107]:
sub

Unnamed: 0,일자|시간|구분,공급량,일자,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,1682.184858,2019-01-01,01,0,2019,1,1,1
1,2019-01-01 02 A,1682.184858,2019-01-01,02,0,2019,1,1,1
2,2019-01-01 03 A,1682.184858,2019-01-01,03,0,2019,1,1,1
3,2019-01-01 04 A,1682.184858,2019-01-01,04,0,2019,1,1,1
4,2019-01-01 05 A,1682.184858,2019-01-01,05,0,2019,1,1,1
...,...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,331.792396,2019-03-31,20,6,2019,3,31,6
15116,2019-03-31 21 H,331.792396,2019-03-31,21,6,2019,3,31,6
15117,2019-03-31 22 H,331.792396,2019-03-31,22,6,2019,3,31,6
15118,2019-03-31 23 H,331.792396,2019-03-31,23,6,2019,3,31,6


In [108]:
sub = sub[['일자|시간|구분', '공급량']]

In [112]:
sub

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,1682.184858
1,2019-01-01 02 A,1682.184858
2,2019-01-01 03 A,1682.184858
3,2019-01-01 04 A,1682.184858
4,2019-01-01 05 A,1682.184858
...,...,...
15115,2019-03-31 20 H,331.792396
15116,2019-03-31 21 H,331.792396
15117,2019-03-31 22 H,331.792396
15118,2019-03-31 23 H,331.792396


In [113]:
sub.to_csv('./data/random_submission.csv', index=False)

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 6, 9, 12],
    'min_samples_split': [0.01, 0.05, 0.1]
}

In [22]:
kf = KFold(random_state=30,
           n_splits=10,
           shuffle=True,
          )

In [23]:
grid_search = GridSearchCV(estimator=model_lgb, 
                           param_grid=param_grid, 
                           cv=kf, 
                           n_jobs=-1, 
                           verbose=2
                          )

In [24]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


GridSearchCV(cv=KFold(n_splits=10, random_state=30, shuffle=True),
             estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'max_depth': [None, 6, 9, 12],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_split': [0.01, 0.05, 0.1],
                         'n_estimators': [100, 150, 200, 250]},
             verbose=2)

In [25]:
grid_search.best_params_

{'max_depth': None,
 'max_features': 'auto',
 'min_samples_split': 0.01,
 'n_estimators': 250}

In [26]:
model_lgb = LGBMRegressor(max_depth=None, max_features='auto', min_samples_split=0.01, n_estimators=250)

In [27]:
model_lgb.fit(X_train, y_train)

LGBMRegressor(max_depth=None, max_features='auto', min_samples_split=0.01,
              n_estimators=250)

In [30]:
model_lgb.score(X_test, y_test)

0.9821439321117723

In [29]:
scores = cross_val_score(model_lgb, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_lgb,scores))

LGBMRegressor(max_depth=None, max_features='auto', min_samples_split=0.01,
              n_estimators=250) 교차 검증 점수 : [0.98231742 0.97633496 0.98271169 0.98338687 0.98167819]


In [23]:
grid_search = GridSearchCV(estimator=model_random, 
                           param_grid=param_grid, 
                           cv=kf, 
                           n_jobs=-1, 
                           verbose=2
                          )

In [None]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


In [None]:
grid_search.best_params_