# 사용 패키지

In [1]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb

# 데이터 로드

In [3]:
total = pd.read_csv('./data/한국가스공사_시간별 공급량_20181231.csv', encoding='cp949')

In [4]:
total.head()

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,A,2497.129
1,2013-01-01,2,A,2363.265
2,2013-01-01,3,A,2258.505
3,2013-01-01,4,A,2243.969
4,2013-01-01,5,A,2344.105


# 전처리

In [5]:
total['구분'].unique()

array(['A', 'B', 'C', 'D', 'E', 'G', 'H'], dtype=object)

In [6]:
d_map = {}
for i, d in enumerate(total['구분'].unique()):
    d_map[d] = i
total['구분'] = total['구분'].map(d_map)

In [7]:
total['연월일'] = pd.to_datetime(total['연월일'])

In [8]:
total['year'] = total['연월일'].dt.year
total['month'] = total['연월일'].dt.month
total['day'] = total['연월일'].dt.day
total['weekday'] = total['연월일'].dt.weekday

In [9]:
print('total:\n', total)

total:
               연월일  시간  구분       공급량  year  month  day  weekday
0      2013-01-01   1   0  2497.129  2013      1    1        1
1      2013-01-01   2   0  2363.265  2013      1    1        1
2      2013-01-01   3   0  2258.505  2013      1    1        1
3      2013-01-01   4   0  2243.969  2013      1    1        1
4      2013-01-01   5   0  2344.105  2013      1    1        1
...           ...  ..  ..       ...   ...    ...  ...      ...
368083 2018-12-31  20   6   681.033  2018     12   31        0
368084 2018-12-31  21   6   669.961  2018     12   31        0
368085 2018-12-31  22   6   657.941  2018     12   31        0
368086 2018-12-31  23   6   610.953  2018     12   31        0
368087 2018-12-31  24   6   560.896  2018     12   31        0

[368088 rows x 8 columns]


In [10]:
train_years = [2013,2014,2015,2016,2017]
val_years = [2018]

In [11]:
train = total[total['year'].isin(train_years)]
val = total[total['year'].isin(val_years)]

In [12]:
features = ['구분', 'month', 'day', 'weekday', '시간']
train_x = train[features]
train_y = train['공급량']

val_x = val[features]
val_y = val['공급량']

# 학습

In [13]:
from xgboost import XGBRegressor

dtrain = xgb.DMatrix(data=train_x, label = train_y)
dval = xgb.DMatrix(data=val_x, label = val_y)
wlist = [(dtrain, 'train'), (dval,'eval')]

params = {
    'learning_rate': 0.1,
    'objective': 'reg:squarederror',
    'metric':'mae', 
    'seed':42
}
 

model = xgb.train( params, dtrain, 800, evals=wlist, verbose_eval=20, early_stopping_rounds=100)

Parameters: { "metric" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:1181.85229	eval-rmse:1309.06726
[20]	train-rmse:301.33563	eval-rmse:408.91522
[40]	train-rmse:215.21423	eval-rmse:302.80649
[60]	train-rmse:196.05160	eval-rmse:279.65628
[80]	train-rmse:186.16161	eval-rmse:269.39755
[100]	train-rmse:180.14180	eval-rmse:265.21967
[120]	train-rmse:175.67668	eval-rmse:262.43545
[140]	train-rmse:171.75839	eval-rmse:262.24088
[160]	train-rmse:167.87944	eval-rmse:261.69229
[180]	train-rmse:163.57802	eval-rmse:260.81625
[200]	train-rmse:159.45673	eval-rmse:260.52603
[220]	train-rmse:156.87802	eval-rmse:260.12982
[240]	train-rmse:153.54976	eval-rmse:259.73215
[260]	train-rmse:150.07008	eval-rmse:260.05292
[280]	train-rmse:147.23982	eval-rmse:260.18890
[300]	train-rms

# 추론 및 결과 제출

In [14]:
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/sample_submission.csv')

In [15]:
test.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [16]:
submission.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [17]:
test['일자'] = test['일자|시간|구분'].str.split(' ').str[0]
test['시간'] = test['일자|시간|구분'].str.split(' ').str[1].astype(int)
test['구분'] = test['일자|시간|구분'].str.split(' ').str[2]

In [18]:
test['일자'] = pd.to_datetime(test['일자'])
test['year'] = test['일자'].dt.year
test['month'] = test['일자'].dt.month
test['day'] = test['일자'].dt.day
test['weekday'] = test['일자'].dt.weekday

In [19]:
test['구분'] = test['구분'].map(d_map)

In [20]:
test_x = test[features]

In [21]:
test_x

Unnamed: 0,구분,month,day,weekday,시간
0,0,1,1,1,1
1,0,1,1,1,2
2,0,1,1,1,3
3,0,1,1,1,4
4,0,1,1,1,5
...,...,...,...,...,...
15115,6,3,31,6,20
15116,6,3,31,6,21
15117,6,3,31,6,22
15118,6,3,31,6,23


In [23]:
test_x = xgb.DMatrix(test_x)
preds = model.predict(test_x)

In [24]:
submission['공급량'] = preds

In [25]:
submission.to_csv('xgbresult.csv', index=False)