In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [10]:
# Load the hourly supply history. The file is cp949-encoded, so the encoding
# has to be passed explicitly for the Korean column names to decode.
supply_csv_path = './data/한국가스공사_시간별 공급량_20181231.csv'
total = pd.read_csv(supply_csv_path, encoding='cp949')

In [12]:
# Peek at the first rows right after loading.
total.head()

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,A,2497.129
1,2013-01-01,2,A,2363.265
2,2013-01-01,3,A,2258.505
3,2013-01-01,4,A,2243.969
4,2013-01-01,5,A,2344.105


In [13]:
# Summarise column dtypes and non-null counts of the raw frame.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   연월일     368088 non-null  object 
 1   시간      368088 non-null  int64  
 2   구분      368088 non-null  object 
 3   공급량     368088 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 11.2+ MB


In [14]:
# List the distinct values of the '구분' column before it is integer-encoded.
total['구분'].unique()

array(['A', 'B', 'C', 'D', 'E', 'G', 'H'], dtype=object)

In [15]:
# Integer-encode '구분': every distinct value gets its position in order of
# first appearance. d_map stays at notebook scope because the test set is
# encoded with the same mapping later on.
# NOTE: the column is overwritten in place, so re-running this cell would
# rebuild d_map from the already-encoded integers instead of the original codes.
d_map = {code: idx for idx, code in enumerate(total['구분'].unique())}
total['구분'] = total['구분'].map(d_map)

In [16]:
# Preview again after '구분' was replaced by its integer codes.
total.head()

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,0,2497.129
1,2013-01-01,2,0,2363.265
2,2013-01-01,3,0,2258.505
3,2013-01-01,4,0,2243.969
4,2013-01-01,5,0,2344.105


In [17]:
# Re-check dtypes after the in-place mapping of '구분'.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   연월일     368088 non-null  object 
 1   시간      368088 non-null  int64  
 2   구분      368088 non-null  int64  
 3   공급량     368088 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 11.2+ MB


In [18]:
# Parse the '연월일' strings into datetimes so the .dt accessor can be used
# on the column when deriving calendar features.
total['연월일'] = pd.to_datetime(total['연월일'])

In [19]:
# Preview again after the datetime conversion of '연월일'.
total.head()

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,0,2497.129
1,2013-01-01,2,0,2363.265
2,2013-01-01,3,0,2258.505
3,2013-01-01,4,0,2243.969
4,2013-01-01,5,0,2344.105


In [20]:
# Re-check dtypes after the datetime conversion of '연월일'.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   연월일     368088 non-null  datetime64[ns]
 1   시간      368088 non-null  int64         
 2   구분      368088 non-null  int64         
 3   공급량     368088 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 11.2 MB


In [21]:
# Derive calendar features from the parsed date column. Each new column is
# named after the .dt attribute it is read from (weekday: Monday == 0).
date_accessor = total['연월일'].dt
for part in ('year', 'month', 'day', 'weekday'):
    total[part] = getattr(date_accessor, part)

In [40]:
# Show the first 60 rows, now including the year/month/day/weekday columns.
total.head(60)

Unnamed: 0,연월일,시간,구분,공급량,year,month,day,weekday
0,2013-01-01,1,0,2497.129,2013,1,1,1
1,2013-01-01,2,0,2363.265,2013,1,1,1
2,2013-01-01,3,0,2258.505,2013,1,1,1
3,2013-01-01,4,0,2243.969,2013,1,1,1
4,2013-01-01,5,0,2344.105,2013,1,1,1
5,2013-01-01,6,0,2390.961,2013,1,1,1
6,2013-01-01,7,0,2378.457,2013,1,1,1
7,2013-01-01,8,0,2518.921,2013,1,1,1
8,2013-01-01,9,0,2706.481,2013,1,1,1
9,2013-01-01,10,0,2832.057,2013,1,1,1


In [23]:
# Time-based hold-out: the years 2013 through 2017 are used for fitting,
# 2018 is kept aside for validation.
train_years = list(range(2013, 2018))
val_years = [2018]

In [24]:
# Split rows by calendar year using boolean masks.
is_train_year = total['year'].isin(train_years)
is_val_year = total['year'].isin(val_years)

train = total[is_train_year]
val = total[is_val_year]

In [25]:
# Model inputs and target. 'year' is left out of the feature list on purpose
# or by omission -- the list below is what the model is fitted on.
features = ['구분', 'month', 'day', 'weekday', '시간']
target = '공급량'

train_x, train_y = train[features], train[target]
val_x, val_y = val[features], val[target]

In [26]:
# Wrap the frames in LightGBM datasets. The validation set is only used for
# metric monitoring and early stopping.
d_train = lgb.Dataset(train_x, train_y)
d_val = lgb.Dataset(val_x, val_y)

params = {
    'objective': 'regression',
    'metric': 'mae',
    'seed': 42
}

# The `verbose_eval=` / `early_stopping_rounds=` keyword arguments were
# deprecated in LightGBM 3.3 and removed from lgb.train() in 4.0, where passing
# them raises a TypeError. The callbacks below are the supported equivalent:
# stop once the validation MAE has not improved for 10 rounds, and log the
# metric every 20 rounds.
model = lgb.train(
    params,
    d_train,
    num_boost_round=500,
    valid_sets=[d_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=20),
    ],
)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 306768, number of used features: 5
[LightGBM] [Info] Start training from score 934.864036
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l1: 244.857
[40]	valid_0's l1: 174.855
[60]	valid_0's l1: 158.739
[80]	valid_0's l1: 153.323
[100]	valid_0's l1: 150.948
[120]	valid_0's l1: 150.463
Early stopping, best iteration is:
[112]	valid_0's l1: 150.297


In [27]:
# Load the rows to predict and the submission template they are written into.
test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/test.csv', './data/sample_submission.csv')
)

In [28]:
# Each key holds three space-separated pieces (e.g. '2019-01-01 01 A'):
# split once and pick date, hour and category by position.
key_parts = test['일자|시간|구분'].str.split(' ')
test['일자'] = key_parts.str[0]
test['시간'] = key_parts.str[1].astype(int)
test['구분'] = key_parts.str[2]

In [29]:
# Parse the date strings, then derive the same calendar columns that were
# built for the training frame (weekday: Monday == 0).
test['일자'] = pd.to_datetime(test['일자'])
test_date_accessor = test['일자'].dt
for part in ('year', 'month', 'day', 'weekday'):
    test[part] = getattr(test_date_accessor, part)

In [30]:
# Display the test frame with the parsed key pieces and calendar columns.
test

Unnamed: 0,일자|시간|구분,일자,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,A,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,A,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,A,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,A,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,A,2019,1,1,1
...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,2019-03-31,20,H,2019,3,31,6
15116,2019-03-31 21 H,2019-03-31,21,H,2019,3,31,6
15117,2019-03-31 22 H,2019-03-31,22,H,2019,3,31,6
15118,2019-03-31 23 H,2019-03-31,23,H,2019,3,31,6


In [31]:
# Encode '구분' with the mapping built from the training data (d_map).
# Series.map yields NaN for every value that is not a key of d_map -- e.g. a
# code that never occurred in the training data, or this cell being run a
# second time on already-encoded integers. LightGBM accepts NaN inputs without
# complaint, so the predictions would be silently wrong; fail loudly instead.
encoded = test['구분'].map(d_map)
unmapped = test.loc[encoded.isna(), '구분'].unique()
if len(unmapped) > 0:
    raise ValueError(f"'구분' values missing from d_map: {unmapped.tolist()}")
test['구분'] = encoded

In [32]:
# Display the test frame again after '구분' was mapped through d_map.
test

Unnamed: 0,일자|시간|구분,일자,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1
...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,2019-03-31,20,6,2019,3,31,6
15116,2019-03-31 21 H,2019-03-31,21,6,2019,3,31,6
15117,2019-03-31 22 H,2019-03-31,22,6,2019,3,31,6
15118,2019-03-31 23 H,2019-03-31,23,6,2019,3,31,6


In [33]:
# Select the same feature columns, in the same order, that train_x was built from.
test_x = test[features]

In [34]:
# Display the feature matrix that will be passed to the model.
test_x

Unnamed: 0,구분,month,day,weekday,시간
0,0,1,1,1,1
1,0,1,1,1,2
2,0,1,1,1,3
3,0,1,1,1,4
4,0,1,1,1,5
...,...,...,...,...,...
15115,6,3,31,6,20
15116,6,3,31,6,21
15117,6,3,31,6,22
15118,6,3,31,6,23


In [35]:
# Predict the target for the test rows.
# NOTE(review): per the LightGBM docs, predict() defaults to the best iteration
# when early stopping recorded one -- confirm for the installed version.
preds = model.predict(test_x)

In [36]:
# Show the predicted values.
preds

array([2021.90989755, 1868.18592847, 1795.53044693, ...,  406.09858883,
        344.83381381,  328.48504601])

In [37]:
# Write the predictions into the '공급량' column of the submission template.
# The array is assigned positionally, which assumes the submission rows are in
# the same order as the rows of test -- TODO confirm.
submission['공급량'] = preds

In [39]:
# Save the filled-in submission; index=False keeps the row index out of the file.
submission.to_csv('./data/baseline.csv', index=False)