In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mglearn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Load the hourly gas-supply history; the file is decoded as cp949.
total = pd.read_csv(
    filepath_or_buffer='./data/한국가스공사_시간별 공급량_20181231.csv',
    encoding='cp949',
)

In [3]:
# Display the loaded frame; for a long frame pandas renders only the first and
# last few rows.
total

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,A,2497.129
1,2013-01-01,2,A,2363.265
2,2013-01-01,3,A,2258.505
3,2013-01-01,4,A,2243.969
4,2013-01-01,5,A,2344.105
...,...,...,...,...
368083,2018-12-31,20,H,681.033
368084,2018-12-31,21,H,669.961
368085,2018-12-31,22,H,657.941
368086,2018-12-31,23,H,610.953


In [4]:
# Summarise column dtypes, non-null counts and memory usage.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   연월일     368088 non-null  object 
 1   시간      368088 non-null  int64  
 2   구분      368088 non-null  object 
 3   공급량     368088 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 11.2+ MB


In [5]:
# List the distinct labels found in the 구분 column.
total['구분'].unique()

array(['A', 'B', 'C', 'D', 'E', 'G', 'H'], dtype=object)

In [6]:
# Give every distinct 구분 label an integer id (0, 1, 2, ...) in the order
# unique() returns them, then replace the labels in the column with those ids.
# NOTE(review): this overwrites the column in place, so re-running the cell on
# the already-encoded column rebuilds d_map with integer keys.
d_map = {label: code for code, label in enumerate(total['구분'].unique())}
total['구분'] = total['구분'].map(d_map)

In [7]:
# Inspect the label -> integer mapping built in the previous cell.
d_map

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'G': 5, 'H': 6}

In [9]:
# Convert the 연월일 column to datetimes so the .dt accessor can be used on it.
total['연월일'] = pd.to_datetime(total['연월일'])

In [10]:
# Derive calendar columns (year, month, day, weekday) from the parsed 연월일
# column; each new column takes the .dt attribute of the same name.
total = total.assign(
    **{
        part: getattr(total['연월일'].dt, part)
        for part in ('year', 'month', 'day', 'weekday')
    }
)

In [18]:
# Years used for fitting (2013 through 2017) and the single hold-out year.
train_years = list(range(2013, 2018))
test_years = [2018]

In [19]:
# Time-based split: rows whose year is in train_years are used for fitting,
# rows whose year is in test_years are held out.
# NOTE: the name `test` is rebound further down when ./data/test.csv is loaded.
train = total.loc[total['year'].isin(train_years)]
test = total.loc[total['year'].isin(test_years)]

In [22]:
# Model inputs and target. 구분 enters as its integer code and `year` is not
# among the inputs; 공급량 is the value being predicted.
# NOTE(review): a linear model treats the integer-coded 구분 as an ordered
# quantity — confirm that is intended.
features = ['구분', 'month', 'day', 'weekday', '시간']
X_train, y_train = train[features], train['공급량']
X_test, y_test = test[features], test['공급량']

In [23]:
# Fit a linear regression on the training-year rows.
model = LinearRegression().fit(X_train, y_train)

In [24]:
# Predict 공급량 for the held-out rows.
pred = model.predict(X_test)

In [25]:
# Peek at the hold-out predictions (numpy abbreviates long arrays).
pred

array([ 990.61106143, 1003.84760103, 1017.08414063, ...,  967.57343791,
        980.80997751,  994.04651711])

In [33]:
# Score the model on the hold-out set (for scikit-learn regressors, score() is
# R^2). The recorded value of about 0.036 means these features explain almost
# none of the variance in 공급량.
# NOTE(review): this cell's execution count (33) is higher than the cells that
# follow it, i.e. it was run out of order. X_test / y_test were bound before
# `test` is reassigned below, so the result still refers to the 2018 hold-out.
model.score(X_test, y_test)

0.03595336887654166

In [26]:
# Load the rows to predict and the submission template.
# NOTE: this rebinds `test`, which previously held the 2018 hold-out frame.
test = pd.read_csv(filepath_or_buffer='./data/test.csv')
submission = pd.read_csv(filepath_or_buffer='./data/sample_submission.csv')

In [27]:
# The 일자|시간|구분 column packs three space-separated fields into one string
# (e.g. "2019-01-01 01 A"). Split it once and reuse the pieces instead of
# re-splitting the whole column for every field.
key_parts = test['일자|시간|구분'].str.split(' ', expand=True)
test['일자'] = key_parts[0]               # date string, parsed in the next cell
test['시간'] = key_parts[1].astype(int)   # hour, e.g. "01" -> 1
test['구분'] = key_parts[2]               # category label, still a string here

In [28]:
# Parse 일자 into datetimes, then derive the same calendar columns that were
# added to the training frame (year, month, day, weekday).
test['일자'] = pd.to_datetime(test['일자'])
test = test.assign(
    year=test['일자'].dt.year,
    month=test['일자'].dt.month,
    day=test['일자'].dt.day,
    weekday=test['일자'].dt.weekday,
)

In [29]:
# Encode 구분 with the mapping built from the training data (d_map) so the ids
# match what the model was fitted on. Series.map yields NaN for any label that
# is not a key of d_map (including when this cell is re-run on an already
# encoded column), so fail here with a clear message instead of passing NaN on
# to the model.
encoded = test['구분'].map(d_map)
if encoded.isna().any():
    raise ValueError(
        f"구분 labels missing from d_map: {test.loc[encoded.isna(), '구분'].unique().tolist()}"
    )
test['구분'] = encoded

In [30]:
# Display the prepared submission inputs to check the derived columns.
test

Unnamed: 0,일자|시간|구분,일자,시간,구분,year,month,day,weekday
0,2019-01-01 01 A,2019-01-01,1,0,2019,1,1,1
1,2019-01-01 02 A,2019-01-01,2,0,2019,1,1,1
2,2019-01-01 03 A,2019-01-01,3,0,2019,1,1,1
3,2019-01-01 04 A,2019-01-01,4,0,2019,1,1,1
4,2019-01-01 05 A,2019-01-01,5,0,2019,1,1,1
...,...,...,...,...,...,...,...,...
15115,2019-03-31 20 H,2019-03-31,20,6,2019,3,31,6
15116,2019-03-31 21 H,2019-03-31,21,6,2019,3,31,6
15117,2019-03-31 22 H,2019-03-31,22,6,2019,3,31,6
15118,2019-03-31 23 H,2019-03-31,23,6,2019,3,31,6


In [31]:
# Select the model's input columns, in the order given by `features`.
test_x = test.loc[:, features]

In [32]:
# Predict 공급량 for the submission rows with the fitted model.
preds = model.predict(test_x)

In [34]:
# Peek at the submission predictions (numpy abbreviates long arrays).
preds

array([ 971.76781924,  985.00435884,  998.24089844, ..., 1214.34773285,
       1227.58427245, 1240.82081205])

In [35]:
# Write the predictions into the submission template's 공급량 column.
# The array is assigned by position, so this assumes sample_submission.csv and
# test.csv list their rows in the same order — TODO confirm.
submission['공급량'] = preds

In [36]:
# Save the filled-in submission; index=False keeps the row index out of the file.
submission.to_csv(
    path_or_buf='./data/gas_linear_regression.csv',
    index=False,
)