In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [42]:
oil_cpi = pd.read_csv('data/merge_oil_cpi/merge_oil_cpi_daily.csv')
price = pd.read_csv('data/price/price_baechoo.csv')
transaction = pd.read_csv('data/transaction_supply/transaction_supply_baechoo.csv')

oil_cpi.rename(columns={oil_cpi.columns[0] : "거래년월일"}, inplace=True)
price.rename(columns={price.columns[0] : "거래년월일"}, inplace=True)

merge = pd.merge(oil_cpi, price, on='거래년월일', how='outer')
merge = pd.merge(merge, transaction, on='거래년월일', how='outer')
# merge 의 2022-12월 데이터 삭제
merge = merge[merge['거래년월일'] < '2022-12-01']

merge = merge.fillna(method='ffill')
merge = merge.fillna(method='bfill')

merge.set_index("거래년월일",inplace=True)

In [43]:
merge

Unnamed: 0_level_0,자동차용 경유 가격 (원),전월비(%),평균,거래량
거래년월일,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01,583.35,4.0,5600.0,102285.0
2000-01-02,583.35,4.0,5600.0,102285.0
2000-01-03,583.35,4.0,5600.0,102285.0
2000-01-04,583.35,4.0,5600.0,715338.0
2000-01-05,583.35,4.0,5620.0,340568.0
...,...,...,...,...
2022-11-26,1879.15,-8.0,6100.0,296250.0
2022-11-27,1879.15,-8.0,6100.0,296250.0
2022-11-28,1879.15,-8.0,5664.0,201478.0
2022-11-29,1879.15,-8.0,5592.0,77130.0


In [54]:
price= pd.read_csv('data/price/price_baechoo.csv')
price["구분"] = pd.to_datetime(price["구분"],format="%Y-%m-%d")
price_train= price[price["구분"] < '2022-12-01']
price_train

Unnamed: 0,구분,평균
0,2000-01-03,5600.0
1,2000-01-04,5600.0
2,2000-01-05,5620.0
3,2000-01-06,5700.0
4,2000-01-07,5670.0
...,...,...
5877,2022-11-24,6100.0
5878,2022-11-25,6100.0
5879,2022-11-28,5664.0
5880,2022-11-29,5592.0


In [60]:
price_test= price[price["구분"] >= '2022-12-01']
price_test

Unnamed: 0,구분,평균
47,2029-02-01,7430.0
2252,2029-02-01,4110.0
3254,2029-02-01,6600.0
4219,2029-02-01,9400.0
5882,2022-12-01,5462.0
5883,2022-12-05,5982.0
5884,2022-12-06,5502.0
5885,2022-12-07,5488.0
5886,2022-12-08,5408.0
5887,2022-12-09,5408.0


In [59]:
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm

model= ARIMA(price_train['평균'], order=(1,2,0))
model_fit= model.fit(trend='nc', full_output= True, disp=True)
print(model_fit.summary())

NotImplementedError: 
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been removed in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and statsmodels.tsa.SARIMAX.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained. It also offers alternative specialized
parameter estimators.


In [26]:
feature_cols= ['자동차용 경유 가격 (원)', '전월비(%)', '거래량']
label_col= ['평균']

X= merge[feature_cols]
y= merge[label_col]

scaler= MinMaxScaler()
merge[feature_cols]= scaler.fit_transform(X)
merge


Unnamed: 0_level_0,자동차용 경유 가격 (원),전월비(%),평균,거래량
거래년월일,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01,0.022324,0.540453,5600.0,0.028243
2000-01-02,0.022324,0.540453,5600.0,0.028243
2000-01-03,0.022324,0.540453,5600.0,0.028243
2000-01-04,0.022324,0.540453,5600.0,0.197620
2000-01-05,0.022324,0.540453,5620.0,0.094077
...,...,...,...,...
2022-11-26,0.863720,0.152104,6100.0,0.081833
2022-11-27,0.863720,0.152104,6100.0,0.081833
2022-11-28,0.863720,0.152104,5664.0,0.055649
2022-11-29,0.863720,0.152104,5592.0,0.021293


In [27]:
TEST_SIZE = 200
WINDOW_SIZE = 20

train = merge[:-TEST_SIZE]
test = merge[-TEST_SIZE:]

In [30]:
def make_dataset(data, label, window_size=20):
    feature_list = []
    label_list = []
    for i in range(len(data) - window_size):
        feature_list.append(np.array(data.iloc[i:i+window_size]))
        label_list.append(np.array(label.iloc[i+window_size]))
    return np.array(feature_list), np.array(label_list)

In [31]:
from sklearn.model_selection import train_test_split

train_feature= train[feature_cols]
train_label= train[label_col]

train_feature, train_label = make_dataset(train_feature, train_label, 20)

x_train, x_valid, y_train, y_valid = train_test_split(train_feature, train_label, test_size=0.2)
x_train.shape, x_valid.shape

((6520, 20, 3), (1630, 20, 3))

In [33]:
test_feature = test[feature_cols]
test_label = test[label_col]

test_feature.shape, test_label.shape

((200, 3), (200, 1))

In [34]:
test_feature, test_label = make_dataset(test_feature, test_label, 20)
test_feature.shape, test_label.shape

((180, 20, 3), (180, 1))

In [35]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM

model = Sequential()
model.add(LSTM(16, 
               input_shape=(train_feature.shape[1], train_feature.shape[2]), 
               activation='relu', 
               return_sequences=False)
          )

model.add(Dense(1))

In [37]:
! pip install h5py



In [40]:
import os

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit

Epoch 1/200
Epoch 00001: val_loss improved from inf to 11675166.00000, saving model to model\tmp_checkpoint.h5


ImportError: `save_model()` using h5 format requires h5py. Could not import h5py.

In [10]:
from sklearn.model_selection import train_test_split

feature_cols= ['자동차용 경유 가격 (원)', '전월비(%)', '거래량']
label_col= ['평균']

X= df[feature_cols]
y= df[label_col]

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)

In [12]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM

In [13]:
model = Sequential()
model.add(LSTM(16, 
               input_shape=(X_train.shape[1], 1), 
               activation='relu', 
               return_sequences=False)
          )

model.add(Dense(1))

In [16]:
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=50, batch_size=150, verbose=0)

<keras.callbacks.History at 0x1c6c050d910>

In [17]:
pred= model.predict(X_test)