In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, Activation
# from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau
import datetime

ModuleNotFoundError: No module named 'keras'

#  Data load

In [None]:
# Read the raw target, weather and hourly-SMP tables. Each untouched original is
# kept next to a working copy so the raw data can be recovered without re-reading.
target_original = pd.read_csv('data/first_data/target_v1.csv')
target = target_original.copy()

weather_original = pd.read_csv('data/first_data/weather_v1.csv')
weather = weather_original.copy()

hourly_smp_original = pd.read_csv('data/first_data/hourly_smp_v1.csv')
hourly_smp = hourly_smp_original.copy()

In [None]:
# Read the three raw oil price tables (cl, du, brt), again keeping an untouched
# original beside each working copy.
oil_price_cl_original = pd.read_csv('data/oil/oil_price_cl.csv')
oil_price_cl = oil_price_cl_original.copy()

oil_price_du_original = pd.read_csv('data/oil/oil_price_du.csv')
oil_price_du = oil_price_du_original.copy()

oil_price_brt_original = pd.read_csv('data/oil/oil_price_brt.csv')
oil_price_brt = oil_price_brt_original.copy()

#  Preprocessing

### SMP and oil price data

In [None]:
def date_time_split(data) :
    """Replace the ``date`` column with separate calendar component columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``date`` and with ``year``, ``month``, ``day``,
        ``weekday`` (Monday = 0) and ``hour`` columns, in that order, after the
        remaining original columns. The frame passed in is not modified.
    """
    # Parse into a local Series instead of writing back into `data`, so the
    # caller's frame (often a filtered slice) is never mutated as a side effect.
    timestamps = pd.to_datetime(data['date'])

    return data.drop(['date'], axis = 1).assign(
        year = timestamps.dt.year,
        month = timestamps.dt.month,
        day = timestamps.dt.day,
        weekday = timestamps.dt.weekday,
        hour = timestamps.dt.hour,
    )

In [None]:
# Expand `date` into calendar columns, then discard the last column of the result
# (date_time_split appends `hour` last).
target = date_time_split(target).iloc[:, :-1]

In [None]:
def time_data(data, start, end) :
    """Build a calendar table with one row per day from ``start`` to ``end``.

    ``data`` is accepted but not used by this function. The result is whatever
    ``date_time_split`` returns for a frame whose ``date`` column holds the
    daily range.
    """
    # One timestamp per calendar day between the two bounds
    calendar = pd.DataFrame({'date' : pd.date_range(start = start, end = end)})

    # Expand each date into its calendar components
    return date_time_split(calendar)

In [None]:
def make_oil_data(oil,start, end) :
    """Select, order and date-split one oil price table.

    Parameters
    ----------
    oil : pandas.DataFrame
        Raw oil price table; the columns at positions 1 and 2 are kept and the
        kept part must contain a ``date`` column.
    start, end : same type as the ``date`` values
        Inclusive bounds compared against ``date`` with ``>=`` / ``<=``.
        NOTE(review): if ``date`` holds strings this comparison is lexicographic,
        which only matches calendar order for ISO-style dates - confirm.

    Returns
    -------
    pandas.DataFrame
        The rows inside the range, sorted by date, re-indexed from 0 and passed
        through ``date_time_split``.
    """
    # Keep only the columns needed from the price table (date and price)
    oil = oil.iloc[:, 1:3]

    # Restrict to the requested period. Per the original author's note, oil prices
    # are taken to affect SMP about three months later, hence the earlier range.
    in_range = (oil['date'] >= start) & (oil['date'] <= end)

    # Sort chronologically and restart the index at 0 (the later column-wise concat
    # relies on matching indices). Done as a non-inplace chain so no in-place
    # operation runs on a filtered slice of the caller's frame.
    oil = (
        oil[in_range]
        .sort_values(by = ['date'])
        .reset_index(drop = True)
    )

    return date_time_split(oil)

In [None]:
def merge_oil(oil_1, oil_2, oil_3, time) :
    """Attach three oil price tables to a calendar table and return the prices.

    Each table is outer-merged onto ``time`` in turn on their shared columns.
    Missing ``du_price`` / ``cl_price`` / ``brt_price`` values are replaced by the
    mean of their column, and only the last three columns of the merged frame
    are returned.
    """
    # Outer-merge the tables one after another, starting from the calendar.
    # Days with no quote (e.g. weekends) end up as nulls at this point.
    merged = time
    for price_table in (oil_1, oil_2, oil_3) :
        merged = pd.merge(left = merged, right = price_table, how = 'outer')

    # Fill the gaps with each price column's own mean
    column_means = {
        'du_price' : merged.du_price.mean(),
        'cl_price' : merged.cl_price.mean(),
        'brt_price' : merged.brt_price.mean(),
    }
    merged = merged.fillna(value = column_means)

    # The calendar columns are no longer needed: keep the trailing three columns
    return merged.iloc[ : , -3: ]

In [None]:
# Give the three raw oil tables the same (place, date, <price>) header, with a
# price column name that identifies the source table.
oil_price_cl.columns, oil_price_du.columns, oil_price_brt.columns = (
    ['place', 'date', price_name]
    for price_name in ('cl_price', 'du_price', 'brt_price')
)

# Inclusive date bounds used when sampling the oil tables
start, end = '2017-11-02', '2019-11-01'

In [None]:
# Prepare each oil table for the [start, end] period
oil_cl, oil_du, oil_brt = (
    make_oil_data(raw_prices, start, end)
    for raw_prices in (oil_price_cl, oil_price_du, oil_price_brt)
)

# Calendar table covering the same period (time_data ignores its first argument)
time = time_data(oil_cl, start, end)

In [None]:
# Combine the three price tables on the calendar; only the price columns come back
oil = merge_oil(oil_1 = oil_cl, oil_2 = oil_du, oil_3 = oil_brt, time = time)

In [None]:
# Append the oil price columns to the target frame side by side (rows are matched
# by index label)
target = pd.concat(objs = [target, oil], axis = 'columns')
target

### temperature data

In [None]:
def temp_preprocessing(data) :
    """Reduce the weather table to a date-split temperature series for area 884.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw weather table with an ``area`` column; the columns at positions 1 and 2
        are kept and renamed to ``date`` and ``temp``.

    Returns
    -------
    pandas.DataFrame
        Columns ``temp, year, month, day, weekday, hour``, with two extra rows
        added for 2018-02-01 00:00 and 2019-02-01 00:00, sorted chronologically
        and re-indexed from 0.
    """
    # Keep only area 884 and drop every column except date and temperature.
    # Done as one non-inplace chain so nothing is modified in place on a filtered
    # slice of the caller's frame.
    data = data[data['area'] == 884].iloc[:, 1:3].reset_index(drop = True)
    data.columns = ['date', 'temp']

    # Split the date into year, month, day, weekday and hour
    data = date_time_split(data)

    # Per the original author's note, the 00:00 readings for 2018-02-01 and
    # 2019-02-01 are missing from the data. Both are filled with the mean of all
    # February temperatures, computed once and rounded to one decimal.
    february_mean = round(data[data['month'] == 2]['temp'].mean(), 1)

    # Row layout follows data.columns: temp, year, month, day, weekday, hour
    # (weekday 3 = Thursday, 4 = Friday)
    missing_rows = pd.DataFrame(
        [
            [february_mean, 2018, 2, 1, 3, 0],
            [february_mean, 2019, 2, 1, 4, 0],
        ],
        columns = data.columns,
    )

    # Put the filled rows back into chronological position
    data = (
        pd.concat([data, missing_rows])
        .sort_values(by = ['year', 'month', 'day', 'weekday', 'hour'])
        .reset_index(drop = True)
    )

    return data

In [None]:
def make_daily_temp(data, window_size, daily_size, start = '2018-02-01') :
    """Average consecutive blocks of temperature readings into one value per day.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``temp`` column, ordered so that every ``window_size``
        consecutive rows belong to one day.
    window_size : int
        Number of rows per day.
    daily_size : int
        Number of days to produce.
    start : str, optional
        Calendar date assigned to the first block; later blocks get the
        following consecutive days. The default with ``daily_size = 730``
        covers 2018-02-01 through 2020-01-31.

    Returns
    -------
    pandas.DataFrame
        ``daily_size`` rows with columns ``year, month, day, weekday, temp``,
        where ``temp`` is the block mean rounded to one decimal.
    """
    hourly_temp = data['temp']

    # Mean of each day's block. The slice starts at window_size * day (no +1) so
    # the first reading of every day is part of its average.
    mean_temp = [
        round(hourly_temp.iloc[window_size * day : window_size * (day + 1)].mean(), 1)
        for day in range(daily_size)
    ]

    # Calendar columns for the same number of days, so the frame length always
    # matches the number of means whatever daily_size is.
    days = pd.date_range(start = start, periods = daily_size)
    period = pd.DataFrame({
        'year' : days.year,
        'month' : days.month,
        'day' : days.day,
        'weekday' : days.weekday,
    })

    period['temp'] = mean_temp

    return period

In [None]:
# 24 rows per daily window, 730 daily windows
window_size, daily_size = 24, 730

# Clean the raw weather table, then collapse it to one mean temperature per day
weather = make_daily_temp(
    data = temp_preprocessing(weather),
    window_size = window_size,
    daily_size = daily_size,
)
weather.head()

### Combine the temperature and SMP/oil data into the training set

In [None]:
# Join the daily temperatures onto the SMP/oil rows by calendar day
# (pd.merge defaults to an inner join, so only days present in both remain).
train = pd.merge(left = weather, right = target, on = ['year', 'month', 'day', 'weekday'])

# Assemble a datetime column directly from the year/month/day columns. This avoids
# a string round trip and no longer overwrites the `time` calendar frame that was
# built earlier with a Series of strings.
train['date'] = pd.to_datetime(train[['year', 'month', 'day']])

# Fix the column order used from here on
train = train[['date', 'year', 'month', 'day', 'weekday', 'temp', 'cl_price', 'du_price', 'brt_price', 'smp_min', 'smp_max', 'smp_mean', 'supply']]

In [None]:
# Display the assembled training frame for inspection
train

# Create windows 

In [None]:
# Pull the smp_mean column out as a plain array; the windows are built from it
smp_mean = train.loc[:, 'smp_mean'].values

In [None]:
seq_len = 50
sequence_length = seq_len + 1

# Slide a window of `sequence_length` consecutive values over the series, one
# step at a time. The last valid start position is len(smp_mean) - sequence_length,
# so the range needs the +1; without it the final full window is dropped.
result = [
    smp_mean[first : first + sequence_length]
    for first in range(len(smp_mean) - sequence_length + 1)
]

# Normalize data

In [None]:
# Express every value as its relative change from the first value of its own
# window: value / window[0] - 1 (so position 0 of each window becomes 0).
normalized_data = [
    [(float(value) / float(values[0])) - 1 for value in values]
    for values in result
]

result = np.array(normalized_data)

# Split data

In [None]:
# Split into train and test: the first 90% of the windows are used for training,
# the remaining 10% are held out for testing.
row = int(round(result.shape[0] * 0.9))

# Shuffle a copy under its own name. Shuffling a slice of `result` would reorder
# `result` itself, and reusing the name `train` would overwrite the DataFrame that
# the cell extracting `smp_mean` reads from, breaking a re-run of that cell.
train_windows = result[:row, :].copy()

# Fixed seed so the shuffled order (and thus the training run) is repeatable
np.random.default_rng(42).shuffle(train_windows)

# Inputs are positions 1..8 of each window, reshaped to (samples, 8, 1).
# The target is position 11.
x_train = train_windows[:, 1:9]
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
y_train = train_windows[:, 11]

# Test windows keep their original order
x_test = result[row:, 1:9]
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
y_test = result[row:, 11]

x_train.shape, x_test.shape

# Build model

In [None]:
# Two stacked LSTM layers followed by a single linear output unit.
# input_shape=(8, 1) matches the (samples, 8, 1) arrays built in the split cell.
model = Sequential([
    LSTM(8, return_sequences=True, input_shape=(8, 1)),
    LSTM(64, return_sequences=False),
    Dense(1, activation='linear'),
])

# Mean squared error loss, optimised with RMSprop
model.compile(optimizer='rmsprop', loss='mse')

model.summary()
____________________

# Training

In [None]:
# Train for 20 epochs in batches of 10.
# NOTE(review): the held-out test windows are also used as the validation data.
model.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    batch_size=10,
    validation_data=(x_test, y_test),
)

# Prediction

In [None]:
# Predict the held-out windows and plot the predictions against the true targets
pred = model.predict(x_test)

fig, ax = plt.subplots(figsize=(20, 10), facecolor='white')
ax.plot(y_test, label='True')
ax.plot(pred, label='Prediction')
ax.legend()
plt.show()