In [2]:
import glob
import os
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

In [3]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

In [4]:
# gpu 설정
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [5]:
plt.rcParams["font.family"] = 'NanumGothic'
data = pd.read_csv(os.path.join('Data', 'public_data', 'train.csv'))

In [7]:
# 평가식
def nmae(y_true, y_pred):
    score = np.mean(np.abs(y_true - y_pred) / y_true)
    return score

In [8]:
# 정규화
def normalize(data, col):
    
    # 요일은 원핫 인코딩을 적용함, 요일에 대한 의미가 없을 것이라고 판단
    data = pd.concat([data, pd.get_dummies(data['요일'])], axis = 1)
    data = data.drop(['요일'], axis = 1)

    col1 = data.columns[-7:].to_list()
    col2 = data.columns[1:-7].to_list()

    new_col = ['date'] + col1 + col2
    data = data[new_col]
    data = data.drop(['date'], axis = 1)
    # 0 ~ 1 값으로 정규화 진행
    norm = data.iloc[:,8:].max(0)
    data.iloc[:,8:] = data.iloc[:,8:]/norm
    
    train = data.iloc[:, col:col + 2]
    
    return train, norm

In [9]:
# train & test 분리
def load_data(data, window_size = 28, future_size = 28, train_size = 0.9):
        x = []; y = []
        for i in range(len(data) - window_size - future_size):
            x.append(data.iloc[i: i+window_size])
            y.append(data.iloc[i+window_size:i+window_size+future_size, 1]) # 가격만

        x = np.array(x)
        y = np.array(y)

        train_idx = round(len(x) * train_size)

        train_x = x[:train_idx]
        train_y = y[:train_idx]

        valid_x = x[train_idx:]
        valid_y = y[train_idx:]
        
        return train_x, train_y, valid_x, valid_y

In [67]:
# 모델 구축
def build_model():
    # Build the LSTM model
    # return_sequences 이전 정보를 모두 볼지 안볼지 확인하는 옵션
    model = Sequential()
    model.add(LSTM(28, return_sequences = True, input_shape = (train_x.shape[1], 2)))
    model.add(LSTM(100, return_sequences = False))
    model.add(Dense(50))
    model.add(Dense(28, activation = "linear"))

    model.compile(loss='mse', optimizer='adam')
    return model

In [11]:
window_size = 28
submission = pd.read_csv('Data/sample_submission.csv')
public_date_list = submission[submission['예측대상일자'].str.contains('2020')]['예측대상일자'].str.split('+').str[0].unique()

In [56]:
for n, col in enumerate(list(range(7, 49, 2))):

    data = pd.read_csv('Data/public_data/train.csv')
    train, norm = normalize(data, col)
    
    train_x, train_y, valid_x, valid_y = load_data(train)
    
    
    model = build_model()
    model.fit(train_x, train_y, batch_size=1, epochs=200, validation_data=(valid_x, valid_y), verbose=1, callbacks = [early_stop])
    
    y_pred = model.predict(valid_x)
    y_true = valid_y

    
    target_idx = np.where(y_true != 0)
    y_pred = y_pred[target_idx]
    y_true = y_true[target_idx]
    
    print(train.columns[1], '의 NMAE: ', nmae(y_true, y_pred))
    
    ## 실제 test데이터 생성
    for date in public_date_list:
        test_df = pd.read_csv(f'Data/public_data/test_files/test_{date}.csv')
        data = pd.read_csv('Data/public_data/train.csv')
        data = pd.concat([data, test_df]).iloc[-window_size:]

        test, norm = normalize(data, col)
        sub_output = model.predict(test.to_numpy().reshape(1,28,2)) * norm[n*2]

        idx = submission[submission['예측대상일자'].str.contains(date)].index
        submission.loc[idx, train.columns[1]] = sub_output[0,[6,13,27]]
    print(submission.iloc[:20,n+1])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 00003: early stopping
배추_가격(원/kg) 의 NMAE:  0.35132294555570365
0     390.230957
1     364.403931
2     342.680389
3     390.140594
4     364.481262
5     342.919434
6     390.053345
7     364.523590
8     342.902649
9      67.691467
10     35.171719
11     39.881962
12    306.579102
13    241.803589
14    269.923096
15    217.988983
16    166.067825
17    200.219604
18    270.015747
19    220.488174
Name: 배추_가격(원/kg), dtype: float64
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 00011: early stopping
무_가격(원/kg) 의 NMAE:  0.2069515750714774
0      526.810974
1      747.890686
2      715.576111
3      868.790955
4     1095.748901
5      947.009827
6      779.213928
7      980.929138
8      689.042969
9      810.976135
10    1001.792480
11     685.502197
12     930.614136
13    1094.778198
14     768.722961
15    1064.993286
16    1165.519653
17     776.695

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 00010: early stopping
양배추_가격(원/kg) 의 NMAE:  0.31144444384304054
0     667.464661
1     595.618408
2     521.538025
3     767.465576
4     675.718506
5     582.270020
6     693.049683
7     574.401001
8     523.654785
9     508.424591
10    423.436066
11    433.588898
12    608.155701
13    527.474670
14    532.179382
15    458.848694
16    421.334503
17    467.721588
18    139.583496
19     67.821556
Name: 양배추_가격(원/kg), dtype: float64
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 00010: early stopping
깻잎_가격(원/kg) 의 NMAE:  0.21735890576374264
0     4931.467773
1     4934.265625
2     4993.927246
3     4441.870117
4     4643.827637
5     4502.436035
6     3479.239990
7     3535.990967
8     3516.981934
9     3930.993164
10    4105.191406
11    3953.310059
12    5089.709961
13    5447

Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 00012: early stopping
새송이_가격(원/kg) 의 NMAE:  0.10236282236306561
0     1703.947266
1     1837.264648
2     1950.623047
3     1715.614136
4     1878.565063
5     2017.002808
6     1890.024292
7     1742.839966
8     1798.671753
9     1831.972900
10    1710.244019
11    1778.502563
12    2272.988037
13    2068.642090
14    2047.659424
15    1959.421509
16    2050.801514
17    2021.596436
18    -242.365540
19    -277.491882
Name: 새송이_가격(원/kg), dtype: float64
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 00007: early stopping
팽이버섯_가격(원/kg) 의 NMAE:  0.21617142050533206
0     2079.491699
1     1957.802490
2     1831.388672
3     2122.352051
4     2017.009155
5     1888.947266
6     1738.930176
7     1672.158203
8     1507.063354
9     1561.351074
10    1604.481567
11    1511.586548
12    1250.751709
13    1224.234985
14    1295.129272
15    1242.468994
16    1203.172241

Epoch 00009: early stopping
애호박_가격(원/kg) 의 NMAE:  0.24733178037715617
0      625.835449
1      589.242493
2      557.865295
3      699.518616
4      662.040649
5      724.324341
6      401.927460
7      388.613068
8      503.718597
9      137.729507
10     214.355713
11     219.783966
12     735.790527
13     756.498413
14     643.461914
15    1029.107544
16     806.583984
17     966.665222
18     -20.101923
19    -149.187256
Name: 애호박_가격(원/kg), dtype: float64
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 00012: early stopping
캠벨얼리_가격(원/kg) 의 NMAE:  0.23981474462957775
0     808.666138
1     630.522705
2     440.841919
3     815.783630
4     644.483154
5     447.463379
6     830.111816
7     637.933777
8     427.350708
9     435.823914
10    319.255615
11    206.254288
12    534.227051
13    415.478577
14    350.658875
15    478.512054
16    429.803497
17    358.523926
18    419.8

In [66]:
submission.to_csv('result/02_LSTM.csv', index = False)

In [65]:
submission

Unnamed: 0,예측대상일자,배추_가격(원/kg),무_가격(원/kg),양파_가격(원/kg),건고추_가격(원/kg),마늘_가격(원/kg),대파_가격(원/kg),얼갈이배추_가격(원/kg),양배추_가격(원/kg),깻잎_가격(원/kg),...,당근_가격(원/kg),파프리카_가격(원/kg),새송이_가격(원/kg),팽이버섯_가격(원/kg),토마토_가격(원/kg),청상추_가격(원/kg),백다다기_가격(원/kg),애호박_가격(원/kg),캠벨얼리_가격(원/kg),샤인마스캇_가격(원/kg)
0,2020-09-29+1week,390.230957,526.810974,631.129883,3848.074219,4546.910156,1825.984253,953.710022,667.464661,4931.467773,...,1309.033203,3552.940918,1703.947266,2079.491699,2355.599121,2645.181152,668.601929,625.835449,808.666138,4737.706543
1,2020-09-29+2week,364.403931,747.890686,582.398682,3848.877930,5370.061523,1613.535889,924.436462,595.618408,4934.265625,...,1332.410156,3461.895508,1837.264648,1957.802490,2457.753662,2184.726807,794.578552,589.242493,630.522705,4755.255859
2,2020-09-29+4week,342.680389,715.576111,593.250000,4945.966309,5339.697754,1349.367798,758.278259,521.538025,4993.927246,...,1177.326904,3039.084961,1950.623047,1831.388672,3741.208984,1896.309692,780.351074,557.865295,440.841919,4106.506348
3,2020-09-30+1week,390.140594,868.790955,654.249512,4147.734863,4355.702148,1869.893311,941.699280,767.465576,4441.870117,...,1319.733521,2133.965820,1715.614136,2122.352051,526.637146,2006.473633,755.671631,699.518616,815.783630,4990.385742
4,2020-09-30+2week,364.481262,1095.748901,655.594482,4084.541016,4864.045410,1578.052490,922.081299,675.718506,4643.827637,...,1384.297729,2170.050781,1878.565063,2017.009155,1753.656250,1883.904175,848.561768,662.040649,644.483154,4996.022949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,2021-11-03+2week,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
224,2021-11-03+4week,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
225,2021-11-04+1week,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
226,2021-11-04+2week,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


- LSTM 품목별로 개별 예측해서 실험 결과 0.4507457824 나옴
- 학습에 관여되는 값들이 추석 등이  큰 영향을 줄것으로 판단