# EEMD + LSTM

## Data Description

Input file: Historical Product Demand.csv

Description: CSV data file containing product demand for encoded product id's

Size of Data: (1048575, 5)

Features: Product_Code, Warehouse, Product_Category, Date, Order_Demand

Period: 2012-01-01 ~ 2017-01-09


---

In [1]:
# DataFrame
import pandas as pd
import numpy as np
import random
from datetime import datetime, date

# Preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

# Save the log
import os

# EEMD
from PyEMD import EEMD

# LSTM
import tensorflow as tf

from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation

from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MSE

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Metric 
# Metric
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

## Data Explore

In [2]:
# Data Loading
df = pd.read_csv('HPD_Augmented_0416.csv')
# convert the string to the datetype
df['Date'] = df['Date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
df

Unnamed: 0,Date,Product_Code,Product_Category,Order_Demand
0,2012-01-05 00:00:00,Product_0025,Category_005,1600.000000
1,2012-01-05 03:00:00,Product_0025,Category_005,1633.403702
2,2012-01-05 06:00:00,Product_0025,Category_005,1628.665789
3,2012-01-05 09:00:00,Product_0025,Category_005,1587.586651
4,2012-01-05 12:00:00,Product_0025,Category_005,1513.949924
...,...,...,...,...
116387,2016-12-26 12:00:00,Product_2004,Category_005,1810.945746
116388,2016-12-26 15:00:00,Product_2004,Category_005,1626.979543
116389,2016-12-26 18:00:00,Product_2004,Category_005,1420.229634
116390,2016-12-26 21:00:00,Product_2004,Category_005,1206.795489


In [3]:
print(df.info())
print('-------------------------')
print("")
print("The Number of unique")
print('-------------------------')
print('Product code:\t', df.Product_Code.nunique())
print('Category:\t', df.Product_Category.nunique())
print('-------------------------')
print("The Product Code:")
print("")
for i, code in enumerate(df['Product_Code'].unique()):
    print(i+1, code)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116392 entries, 0 to 116391
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Date              116392 non-null  datetime64[ns]
 1   Product_Code      116392 non-null  object        
 2   Product_Category  116392 non-null  object        
 3   Order_Demand      116392 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 3.6+ MB
None
-------------------------

The Number of unique
-------------------------
Product code:	 8
Category:	 5
-------------------------
The Product Code:

1 Product_0025
2 Product_0739
3 Product_0901
4 Product_1154
5 Product_1248
6 Product_1295
7 Product_1378
8 Product_2004


---

## EEMD
    * 시계열 그래프를 ensembled IMF (앙상블 내재모드 함수)로 분해
    * n 개의 eIMFs와  1개의 Residual 생성

In [4]:
# 수요 그래프를 n개의 앙상블된 내재모드함수(IMF)로 분해
# 그래프의 변동성이 클수록, IMF의 개수 증가
def eemd_fit(df, trials=100, max_imf=-1):
    
    # Define signal
    t = np.array(df['Date']) # 날짜
    s = np.array(df['Order_Demand']) # 수요량
    
    # EEMD 객체 생성
    eemd = EEMD(trials=trials) # trials: EMD 횟수
    
    # 극값을 감지하는 방법으로 parabolic 방법을 선택
    emd = eemd.EMD
    emd.extrema_detection="parabol"
    
    # eIMFs로 분해
    eIMFs = eemd.eemd(s, t, max_imf=max_imf) # max_imf: IMF 제한 개수(-1: 없음)
    nIMFs = eIMFs.shape[0] # eIMF의 개수
    
    # 분해된 eIMFs와 잔차를 변수에 할당
    imfs, residue = eemd.get_imfs_and_residue()
    
    # 앙상블 IMFs 들의 DataFrame 생성
    all_eIMFs_df = pd.DataFrame(eIMFs).transpose()
    all_eIMFs_df[nIMFs] = residue # residue 열 마지막 열로 추가
    #all_eIMFs_df.set_index(df['Date'], inplace=True) # 날짜를 index로 setting
    all_eIMFs_df.insert(0, 'Date', df['Date']) # Date 열 추가
    
    return all_eIMFs_df, nIMFs # eIMF+Residue들로 이루어진 df, eIMF(Residue포함)의 개수

### eIMFs 데이터프레임 추출

In [5]:
# eIMF들을 추출하여, Date와 y로 이루어진 데이터프레임 추출하고 딕셔너리에 저장
def extract_eIMFs(all_eIMFs_df, nIMFs):
    all_eIMFs_dict = {}
    # IMF개수+Residue(1) 만큼 반복
    for i in range(nIMFs+1):
        tmp_df = all_eIMFs_df[['Date', i]] # n번째 eIMF에 해당하는 날짜와 값 추출
        tmp_df.columns=['Date', 'y'] # i -> y 로 열이름 변경
        all_eIMFs_dict[f'eIMFs_{i}'] = tmp_df # n번째 eIMF 정보(마지막은 Residue) 딕셔너리에 저장
        
    return all_eIMFs_dict # {eIMFs_1: df1, eIMFs_2: df2, ...}

### Split the train and test set
- Input
     data: dataframe with dates and Demand data
     
- output
    - train:  2012-01-01 ~ 2015-06/30 
    - Valid:  2015-07-01 ~ 2015-12-31
    - test :  2016-01-01 ~ 2017-01-06 
    
     
- time_steps: # of the input time steps 
- for_periods: # of the output time steps 

In [6]:
def ts_train_val_test(eIMF_df, time_steps): 

    ts_train_end = len(eIMF_df[eIMF_df['Date']<'2015-07-01']) # train 데이터 종료 인덱스
    ts_val_end = len(eIMF_df[eIMF_df['Date']<'2016-01-01']) # validation 데이터 종료 인덱스
    ts = eIMF_df.filter(['y']).values # y(수요량) 값
    
    # Minmax로 0~1 사이에 값이 오도록 정규화
    sc = MinMaxScaler() # 객체 생성
    ts_scaled = sc.fit_transform(ts) # 전체 y값 정규화
    
    # Train Data
    ts_train_scaled = ts_scaled[:ts_train_end,:]

    X_train = [] 
    y_train = []
    for i in range(time_steps, ts_train_end): 
        X_train.append(ts_train_scaled[i-time_steps:i,0]) # time steps 만큼 sliding window
        y_train.append(ts_train_scaled[i,0])

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    
    # Reshape X_train for LSTM -> (batch_size, time_steps, features)
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1],1))

    # Validation Data
    ts_val_scaled = ts_scaled[ts_train_end : ts_val_end, :]

    X_val = []
    y_val = []
    for i in range(time_steps, len(ts_val_scaled)):
        X_val.append(ts_val_scaled[i-time_steps : i, 0])
        y_val.append(ts_val_scaled[i, 0])

    X_val = np.array(X_val)
    y_val = np.array(y_val)

    # Reshape X_val for LSTM -> (batch_size, time_steps, features)
    X_val = np.reshape(X_val, (X_val.shape[0], X_val.shape[1],1))
    
    # Test Data
    ts_test_scaled = ts_scaled[ts_val_end:,:]

    X_test = []
    y_test = eIMF_df.iloc[ts_val_end+time_steps:,:]
    y_test.loc[:, 'y_norm'] = ts_test_scaled[time_steps:].reshape(-1).copy()

    for i in range(time_steps, len(ts_test_scaled)):
        X_test.append(ts_test_scaled[i-time_steps : i, 0])
    
    X_test = np.array(X_test)
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1],1))
    
    return X_train, y_train, X_val, y_val, X_test, y_test, sc

### LSTM

In [7]:
def LSTM_model(X_train, y_train, X_val, y_val, X_test, sc, epochs=10):
    # LSTM 모델 객체 생성
    my_LSTM_model = Sequential() 
    
    # 첫 번째 LSTM 레이어 구성
    # 활성화 함수는 ReLU를 사용하며, return_sequences=True로 지정하여 다음 LSTM 레이어의 입력으로 사용할 수 있도록 함
    my_LSTM_model.add(LSTM(512, activation='relu',return_sequences=True, input_shape=(X_train.shape[1],1)))
    
    # 두 번째 LSTM 레이어 구성
    # 활성화 함수는 ReLU를 사용하며, return_sequences=False로 지정하여 마지막 LSTM 레이어임을 나타냄
    my_LSTM_model.add(LSTM(256, activation = 'relu',return_sequences=False))
    
    # Fully connected 레이어들 추가
    # 마지막 레이어에서는 출력의 unit 개수를 1로 설정하여 1개의 값을 출력
    my_LSTM_model.add(Dense(128))
    my_LSTM_model.add(Dense(64))
    my_LSTM_model.add(Dense(32))
    my_LSTM_model.add(Dense(1))
    
    # 모델 컴파일
    my_LSTM_model.compile(optimizer = "Adam", # Adam optimizer 사용
                         loss = 'mean_squared_error', # 손실 함수로는 평균 제곱 오차 사용
                          metrics=['mape','mae']) # 성능 지표로는 MAPE와 MAE를 사용
    #조기종료 조건
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # 모델 Fitting
    my_LSTM_model.fit(X_train, # 입력 데이터
                      y_train, # 출력 데이터
                      epochs = epochs, # epoch 수
                      batch_size = 16, # batch size
                      validation_data=(X_val, y_val),
                      callbacks=[early_stopping],# validation에 따른 조기종료
                      verbose = 1) # 학습 상태를 출력
    
    # Test 데이터 예측
    LSTM_prediction = my_LSTM_model.predict(X_test) # 예측값 얻기
    LSTM_prediction_normalized = LSTM_prediction # 예측값을 저장하되, normalize된 값 저장
    LSTM_prediction = sc.inverse_transform(LSTM_prediction) # denormalize된 예측값 저장
    
    # 모델 객체와 예측값 반환
    return my_LSTM_model, LSTM_prediction, LSTM_prediction_normalized

### EEMD+LSTM

In [8]:
def EEMD_LSTM(all_eIMFs_dict, time_steps, epochs):

    model_dict = {}
    pred_dict = {}
    
    # 모든 eIMF에 대해 LSTM 모델 학습 및 예측 실행
    for i in all_eIMFs_dict.keys():
        print(f'--------Total: 0~{len(all_eIMFs_dict)-1} eIMFs, Now: {i} --------')
        
        # 현재 eIMF 데이터 가져오기
        eIMF_df = all_eIMFs_dict[i]
        
        # 학습 데이터와 테스트 데이터 분리
        X_train, y_train, X_val, y_val, X_test, y_test, sc = ts_train_val_test(eIMF_df, time_steps)
        
        # LSTM 모델 학습 및 저장
        my_LSTM_model, LSTM_prediction, LSTM_prediction_normalized = LSTM_model(X_train, y_train, X_val, y_val, X_test, sc, epochs)
        model_dict[i] = my_LSTM_model # 딕셔너리에 모델 정보 저장
        
        # 예측 결과 저장
        y_test.reset_index(drop=True, inplace=True)
        pred_df = pd.DataFrame({'Pred': LSTM_prediction.reshape(-1) ,'Pred_norm': LSTM_prediction_normalized.reshape(-1)})
        res_df = pd.concat([y_test, pred_df], axis=1)
        res_df.set_index('Date', inplace=True)
        res_df = res_df.resample('D').first() # 증강된 데이터가 아닌, Actual값들과 비교
        pred_dict[i] = res_df
        
    # 모델과 예측값 딕셔너리 반환
    return model_dict, pred_dict

## Plot the result

In [9]:
def actual_pred_plot(product_code, pred_dict, all_result_df, metric_df, normalize=False):
    today = date.today()
    """
    Plot the actual vs predition and save the figure in the given directory
    """
    pred_dict['all_result'] = all_result_df
    
    save_path = os.path.join("Result", "EEMD+LSTM_Result", product_code+f'_{today.month:02d}{today.day:02d}')
    if normalize: save_path += "_normalized"
        
    for i, pred_df in enumerate(pred_dict.values()):
        img_n = len(pred_dict)
        title = f"Pred Actual Plot - ({i+1}/{len(pred_dict)-1})'s eIMF"
        actual = pred_df['y']
        pred = pred_df['Pred']
        save_name = f'{product_code}_eIMF_{i+1}'
        if i == img_n-1: # All result
            title = f"{product_code}-All Result"
            save_name = f'{product_code}_all_result'
        if normalize:
            title += "(Normalized)"
            actual = pred_df['y_norm']
            pred = pred_df['Pred_norm']
            
        plt.figure(figsize=(16, 8))
        plt.title(title, fontsize=20)
        plt.xlabel("Time", fontsize=14)
        plt.ylabel("Order Demand", fontsize=14)
        plt.plot(actual, label ='Actual', alpha=0.6)
        plt.plot(pred, label='Prediction', alpha=0.8)
        plt.legend(loc="upper right")
        
        # Plot 결과 저장
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # save the figure
        today_date = f'_{today.month:02d}{today.day:02d}'
        plt.savefig(os.path.join(save_path, save_name+'.png'))
    metric_df.to_csv(os.path.join(save_path, f'{product_code}_Metric.csv'))
    del pred_dict['all_result']
        
    plt.close('all') # close all figures to free up memory

## Metrics

In [10]:
# Model Metric
def mase(training_series, testing_series, prediction_series):
    n = training_series.shape[0]
    d = np.abs(np.diff(training_series)).sum() / (n-1)
    
    errors = np.abs(testing_series - prediction_series)
    return errors.mean() / d

# Model Metric
def mape(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.mean(np.abs((actual - pred) / (actual+1)))

def nrmse(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred, squared=False)
    target_mean = np.mean(y_true)
    nrmse = mse / target_mean
    return nrmse

In [12]:
def calculate_metrics(pred_df, normalize):
    # 계산된 메트릭을 저장하기 위해 데이터프레임 초기화
    metric_df = pd.DataFrame(columns=['MAPE', 'RMSE', 'MAE', 'NRMSE'])

    # 정규화 옵션이 True인 경우 정규화된 데이터 사용, 그렇지 않으면 원래 데이터 사용
    if normalize:
        actual = pred_df['y_norm']
        pred = pred_df['Pred_norm']
    else:
        actual = pred_df['y']
        pred = pred_df['Pred']

    # 메트릭 계산
    # MASE = mase(np.array(train_series), np.array(actual), pred) 
    MAPE = mape(actual, pred) 
    RMSE = mean_squared_error(actual, pred)**0.5 
    MAE = mean_absolute_error(actual,pred) 
    NRMSE = nrmse(actual,pred) 
    # RMSLE = mean_squared_log_error(actual, pred)**0.5 

    # 계산된 메트릭을 데이터프레임에 추가
    tmp_df = pd.DataFrame({'MAPE':[round(MAPE, 4)],
                           'RMSE':[round(RMSE, 4)],
                           'MAE':[round(MAE, 4)],
                           'NRMSE':[round(NRMSE, 4)]})

    # 메트릭 데이터프레임에 결과 추가
    metric_df = pd.concat([metric_df, tmp_df])
    return metric_df

---

## Check the Result

In [13]:
def make_metric_df(product_code, pred_dict, all_result_df, normalize):
    today = date.today()

    metric_df = pd.DataFrame(columns=['MAPE', 'RMSE', 'MAE', 'NRMSE'])
    for i, pred_df in pred_dict.items():
        imf_df = calculate_metrics(pred_df, normalize=normalize)
        metric_df = pd.concat([metric_df, imf_df])
    
    imf_idx = pd.Index(['eIMF_'+str(i+1) for i in range(len(pred_dict))]) # changed result_dict to pred_dict
    metric_df.index = imf_idx # Assign the created index to metric_df
    metric_df = pd.concat([metric_df, calculate_metrics(all_result_df, normalize=normalize)], axis=0)
    metric_df = metric_df.rename(index={metric_df.index[-1]: 'All'}) # 마지막 행은 all
    
    return metric_df

In [14]:
def make_all_result_df(pred_dict):
    all_df = pd.DataFrame()
    for tmp_df in pred_dict.values():
        all_df = pd.concat([all_df, tmp_df], axis=1)
    pred_df = all_df['Pred'].sum(axis=1)
    actual_df = all_df['y'].sum(axis=1)
    
    all_result_df = pd.DataFrame({'Pred': pred_df, 'y': actual_df})
    all_result_df.loc[all_result_df['Pred']<0, 'Pred']=0 # 음수 예측 값은 0으로 대치
    
    # 날짜(Date) 열은 정규화하지 않으므로 제외
    result_norm = all_result_df[['Pred', 'y']]
    
    # MinMaxScaler를 이용하여 정규화합니다.
    scaler = MinMaxScaler()
    normalized_data = scaler.fit_transform(result_norm)
    
    # 정규화된 데이터를 데이터 프레임에 반영합니다.
    all_result_df['Pred_norm'] = normalized_data[:,0]
    all_result_df['y_norm'] = normalized_data[:,1]
    return all_result_df

In [15]:
def execute_EEMD_LSTM(product_code, eemd_trials=100, time_steps=30, epochs=20):

    product_code = product_code # 예측하고자 하는 코드 입력
    product_df = df[df['Product_Code']== product_code].reset_index(drop=True)
    
    # EEMD 수행
    all_eIMFs_df, nIMFs = eemd_fit(product_df, eemd_trials)
    # EEMD 결과에서 각 eIMFs' DF 추출
    all_eIMFs_dict = extract_eIMFs(all_eIMFs_df, nIMFs)
    # EEMD+LSTM 실행
    model_dict, pred_dict = EEMD_LSTM(all_eIMFs_dict, time_steps, epochs) #dictionary, time_steps, epochs
    all_result_df = make_all_result_df(pred_dict)
    
    metric_df_norm = make_metric_df(product_code, pred_dict, all_result_df, True)
    metric_df = make_metric_df(product_code, pred_dict, all_result_df, False)
    
    actual_pred_plot(product_code, pred_dict, all_result_df, metric_df_norm, True)
    actual_pred_plot(product_code, pred_dict, all_result_df, metric_df, False)
    
    return metric_df

---

## Whole Process
    - product_code에 str으로 예측하고자 하는 코드를 입력
    - ['Product_0025', 'Product_0739', 'Product_0901', 'Product_1154',
       'Product_1248', 'Product_1295', 'Product_1378', 'Product_2004']

In [16]:
product_code = 'Product_2004' # 예측하고자 하는 코드 입력
product_df = df[df['Product_Code']== product_code].reset_index(drop=True)

# EEMD 수행
all_eIMFs_df, nIMFs = eemd_fit(product_df, 1)
# EEMD 결과에서 각 eIMFs' DF 추출
all_eIMFs_dict = extract_eIMFs(all_eIMFs_df, nIMFs)
# EEMD+LSTM 실행
model_dict, pred_dict = EEMD_LSTM(all_eIMFs_dict, 1, 1) #dictionary, time_steps, epochs
all_result_df = make_all_result_df(pred_dict)

metric_df_norm = make_metric_df(product_code, pred_dict, all_result_df, True)
metric_df = make_metric_df(product_code, pred_dict, all_result_df, False)

actual_pred_plot(product_code, pred_dict, all_result_df, metric_df_norm, True)
actual_pred_plot(product_code, pred_dict, all_result_df, metric_df, False)

--------Total: 0~13 eIMFs, Now: eIMFs_0 --------
--------Total: 0~13 eIMFs, Now: eIMFs_1 --------
--------Total: 0~13 eIMFs, Now: eIMFs_2 --------
--------Total: 0~13 eIMFs, Now: eIMFs_3 --------
--------Total: 0~13 eIMFs, Now: eIMFs_4 --------
--------Total: 0~13 eIMFs, Now: eIMFs_5 --------
--------Total: 0~13 eIMFs, Now: eIMFs_6 --------
--------Total: 0~13 eIMFs, Now: eIMFs_7 --------
--------Total: 0~13 eIMFs, Now: eIMFs_8 --------
--------Total: 0~13 eIMFs, Now: eIMFs_9 --------
--------Total: 0~13 eIMFs, Now: eIMFs_10 --------
--------Total: 0~13 eIMFs, Now: eIMFs_11 --------
--------Total: 0~13 eIMFs, Now: eIMFs_12 --------
--------Total: 0~13 eIMFs, Now: eIMFs_13 --------


In [21]:
metric_df

Unnamed: 0,MAPE,RMSE,MAE,NRMSE
eIMF_1,1.323,471.3171,394.0095,-16.0708
eIMF_2,3.4228,350.2988,285.0902,-10.0685
eIMF_3,1.4678,155.0003,124.725,-47.6181
eIMF_4,2.441,144.03,112.8649,-13.8879
eIMF_5,0.7851,108.7938,85.6479,-7.337
eIMF_6,1.1568,50.591,41.025,-4.5935
eIMF_7,0.7098,18.5522,13.5753,0.7228
eIMF_8,0.1305,8.0692,6.6548,-0.6157
eIMF_9,0.147,2.1782,1.7501,-0.1266
eIMF_10,0.0256,0.2309,0.1874,-0.3101


In [27]:
pred_dict['eIMFs_0']

Unnamed: 0_level_0,y,y_norm,Pred,Pred_norm
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-01,445.506295,0.625223,122.989777,0.527447
2016-01-02,-325.590043,0.391453,148.028610,0.535038
2016-01-03,307.398305,0.583353,226.449707,0.558812
2016-01-04,-689.248128,0.281205,-165.192429,0.440080
2016-01-05,305.847365,0.582883,233.936157,0.561082
...,...,...,...,...
2016-12-23,-481.474600,0.344195,-60.335110,0.471869
2016-12-24,320.786076,0.587412,143.890594,0.533783
2016-12-25,218.254394,0.556328,247.288605,0.565130
2016-12-26,-574.245618,0.316070,-68.714745,0.469329


In [30]:
actual = pred_dict['eIMFs_0']['y']
pred = pred_dict['eIMFs_0']['Pred']
print(mape(actual,pred))
print(np.sqrt(mean_squared_error(actual,pred)))
print(mean_absolute_error(actual,pred))
print(nrmse(actual,pred))

1.323025421026068
471.3171351373042
394.0094603046367
-16.070803094901144


In [44]:
import time
start_time = time.time()

execute_EEMD_LSTM('Product_1378')

elapsed_time_seconds = time.time() - start_time
elapsed_time_minutes = elapsed_time_seconds / 60
print("실행 시간: {:.2f} 분".format(elapsed_time_minutes))

--------Total: 0~12 eIMFs, Now: eIMFs_0 --------


Exception ignored in: <function UniquePtr.__del__ at 0x00000236F55519D0>
Traceback (most recent call last):
  File "C:\Users\7info\anaconda3\envs\Capstone\lib\site-packages\tensorflow\python\framework\c_api_util.py", line 74, in __del__
    self.deleter(obj)
KeyboardInterrupt: 


125/637 [====>.........................] - ETA: 10s - loss: 0.0310 - mape: 36.6704 - mae: 0.1441

KeyboardInterrupt: 