# EEMD + LSTM

In [None]:
# DataFrame
import pandas as pd
import numpy as np
import random
from datetime import date
import matplotlib.dates as mdates
import matplotlib

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Visualization
import matplotlib.pyplot as plt
import warnings

import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

#Save the log
import os
import time
import pickle
import tempfile

# EEMD
from PyEMD import EEMD

# LSTM
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential

# Optimize
from keras_tuner.tuners import RandomSearch

# Metric 
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [None]:
# set the seed
def set_seed(seed: int):
    """Seed every random number generator used in this notebook.

    Writes ``PYTHONHASHSEED`` into the process environment and seeds the
    ``random``, NumPy and TensorFlow generators with the same value.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

In [None]:
# Plot styling: serif font family for all text, and `axes.unicode_minus`
# switched off so minus signs are drawn with a plain hyphen.
plt.rcParams.update({'font.family': 'Serif'})
matplotlib.rcParams.update({'axes.unicode_minus': False})

In [None]:
def eemd_fit(df):
    """Decompose ``df['y']`` with EEMD, plot every component, and return them.

    Args:
        df: DataFrame with a ``'Date'`` column (used as the x-axis) and a
            ``'y'`` column (the signal to decompose).

    Returns:
        tuple: ``(all_eIMFs_df, nIMFs)``. ``all_eIMFs_df`` has a leading
        ``'Date'`` column, integer columns ``0 .. nIMFs-1`` holding the eIMFs
        and column ``nIMFs`` holding the residue reported by
        ``get_imfs_and_residue()``. ``nIMFs`` is the number of eIMFs
        (the residue is not counted).
    """
    # Define signal
    t = np.array(df['Date'])  # x-axis
    s = np.array(df['y'])  # y-axis

    eemd = EEMD()
    eemd.noise_seed(1234)  # fixed noise seed so repeated runs decompose identically

    eIMFs = eemd.eemd(s, t, max_imf=-1)
    nIMFs = eIMFs.shape[0]

    # Only the residue is needed here; the IMFs are already in `eIMFs`.
    _, residue = eemd.get_imfs_and_residue()

    # One column per eIMF (rows = time steps), then the residue as the last column.
    all_eIMFs_df = pd.DataFrame(eIMFs).transpose()
    all_eIMFs_df[nIMFs] = residue
    # Insert the dates positionally (plain array): inserting the Series would
    # align on index labels and produce NaT for a `df` whose index is not 0..n-1.
    all_eIMFs_df.insert(0, 'Date', t)

    plt.figure(figsize=(12, nIMFs*2), dpi=300)
    for i in range(nIMFs):
        plt.subplot(nIMFs+1, 1, i+1)
        plt.plot(t, all_eIMFs_df[i], 'g')
        plt.title('IMF '+str(i+1), fontsize=10)

    # Residue plot
    plt.subplot(nIMFs+1, 1, nIMFs+1)
    plt.plot(t, all_eIMFs_df[nIMFs], 'r')
    plt.title('Residue', fontsize=10)

    plt.tight_layout()
    plt.show()

    return all_eIMFs_df, nIMFs  # eIMF+Residue

In [None]:
def extract_eIMFs(all_eIMFs_df, nIMFs):
    """Split the wide component table into one ``['Date', 'y']`` frame per component.

    Args:
        all_eIMFs_df: DataFrame with a ``'Date'`` column and integer columns
            ``0 .. nIMFs`` (one per component).
        nIMFs: Highest component column label; columns ``0`` through
            ``nIMFs`` inclusive are extracted.

    Returns:
        dict: ``{'eIMFs_0': df0, 'eIMFs_1': df1, ...}`` where each value has
        the columns ``['Date', 'y']`` and the index of ``all_eIMFs_df``.
    """
    return {
        f'eIMFs_{component}': pd.DataFrame({
            'Date': all_eIMFs_df['Date'],
            'y': all_eIMFs_df[component],
        })
        for component in range(nIMFs + 1)
    }

In [None]:
def split_data(product_df, eIMF_df, time_steps, split_date='2022-07-01'):
    """Build scaled sliding-window train/test sets for one EEMD component.

    The number of rows dated before ``split_date`` is used as a positional
    cut, so ``product_df`` is expected to be in chronological order. Every
    column except ``'Date'``, ``'Product'`` and ``'년월'`` is used as a
    feature, and the ``'y'`` feature is replaced by (or added from) the
    component series ``eIMF_df['y']``. The scaler is fitted on the training
    rows only and then applied to the test rows.

    Side effects:
        Sets the module-level ``n_features`` (read by ``build_model``) to the
        number of columns actually fed to the network.

    Args:
        product_df: DataFrame for one product; must contain ``'Date'``,
            ``'Product'`` and ``'년월'`` columns.
        eIMF_df: DataFrame with a ``'y'`` column, index-aligned with
            ``product_df``.
        time_steps: Number of past rows in each input window.
        split_date: Rows with ``Date`` strictly before this value are training
            rows. Defaults to the previously hard-coded ``'2022-07-01'``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test, sc, target_idx)`` where
        ``X_*`` have shape ``(samples, time_steps, n_features)``, ``y_train``
        is the scaled target, ``y_test`` is a DataFrame of the test rows with
        ``'y'`` (component value) and ``'y_norm'`` (scaled value) columns,
        ``sc`` is the fitted ``MinMaxScaler`` and ``target_idx`` is the column
        position of ``'y'`` in the scaled matrix.

    Raises:
        ValueError: If the training or test span has no more than
            ``time_steps`` rows, so no window can be built.
    """
    global n_features

    train_end = int((product_df['Date'] < split_date).sum())

    features = product_df.drop(['Date', 'Product', '년월'], axis=1).columns.tolist()

    filtered_df = product_df.filter(features).copy()
    filtered_df['y'] = eIMF_df['y']
    target_idx = filtered_df.columns.tolist().index('y')

    # Count columns after 'y' is in place so the value stays correct even when
    # product_df itself has no 'y' column.
    n_features = filtered_df.shape[1]

    test_rows = len(filtered_df) - train_end
    if train_end <= time_steps or test_rows <= time_steps:
        raise ValueError(
            f"Need more than time_steps={time_steps} rows on each side of "
            f"split_date={split_date!r}; got {train_end} training rows and "
            f"{test_rows} test rows."
        )

    # Fit on the training span only so no test information leaks into scaling.
    sc = MinMaxScaler()
    train_scaled = sc.fit_transform(filtered_df.iloc[:train_end, :])
    test_scaled = sc.transform(filtered_df.iloc[train_end:, :])

    # Each sample is the `time_steps` rows preceding the row being predicted.
    X_train = np.array([train_scaled[i - time_steps:i, :]
                        for i in range(time_steps, train_end)])
    y_train = train_scaled[time_steps:, target_idx].copy()

    X_test = np.array([test_scaled[i - time_steps:i, :]
                       for i in range(time_steps, len(test_scaled))])

    # The first `time_steps` test rows only serve as history for the first window.
    y_test = product_df.iloc[train_end + time_steps:].copy()
    y_test['y'] = eIMF_df['y'].iloc[train_end + time_steps:]
    y_test['y_norm'] = test_scaled[time_steps:, target_idx]

    return X_train, y_train, X_test, y_test, sc, target_idx

In [None]:
def build_model(hp):
    """Build and compile the two-layer LSTM regressor for a KerasTuner trial.

    The tuned hyperparameters are the unit counts of both LSTM layers and of
    the hidden dense layer, plus the Adam learning rate. The input feature
    count comes from the module-level ``n_features``.

    Args:
        hp: KerasTuner hyperparameter container supplying ``Int``/``Choice``.

    Returns:
        A compiled ``Sequential`` model with a single-unit output layer,
        trained with mean squared error.
    """
    first_lstm_units = hp.Int('units_1', min_value=128, max_value=320, step=64)
    second_lstm_units = hp.Int('units_2', min_value=64, max_value=256, step=32)
    hidden_dense_units = hp.Int('dense_unit', min_value=16, max_value=128, step=16)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])

    model = Sequential([
        # Sequence length is left open (None); only the feature count is fixed.
        LSTM(units=first_lstm_units,
             activation='tanh',
             return_sequences=True,
             input_shape=(None, n_features)),
        LSTM(units=second_lstm_units,
             activation='tanh',
             return_sequences=False),
        Dense(units=hidden_dense_units,
              activation='tanh'),
        Dense(1),
    ])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate),
                  loss='mean_squared_error',
                  metrics=['mse'])

    return model

def optimize_model(X_train, y_train, X_test, sc, epochs, trials, target_idx):
    """Random-search the LSTM hyperparameters and predict with the best model.

    Args:
        X_train: Training windows passed to ``tuner.search``.
        y_train: Scaled training targets.
        X_test: Test windows to predict on.
        sc: Fitted scaler used to map predictions back to the original scale.
        epochs: Epochs per tuner trial.
        trials: Maximum number of tuner trials.
        target_idx: Column position of the target in the scaler's feature matrix.

    Returns:
        tuple: ``(best_model, pred, pred_norm)`` where ``pred_norm`` is the raw
        (scaled) model output and ``pred`` is the 1-D prediction on the
        original scale.
    """
    # The tuner writes its trial files into `temp_dir`, so the search and the
    # retrieval of the best model must finish before the directory is removed.
    # Previously the `with` block closed right after constructing the tuner,
    # so the search ran against an already-deleted path that was never cleaned up.
    with tempfile.TemporaryDirectory() as temp_dir:
        tuner = RandomSearch(
            build_model,
            # NOTE(review): the objective is the training loss and no validation
            # data is passed to `search`; consider tuning on a validation loss.
            objective='loss',
            max_trials=trials,
            directory=temp_dir,
            project_name='temp_project')

        tuner.search_space_summary()
        tuner.search(X_train, y_train,
                     epochs=epochs,
                     batch_size=8)

        tuner.results_summary()

        best_model = tuner.get_best_models(num_models=1)[0]

        # Predict while the tuner's files still exist on disk.
        pred_norm = best_model.predict(X_test)

    # The scaler expects a full feature matrix: place the predictions in the
    # target column of a zero matrix, invert, and read that column back.
    # The width comes from the scaler itself rather than the `n_features` global.
    pred_expanded = np.zeros((pred_norm.shape[0], sc.n_features_in_))
    pred_expanded[:, target_idx] = pred_norm.ravel()
    pred = sc.inverse_transform(pred_expanded)[:, target_idx]

    best_model.summary()

    return best_model, pred, pred_norm

In [None]:
def save_model(best_model, product_code, idx):
    """Save ``best_model`` to ``./Result/EEMD+LSTM/Model/{product_code}_{idx}.h5``.

    Args:
        best_model: Object exposing ``save(path)`` (the trained model).
        product_code: Product identifier used in the file name.
        idx: Component index used in the file name.
    """
    path = f'./Result/EEMD+LSTM/Model/{product_code}_{idx}.h5'
    # Create the target directory up front instead of relying on the saving
    # backend to do it; a fresh checkout has no Result/ tree yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    best_model.save(path)
    return

def use_model(X_test, product_code, idx, target_idx, sc):
    """Load a previously saved model and predict on ``X_test``.

    The model is read from ``./Result/EEMD+LSTM/Model/{product_code}_{idx}.h5``.

    Args:
        X_test: Test windows to predict on.
        product_code: Product identifier used in the file name.
        idx: Component index used in the file name.
        target_idx: Column position of the target in the scaler's feature matrix.
        sc: Fitted scaler used to map predictions back to the original scale.

    Returns:
        tuple: ``(best_model, pred, pred_norm)`` where ``pred_norm`` is the raw
        (scaled) model output and ``pred`` is the 1-D prediction on the
        original scale.
    """
    path = f'./Result/EEMD+LSTM/Model/{product_code}_{idx}.h5'
    best_model = tf.keras.models.load_model(path)

    pred_norm = best_model.predict(X_test)

    # The scaler expects a full feature matrix: place the predictions in the
    # target column of a zero matrix, invert, and read that column back.
    # The width is taken from the scaler itself so this function no longer
    # depends on the `n_features` global that only exists after split_data ran.
    pred_expanded = np.zeros((pred_norm.shape[0], sc.n_features_in_))
    pred_expanded[:, target_idx] = pred_norm.ravel()
    pred = sc.inverse_transform(pred_expanded)[:, target_idx]

    best_model.summary()

    return best_model, pred, pred_norm

In [None]:
def EEMD_LSTM(product_df, all_eIMFs_dict, time_steps, epochs, trials, saved_model: bool):
    """Fit (or load) one LSTM per EEMD component and collect its test predictions.

    Args:
        product_df: DataFrame for a single product; its first unique
            ``'Product'`` value is used as the product code.
        all_eIMFs_dict: Mapping of component name to a ``['Date', 'y']`` frame.
        time_steps: Window length passed to ``split_data``.
        epochs: Epoch budget for the first component; see the schedule below.
        trials: Tuner trials per component when training.
        saved_model: If true, load each component's model from disk via
            ``use_model``; otherwise tune a new one and save it.

    Returns:
        dict: component name -> DataFrame indexed by ``Date`` holding the test
        rows together with ``'y'``, ``'y_norm'``, ``'Pred'`` and ``'Pred_norm'``.
    """
    initial_epochs = epochs
    product_code = product_df['Product'].unique()[0]
    n_components = len(all_eIMFs_dict)
    pred_dict = {}

    for idx, (name, eIMF_df) in enumerate(all_eIMFs_dict.items()):
        print(f'--------Total: 0~{len(all_eIMFs_dict)-1} eIMFs, Now: {name} --------')

        X_train, y_train, X_test, y_test, sc, target_idx = split_data(product_df, eIMF_df, time_steps)

        if not saved_model:
            # Tune a fresh model and persist it for later runs.
            best_model, pred, pred_norm = optimize_model(X_train, y_train, X_test, sc, epochs, trials, target_idx)
            save_model(best_model, product_code, idx)
        else:
            # Reuse the model stored by an earlier run.
            best_model, pred, pred_norm = use_model(X_test, product_code, idx, target_idx, sc)

        # Epoch schedule for the NEXT component: decay by 20% each step, except
        # that the last component gets the full initial budget again.
        if idx == n_components - 2:
            epochs = initial_epochs
        else:
            epochs = max(1, round(epochs * 0.8))

        # Put the predictions next to the test rows, positionally.
        y_test = y_test.reset_index(drop=True)
        pred_df = pd.DataFrame({'Pred': pred.reshape(-1), 'Pred_norm': pred_norm.reshape(-1)})
        res_df = pd.concat([y_test, pred_df], axis=1).set_index('Date')
        res_df.index = pd.to_datetime(res_df.index)

        pred_dict[name] = res_df

    return pred_dict

In [None]:
def make_all_result_df(pred_dict):
    """Sum the per-component predictions and actuals into one result frame.

    Args:
        pred_dict: Mapping of component name to a DataFrame that has ``'Pred'``
            and ``'y'`` columns; frames are aligned on their index.

    Returns:
        DataFrame with columns ``'Pred'`` (sum of component predictions, with
        negative totals replaced by 0) and ``'y'`` (sum of component actuals).

    Raises:
        ValueError: If ``pred_dict`` is empty.
    """
    frames = list(pred_dict.values())
    if not frames:
        raise ValueError("pred_dict is empty; there are no component predictions to combine")

    # Concatenate only the needed column from each frame. This always yields a
    # DataFrame (even for a single component, where selecting 'Pred' from one
    # wide concatenation gave a Series and `.sum(axis=1)` failed) and avoids
    # growing a frame from an empty DataFrame inside a loop.
    pred_sum = pd.concat([frame['Pred'] for frame in frames], axis=1).sum(axis=1)
    actual_sum = pd.concat([frame['y'] for frame in frames], axis=1).sum(axis=1)

    all_result_df = pd.DataFrame({'Pred': pred_sum, 'y': actual_sum})
    # Negative totals are clamped to zero.
    all_result_df.loc[all_result_df['Pred'] < 0, 'Pred'] = 0

    return all_result_df

In [None]:
def actual_pred_plot(product_code, pred_dict, all_result_df, metric_df):
    """
    Plot actual vs. prediction for every component and for the aggregate, and
    save the figures, the metric table, the aggregate result and the
    per-component predictions under ``Result/EEMD+LSTM/<product_code>``.

    Args:
        product_code: Product identifier; used for the output directory, file
            names and the aggregate plot title.
        pred_dict: Mapping of component name to a Date-indexed DataFrame with
            ``'y'`` and ``'Pred'`` columns. It is pickled as-is and not modified.
        all_result_df: Aggregate frame with ``'y'`` and ``'Pred'`` columns.
        metric_df: Metric table written to ``<product_code>_metric.csv``.
    """
    save_path = os.path.join("Result", "EEMD+LSTM", product_code)
    # Create the output directory once, before anything is written.
    os.makedirs(save_path, exist_ok=True)

    # Iterate over a local list (components followed by the aggregate) rather
    # than temporarily inserting 'all_result' into the caller's dict, which
    # left the dict polluted whenever plotting raised part-way through.
    frames = list(pred_dict.values()) + [all_result_df]
    img_n = len(frames)

    for i, res_df in enumerate(frames):
        if i == img_n - 1:  # All result
            title = f"{product_code}-All Result"
            save_name = f'{product_code}_all_result'
        else:
            # The last component before the aggregate is titled as the residue;
            # its file name still follows the eIMF numbering.
            title = "Residue" if i == img_n - 2 else f"eIMF{i+1}"
            save_name = f'{product_code}_eIMF_{i+1}'

        actual = res_df['y']
        pred = res_df['Pred']

        # Pred-Actual Plot
        fig = plt.figure(figsize=(16, 8), dpi=300)
        plt.title(title, fontsize=20)
        plt.xlabel("Date", fontsize=14)
        plt.ylabel("Order Demand", fontsize=14)
        plt.plot(res_df.index, actual, label ='Actual', color='r')
        plt.plot(res_df.index, pred, label='Prediction',color='b')

        plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        plt.legend(loc="upper right")

        plt.savefig(os.path.join(save_path, save_name+'.png'), dpi=300)
        plt.show()
        # Release each high-resolution figure as soon as it has been saved and shown.
        plt.close(fig)

    metric_df.to_csv(os.path.join(save_path, f'{product_code}_metric.csv'), encoding="utf-8-sig")
    all_result_df.to_csv(os.path.join(save_path, f'{product_code}_total_result.csv'), encoding="utf-8-sig")

    file_path = os.path.join(save_path, f'{product_code}_eIMF_dict.pkl')
    with open(file_path, 'wb') as file:
        pickle.dump(pred_dict, file)

    plt.close('all')  # close all figures to free up memory

In [None]:
def mape(actual, pred):
    """Return the mean absolute relative error with a ``+1`` offset denominator.

    Computes ``mean(|actual - pred| / |actual + 1|)`` as a fraction (it is not
    multiplied by 100). The offset presumably guards against division by zero
    when ``actual`` contains zeros -- confirm with the author.

    Args:
        actual: Array-like of observed values.
        pred: Array-like of predicted values, same length as ``actual``.
    """
    truth = np.asarray(actual)
    forecast = np.asarray(pred)
    relative_error = (truth - forecast) / (truth + 1)
    return np.mean(np.abs(relative_error))

def nrmse(y_true, y_pred):
    """Return the RMSE of the predictions divided by the mean of ``y_true``."""
    rmse = root_mean_squared_error(y_true, y_pred)
    return rmse / np.mean(y_true)

def nmae(y_true, y_pred):
    """Return the MAE of the predictions divided by the mean of ``y_true``."""
    absolute_error = mean_absolute_error(y_true, y_pred)
    return absolute_error / np.mean(y_true)

In [None]:
def calculate_metrics(pred_df):
    """Compute the evaluation metrics for one actual/prediction frame.

    Args:
        pred_df: DataFrame with ``'y'`` (actual) and ``'Pred'`` (prediction)
            columns.

    Returns:
        Single-row DataFrame with the columns ``MAPE``, ``RMSE``, ``MAE``,
        ``NRMSE``, ``NMAE`` and ``R2``, each rounded to 4 decimals.
    """
    actual = pred_df['y']
    pred = pred_df['Pred']

    scores = {
        'MAPE': mape(actual, pred),
        'RMSE': root_mean_squared_error(actual, pred),
        'MAE': mean_absolute_error(actual, pred),
        'NRMSE': nrmse(actual, pred),
        'NMAE': nmae(actual, pred),
        'R2': r2_score(actual, pred),
    }

    # Build the one-row frame directly. Concatenating it onto an empty
    # DataFrame (as before) relies on deprecated pandas behaviour and can
    # change the resulting column dtypes.
    return pd.DataFrame({name: [round(value, 4)] for name, value in scores.items()})

In [None]:
def make_metric_df(pred_dict, all_result_df):
    """Assemble one metric row per component plus an ``'All'`` row.

    Args:
        pred_dict: Mapping of component name to a DataFrame accepted by
            ``calculate_metrics``. Rows are labelled ``eIMF_1 .. eIMF_n`` in
            iteration order, whatever the dict keys are.
        all_result_df: Aggregate actual/prediction frame; its metrics become
            the final row, labelled ``'All'``.

    Returns:
        DataFrame of metrics indexed by ``eIMF_1 .. eIMF_n`` followed by ``'All'``.
    """
    component_rows = [calculate_metrics(pred_df) for pred_df in pred_dict.values()]
    labels = [f'eIMF_{position + 1}' for position in range(len(component_rows))]

    # Concatenate once instead of repeatedly appending to an initially empty
    # DataFrame, which relies on deprecated pandas behaviour.
    metric_df = pd.concat(component_rows + [calculate_metrics(all_result_df)], axis=0)
    metric_df.index = labels + ['All']

    return metric_df

In [None]:
def execute_EEMD_LSTM(product_code, time_steps=30, epochs=50, optimize_trials=5, saved_model=True):
    """Run the full EEMD + LSTM pipeline for one product and return its metrics.

    Reads the product's rows from the module-level ``df``, decomposes the
    series, predicts every component, aggregates the predictions, evaluates
    them, and writes plots and result files via ``actual_pred_plot``.

    Args:
        product_code: Value of the ``'Product'`` column to process.
        time_steps: Window length for the LSTM inputs.
        epochs: Initial epoch budget per tuner trial.
        optimize_trials: Tuner trials per component.
        saved_model: If true (the default, matching the previous hard-coded
            behaviour) load the stored per-component models; pass ``False`` to
            tune and save new ones, e.g. on a first run when none exist yet.

    Returns:
        DataFrame of metrics per component plus an ``'All'`` row.
    """
    start_time = time.time()
    product_df = df[df['Product'] == product_code].reset_index(drop=True)

    all_eIMFs_df, nIMFs = eemd_fit(product_df)
    all_eIMFs_dict = extract_eIMFs(all_eIMFs_df, nIMFs)

    pred_dict = EEMD_LSTM(product_df, all_eIMFs_dict, time_steps, epochs,
                          optimize_trials, saved_model=saved_model)
    all_result_df = make_all_result_df(pred_dict)

    # Evaluate the metrics
    metric_df = make_metric_df(pred_dict, all_result_df)
    # Pred_Actual Plot
    actual_pred_plot(product_code, pred_dict, all_result_df, metric_df)

    # Report the elapsed run time in minutes
    elapsed_time_seconds = time.time() - start_time
    elapsed_time_minutes = elapsed_time_seconds / 60
    print("실행 시간: {:.2f} 분".format(elapsed_time_minutes))
    return metric_df

---

In [None]:
# Load the dataset and convert its 'Date' column to datetimes.
df = pd.read_csv("../Data/dataset.csv").assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

In [None]:
set_seed(1234)

target_code = ['Office Product', 'Packaging material', 'Pharmaceuticals']

# Collect each product's metric table and concatenate once at the end;
# appending to an initially empty DataFrame relies on deprecated pandas behaviour.
metric_frames = []
for code in target_code:

    print("==================================")
    print(f"============ { code } ============")
    print("==================================")

    metric_frames.append(execute_EEMD_LSTM(code))

all_metric = pd.concat(metric_frames)

# Keep only each product's aggregate ('All') row and label it with the product.
# The list selector always yields a DataFrame, so this also works when
# `target_code` holds a single product.
prod_metric_df = all_metric.loc[['All']].copy()
prod_metric_df.index = target_code