# EEMD + ARIMA

## Data Description

Input file: Historical Product Demand.csv

Description: CSV data file containing product demand for encoded product IDs

Size of Data: (1048575, 5)

Features: Product_Code, Warehouse, Product_Category, Date, Order_Demand

Period: 2012-01-01 ~ 2017-01-09


---

In [198]:
# DataFrame
import pandas as pd
import numpy as np
import random
from datetime import datetime, date

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore')

# Save the log
import os
import pickle
import time
# EEMD
from PyEMD import EEMD

# ARIMA
from pmdarima.arima import auto_arima

# Metric 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

## Data Explore

In [199]:
# Read the demand table from disk
df = pd.read_csv('HPD_0416.csv')
# Replace the textual 'Date' column with parsed datetime values
df = df.assign(Date=pd.to_datetime(df['Date']))
df

Unnamed: 0,Date,Product_Code,Product_Category,Order_Demand
0,2012-01-05,Product_0025,Category_005,1600.0
1,2012-01-06,Product_0025,Category_005,1000.0
2,2012-01-07,Product_0025,Category_005,0.0
3,2012-01-08,Product_0025,Category_005,0.0
4,2012-01-09,Product_0025,Category_005,0.0
...,...,...,...,...
14551,2016-12-23,Product_2004,Category_005,0.0
14552,2016-12-24,Product_2004,Category_005,0.0
14553,2016-12-25,Product_2004,Category_005,0.0
14554,2016-12-26,Product_2004,Category_005,2000.0


In [200]:
# Print the schema, the distinct-value counts, then a numbered list of product codes
separator = '-------------------------'
print(df.info())
print(separator)
print("")
print("The Number of unique")
print(separator)
print('Product code:\t', df.Product_Code.nunique())
print('Category:\t', df.Product_Category.nunique())
print(separator)
print("The Product Code:")
print("")
for position, product in enumerate(df['Product_Code'].unique(), start=1):
    print(position, product)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14556 entries, 0 to 14555
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              14556 non-null  datetime64[ns]
 1   Product_Code      14556 non-null  object        
 2   Product_Category  14556 non-null  object        
 3   Order_Demand      14556 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 455.0+ KB
None
-------------------------

The Number of unique
-------------------------
Product code:	 8
Category:	 5
-------------------------
The Product Code:

1 Product_0025
2 Product_0739
3 Product_0901
4 Product_1154
5 Product_1248
6 Product_1295
7 Product_1378
8 Product_2004


### Split the train and test set
- Input
     data: dataframe with dates and Demand data
     
- output
    - train:  2012-01-01 ~ 2015-12-31 
    - test :  2016-01-01 ~ 2017-01-06 

In [201]:
# Split the data into train and test partitions
def split_data(df, split_date='2016-01-01'):
    """Split a dated frame into chronological train and test partitions.

    Args:
        df: DataFrame with a 'Date' column (datetime values or strings that
            ``pd.to_datetime`` can parse). It is not modified.
        split_date: First date that belongs to the test partition. Rows
            strictly before it go to train; rows on or after it go to test.

    Returns:
        tuple: ``(train_df, test_df)``, each sorted by 'Date' ascending with
        'Date' converted to datetime. Original index labels are kept.
    """
    # Convert on a copy: assigning back into `df` would silently mutate the
    # caller's frame (and write into a slice when a slice is passed in).
    dated = df.assign(Date=pd.to_datetime(df['Date']))

    train_df = dated[dated['Date'] < split_date].sort_values('Date', ascending=True)
    test_df = dated[dated['Date'] >= split_date].sort_values('Date', ascending=True)

    return train_df, test_df

---

## EEMD
    * 시계열 그래프를 ensembled IMF (앙상블 내재모드 함수)로 분해
    * n 개의 eIMFs와  1개의 Residual 생성

In [202]:
# Decompose the demand curve into n ensemble intrinsic mode functions (IMFs)
def eemd_fit(df, trials, max_imf=-1):
    """Run EEMD on a demand series and return its components as one frame.

    Args:
        df: DataFrame with a 'Date' column and a demand column named 'y'.
        trials: Number of EMD trials passed to ``EEMD(trials=...)``.
        max_imf: Limit on the number of IMFs (-1 means no limit).

    Returns:
        tuple: ``(all_eIMFs_df, nIMFs)``.
        ``all_eIMFs_df`` has a leading 'Date' column, integer columns
        ``0 .. nIMFs-1`` holding the eIMFs, and column ``nIMFs`` holding the
        residue. ``nIMFs`` is the number of eIMFs, NOT counting the residue.
    """
    # Define the signal: time axis and demand values
    t = np.array(df['Date'])
    s = np.array(df['y'])

    eemd = EEMD(trials=trials)

    # Use the "parabol" extrema-detection method on the underlying EMD object
    eemd.EMD.extrema_detection = "parabol"

    # Decompose into eIMFs (rows = components, columns = time steps)
    eIMFs = eemd.eemd(s, t, max_imf=max_imf)
    nIMFs = eIMFs.shape[0]

    # Only the residue is needed here; the IMFs were already returned above
    _, residue = eemd.get_imfs_and_residue()

    # One column per eIMF, then the residue appended as the last column
    all_eIMFs_df = pd.DataFrame(eIMFs).transpose()
    all_eIMFs_df[nIMFs] = residue

    # Insert dates by position. `insert` aligns a Series on index labels, so a
    # `df` whose index is not 0..n-1 would otherwise produce NaT dates.
    all_eIMFs_df.insert(0, 'Date', df['Date'].reset_index(drop=True))

    return all_eIMFs_df, nIMFs

### eIMFs 데이터프레임 추출

In [203]:
# Pull each component out as its own (Date, y) frame and collect them in a dict
def extract_eIMFs(all_eIMFs_df, nIMFs):
    """Split the combined component frame into one frame per component.

    Args:
        all_eIMFs_df: Frame with a 'Date' column and integer-named columns
            ``0 .. nIMFs`` (the last one being the residue).
        nIMFs: Number of eIMFs, not counting the residue.

    Returns:
        dict: ``{'eIMFs_0': df0, ..., f'eIMFs_{nIMFs}': residue_df}`` where
        every value has exactly the columns 'Date' and 'y'.
    """
    def _component(idx):
        # Select the date column plus the idx-th component, then rename idx -> 'y'
        frame = all_eIMFs_df[['Date', idx]]
        frame.columns = ['Date', 'y']
        return frame

    # nIMFs eIMFs plus one residue column
    return {f'eIMFs_{idx}': _component(idx) for idx in range(nIMFs + 1)}

In [204]:
def EEMD_ARIMA(all_eIMFs_dict):
    """Fit one auto-selected ARIMA per component and forecast the test span.

    Each component frame is split with ``split_data``; an ARIMA order is
    searched on the training part and used to predict as many steps as the
    test part has rows.

    Args:
        all_eIMFs_dict: Mapping of component name to a DataFrame with 'Date'
            and 'y' columns.

    Returns:
        tuple: ``(model_dict, pred_dict)``.
        ``model_dict`` maps component name to the fitted ARIMA model.
        ``pred_dict`` maps component name to the test frame indexed by 'Date'
        with columns 'y', 'Pred', 'y_norm' and 'Pred_norm'.
    """
    model_dict = {}
    pred_dict = {}
    last_idx = len(all_eIMFs_dict) - 1

    for name, eIMF_df in all_eIMFs_dict.items():
        print(f'--------Total: 0~{last_idx} eIMFs, Now: {name} --------')

        train_df, test_df = split_data(eIMF_df)

        # auto_arima returns the best model already fitted on the training
        # series, so fitting it a second time is unnecessary.
        best_model = auto_arima(train_df['y'],
                                start_p=0, start_q=0,
                                max_p=5, max_q=5,
                                max_d=2,
                                trace=False,
                                suppress_warnings=True)
        model_dict[name] = best_model

        predictions = best_model.predict(n_periods=len(test_df))
        res_df = test_df.copy()
        # Assign by position: `predict` may return a Series whose index does
        # not match the test frame's labels, which would yield NaN on alignment.
        res_df['Pred'] = np.asarray(predictions)

        # Min-max normalise 'y' and 'Pred'.
        # NOTE(review): the scaler is fitted on both columns at once, so each
        # column is scaled by its OWN min/max; the normalised actual and
        # prediction are therefore not on a shared scale. Confirm intent.
        scaler = MinMaxScaler()
        res_df[['y_norm', 'Pred_norm']] = scaler.fit_transform(res_df[['y', 'Pred']])
        res_df.set_index('Date', inplace=True)
        pred_dict[name] = res_df

    return model_dict, pred_dict

## Save and Load the model 

In [205]:
def save_model(product_code, model_dict):
    """Pickle the fitted models of one product under the result folder.

    The file is written to ``Result/EEMD+ARIMA_Result/Model`` and named
    ``{product_code}_{MMDD}.pkl`` using today's month and day. The name has no
    year, so a run on the same calendar day overwrites the earlier file.

    Args:
        product_code: Product identifier used as the file-name prefix.
        model_dict: Object to pickle (mapping of component name to model).

    Returns:
        The same ``model_dict`` that was passed in.
    """
    today = date.today()
    folder_path = 'Result/EEMD+ARIMA_Result/Model'
    file_name = f'{product_code}_{today.month:02d}{today.day:02d}.pkl'
    save_path = os.path.join(folder_path, file_name)

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(folder_path, exist_ok=True)

    # Serialise the object to the pickle file
    with open(save_path, 'wb') as f:
        pickle.dump(model_dict, f)
    return model_dict

In [206]:
def load_model(file_name):
    """Unpickle a saved model dictionary from the model result folder.

    Args:
        file_name: Name of the pickle file inside
            ``Result/EEMD+ARIMA_Result/Model``.

    Returns:
        The object stored in the pickle file.
    """
    # NOTE: unpickling executes code from the file; only load files you trust.
    with open(f'Result/EEMD+ARIMA_Result/Model/{file_name}', 'rb') as handle:
        return pickle.load(handle)

## Plot the result

In [207]:
def actual_pred_plot(product_code, pred_dict, all_result_df, metric_df, normalize=False):
    """Plot actual vs. prediction for every component and save the figures.

    One PNG is written per entry of ``pred_dict`` plus one for the aggregated
    result, and ``metric_df`` is written as CSV, all under
    ``Result/EEMD+ARIMA_Result/{product_code}_{MMDD}`` (with a ``_normalized``
    suffix when ``normalize`` is true).

    Args:
        product_code: Product identifier used in folder and file names.
        pred_dict: Mapping of component name to a frame with 'y' and 'Pred'
            columns (and 'y_norm' / 'Pred_norm' when ``normalize`` is true).
            It is not modified.
        all_result_df: Aggregated result frame with the same columns.
        metric_df: Metric table saved next to the figures.
        normalize: Plot the normalised columns instead of the raw ones.
    """
    today = date.today()

    save_path = os.path.join("Result", "EEMD+ARIMA_Result", product_code + f'_{today.month:02d}{today.day:02d}')
    if normalize:
        save_path += "_normalized"
    # Create the output folder once, before any figure is written
    os.makedirs(save_path, exist_ok=True)

    actual_col, pred_col = ('y_norm', 'Pred_norm') if normalize else ('y', 'Pred')
    n_components = len(pred_dict)

    # Plot from a local list instead of temporarily inserting 'all_result'
    # into the caller's dict, which left it polluted if plotting failed.
    frames = list(pred_dict.values()) + [all_result_df]

    for i, pred_df in enumerate(frames):
        if i == n_components:  # the aggregated result comes last
            title = f"{product_code}-All Result"
            save_name = f'{product_code}_all_result'
        else:
            title = f"Pred Actual Plot - ({i+1}/{n_components})'s eIMF"
            save_name = f'{product_code}_eIMF_{i+1}'
        if normalize:
            title += "(Normalized)"

        fig = plt.figure(figsize=(16, 8))
        plt.title(title, fontsize=20)
        plt.xlabel("Time", fontsize=14)
        plt.ylabel("Order Demand", fontsize=14)
        plt.plot(pred_df[actual_col], label='Actual', alpha=0.6)
        plt.plot(pred_df[pred_col], label='Prediction', alpha=0.8)
        plt.legend(loc="upper right")

        plt.savefig(os.path.join(save_path, save_name + '.png'))
        # Close each figure as soon as it is saved so they do not accumulate
        plt.close(fig)

    metric_df.to_csv(os.path.join(save_path, f'{product_code}_Metric.csv'))

## Metrics

In [208]:
# Model metric: mean absolute scaled error
def mase(training_series, testing_series, prediction_series):
    """Return the mean absolute error scaled by the training one-step change.

    Args:
        training_series: Array of training values; its mean absolute
            first difference is the scaling denominator.
        testing_series: Array of actual values for the forecast span.
        prediction_series: Array of predicted values, same shape as
            ``testing_series``.

    Returns:
        Mean absolute forecast error divided by the training scale.
    """
    abs_steps = np.abs(np.diff(training_series))
    scale = abs_steps.sum() / (training_series.shape[0] - 1)

    mean_abs_error = np.abs(testing_series - prediction_series).mean()
    return mean_abs_error / scale

# Model metric: shifted mean absolute percentage error
def mape(actual, pred):
    """Return the mean of ``|actual - pred| / |actual + 1|`` as a fraction.

    The denominator is shifted by one, so an actual value of zero does not
    divide by zero (an actual value of -1 does).

    Args:
        actual: Sequence of actual values.
        pred: Sequence of predicted values, same length as ``actual``.

    Returns:
        The mean shifted relative error (not multiplied by 100).
    """
    actual_arr = np.array(actual)
    pred_arr = np.array(pred)
    relative_error = (actual_arr - pred_arr) / (actual_arr + 1)
    return np.mean(np.abs(relative_error))

def nrmse(y_true, y_pred):
    """Return the RMSE divided by the mean of the true values.

    Because the denominator is ``np.mean(y_true)``, the result is unbounded
    when that mean is close to zero and negative when it is negative.

    Args:
        y_true: Actual values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        RMSE normalised by the mean of ``y_true``.
    """
    # Take the root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse / np.mean(y_true)

In [209]:
def calculate_metrics(pred_df, normalize):
    """Compute MAPE, RMSE, MAE and NRMSE for one prediction frame.

    Args:
        pred_df: Frame with 'y' and 'Pred' columns, plus 'y_norm' and
            'Pred_norm' when ``normalize`` is true.
        normalize: Use the normalised columns instead of the raw ones.

    Returns:
        A single-row DataFrame with columns 'MAPE', 'RMSE', 'MAE', 'NRMSE',
        each rounded to four decimals.
    """
    actual_col, pred_col = ('y_norm', 'Pred_norm') if normalize else ('y', 'Pred')
    actual = pred_df[actual_col]
    pred = pred_df[pred_col]

    scores = {
        'MAPE': mape(actual, pred),
        'RMSE': mean_squared_error(actual, pred) ** 0.5,
        'MAE': mean_absolute_error(actual, pred),
        'NRMSE': nrmse(actual, pred),
    }

    # Build the row directly; concatenating onto an empty placeholder frame
    # is a deprecated pandas pattern and adds nothing here.
    return pd.DataFrame(
        {name: [round(value, 4)] for name, value in scores.items()},
        columns=['MAPE', 'RMSE', 'MAE', 'NRMSE'],
    )

---

## Check the Result

In [210]:
def make_metric_df(product_code, pred_dict, all_result_df, normalize):
    """Assemble the metric table for every component plus the aggregate.

    Args:
        product_code: Unused here; kept so existing callers keep working.
        pred_dict: Mapping of component name to its prediction frame.
        all_result_df: Aggregated prediction frame.
        normalize: Forwarded to ``calculate_metrics``.

    Returns:
        DataFrame with one row per entry of ``pred_dict`` labelled
        'eIMF_1' .. 'eIMF_n' in iteration order, followed by a final row
        labelled 'All' for ``all_result_df``.
    """
    # Collect all rows first and concatenate once, rather than growing an
    # empty frame inside the loop.
    rows = [calculate_metrics(pred_df, normalize=normalize) for pred_df in pred_dict.values()]
    rows.append(calculate_metrics(all_result_df, normalize=normalize))

    metric_df = pd.concat(rows, axis=0)
    # Label every row explicitly; the last row is the aggregated result
    metric_df.index = [f'eIMF_{k + 1}' for k in range(len(pred_dict))] + ['All']

    return metric_df

In [211]:
def make_all_result_df(pred_dict):
    """Sum the per-component actuals and predictions into one result frame.

    Args:
        pred_dict: Mapping of component name to a frame with 'Pred' and 'y'
            columns; frames are aligned on their index when summed.

    Returns:
        DataFrame with columns 'Pred' (negative sums replaced by 0), 'y',
        'Pred_norm' and 'y_norm'.
    """
    frames = list(pred_dict.values())

    # Select each column from every frame explicitly. Picking 'Pred' out of
    # one wide concatenated frame returns a Series when only one component
    # exists, and `.sum(axis=1)` then fails.
    pred_total = pd.concat([frame['Pred'] for frame in frames], axis=1).sum(axis=1)
    actual_total = pd.concat([frame['y'] for frame in frames], axis=1).sum(axis=1)

    all_result_df = pd.DataFrame({'Pred': pred_total, 'y': actual_total})
    # Replace negative predictions with 0
    all_result_df.loc[all_result_df['Pred'] < 0, 'Pred'] = 0

    # Min-max normalise both columns.
    # NOTE(review): each column is scaled by its OWN min/max, so the two
    # normalised series do not share a scale. Confirm this is intended.
    scaler = MinMaxScaler()
    normalized_data = scaler.fit_transform(all_result_df[['Pred', 'y']])

    all_result_df['Pred_norm'] = normalized_data[:, 0]
    all_result_df['y_norm'] = normalized_data[:, 1]
    return all_result_df

In [212]:
def execute_EEMD_ARIMA(product_code, eemd_trials=100):
    """Run the full EEMD + ARIMA pipeline for one product and save the outputs.

    Reads the module-level ``df``, decomposes the product's demand with EEMD,
    fits one ARIMA per component, aggregates the forecasts, pickles the
    models, and writes plots and metric CSVs for both the raw and the
    normalised values. The elapsed time is printed in minutes.

    Args:
        product_code: Value of 'Product_Code' to forecast.
        eemd_trials: Number of EEMD trials.

    Returns:
        The metric DataFrame computed on the raw (non-normalised) values.

    Raises:
        ValueError: If ``df`` has no rows for ``product_code``.
    """
    start_time = time.time()

    product_df = df[df['Product_Code'] == product_code].reset_index(drop=True)
    if product_df.empty:
        # Fail early with a clear message instead of deep inside the decomposition
        raise ValueError(f"No rows found for product code {product_code!r}")
    product_df.rename(columns={'Order_Demand': 'y'}, inplace=True)

    # Run EEMD
    all_eIMFs_df, nIMFs = eemd_fit(product_df, eemd_trials)
    # Extract one frame per component from the EEMD result
    all_eIMFs_dict = extract_eIMFs(all_eIMFs_df, nIMFs)

    # Fit and forecast each component with ARIMA, then aggregate
    model_dict, pred_dict = EEMD_ARIMA(all_eIMFs_dict)
    all_result_df = make_all_result_df(pred_dict)

    # Save the models
    save_model(product_code, model_dict)
    metric_df_norm = make_metric_df(product_code, pred_dict, all_result_df, True)
    metric_df = make_metric_df(product_code, pred_dict, all_result_df, False)

    actual_pred_plot(product_code, pred_dict, all_result_df, metric_df_norm, True)
    actual_pred_plot(product_code, pred_dict, all_result_df, metric_df, False)

    elapsed_time_seconds = time.time() - start_time
    elapsed_time_minutes = elapsed_time_seconds / 60
    print("실행 시간: {:.2f} 분".format(elapsed_time_minutes))

    return metric_df

---

## Whole Process
    - product_code에 str으로 예측하고자 하는 코드를 입력
    - ['Product_0025', 'Product_0739', 'Product_0901', 'Product_1154',
       'Product_1248', 'Product_1295', 'Product_1378', 'Product_2004']

In [213]:
# Run the whole pipeline for each selected product, printing a banner before each run
banner = "=================================="
target_codes = ['Product_1248', 'Product_1295', 'Product_1378', 'Product_2004']
for code in target_codes:
    print(banner)
    print(f"========== { code } ==========")
    print(banner)
    execute_EEMD_ARIMA(code, eemd_trials=100)

--------Total: 0~10 eIMFs, Now: eIMFs_0 --------
--------Total: 0~10 eIMFs, Now: eIMFs_1 --------
--------Total: 0~10 eIMFs, Now: eIMFs_2 --------
--------Total: 0~10 eIMFs, Now: eIMFs_3 --------
--------Total: 0~10 eIMFs, Now: eIMFs_4 --------
--------Total: 0~10 eIMFs, Now: eIMFs_5 --------
--------Total: 0~10 eIMFs, Now: eIMFs_6 --------
--------Total: 0~10 eIMFs, Now: eIMFs_7 --------
--------Total: 0~10 eIMFs, Now: eIMFs_8 --------
--------Total: 0~10 eIMFs, Now: eIMFs_9 --------
--------Total: 0~10 eIMFs, Now: eIMFs_10 --------
실행 시간: 1.78 분
--------Total: 0~10 eIMFs, Now: eIMFs_0 --------
--------Total: 0~10 eIMFs, Now: eIMFs_1 --------
--------Total: 0~10 eIMFs, Now: eIMFs_2 --------
--------Total: 0~10 eIMFs, Now: eIMFs_3 --------
--------Total: 0~10 eIMFs, Now: eIMFs_4 --------
--------Total: 0~10 eIMFs, Now: eIMFs_5 --------
--------Total: 0~10 eIMFs, Now: eIMFs_6 --------
--------Total: 0~10 eIMFs, Now: eIMFs_7 --------
--------Total: 0~10 eIMFs, Now: eIMFs_8 --------
-----

### Load model
    - 추후, 모델 결과를 다시 확인 할 일 있을 때, Model 파일 안에 있는 pickle 파일 로드

In [180]:
# Reload a previously pickled model dictionary from the Model folder by file name
load_model('Product_0739_0503.pkl')

{'eIMFs_0': ARIMA(order=(0, 0, 2), scoring_args={}, seasonal_order=(2, 0, 0, 3),
 'eIMFs_1': ARIMA(order=(2, 0, 2), scoring_args={}, seasonal_order=(2, 0, 0, 3),
 'eIMFs_2': ARIMA(order=(2, 0, 2), scoring_args={}, seasonal_order=(2, 0, 1, 3),
 'eIMFs_3': ARIMA(order=(2, 0, 2), scoring_args={}, seasonal_order=(2, 0, 1, 3),
 'eIMFs_4': ARIMA(order=(2, 0, 2), scoring_args={}, seasonal_order=(1, 0, 1, 3),
 'eIMFs_5': ARIMA(order=(0, 0, 0), scoring_args={}, seasonal_order=(0, 0, 0, 3),
 'eIMFs_6': ARIMA(order=(0, 2, 0), scoring_args={}, seasonal_order=(2, 0, 1, 3),
 'eIMFs_7': ARIMA(order=(1, 2, 1), scoring_args={}, seasonal_order=(0, 0, 0, 3),
 'eIMFs_8': ARIMA(order=(0, 2, 0), scoring_args={}, seasonal_order=(0, 0, 0, 3),
 'eIMFs_9': ARIMA(order=(0, 0, 0), scoring_args={}, seasonal_order=(1, 0, 1, 3),