# ARIMA Single Model

## Data Description

    - Raw data: Historical Product Demand.csv

    - Input data: Data on 8x augmentation of demand records by selecting 8 representative items

    - Product code: 'Product_0025', 'Product_0739', 'Product_0901', 'Product_1154',
                    'Product_1248', 'Product_1295', 'Product_1378', 'Product_2004'
            

    - Size of Data: 116392 rows × 4 columns

    - Features: Date, Product_Code, Product_Category, Order_Demand

    - Period: 2012-01-01 ~ 2017-01-09
    

---

In [112]:
# DataFrame
import pandas as pd
import numpy as np
import random
from datetime import datetime, date

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore')

# Save the log
import os
import pickle
import time
from datetime import timedelta
# EEMD
from PyEMD import EEMD

# ARIMA
from pmdarima.arima import auto_arima

# Metric 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score

## Data Explore

In [2]:
# Data Loading
df = pd.read_csv('Data\HPD_Augmented_0416.csv')
# convert the string to the datetype
df['Date'] = pd.to_datetime(df['Date'])
df

Unnamed: 0,Date,Product_Code,Product_Category,Order_Demand
0,2012-01-05 00:00:00,Product_0025,Category_005,1600.000000
1,2012-01-05 03:00:00,Product_0025,Category_005,1633.403702
2,2012-01-05 06:00:00,Product_0025,Category_005,1628.665789
3,2012-01-05 09:00:00,Product_0025,Category_005,1587.586651
4,2012-01-05 12:00:00,Product_0025,Category_005,1513.949924
...,...,...,...,...
116387,2016-12-26 12:00:00,Product_2004,Category_005,1810.945746
116388,2016-12-26 15:00:00,Product_2004,Category_005,1626.979543
116389,2016-12-26 18:00:00,Product_2004,Category_005,1420.229634
116390,2016-12-26 21:00:00,Product_2004,Category_005,1206.795489


In [68]:
df = pd.read_csv('Data\\train.csv')
df = df[(df['store']==1)]

pd.to_datetime(df['date'].max()) - pd.to_datetime(df['date'].min())
df = df.reset_index(drop=True)
df.rename(columns={'date': 'Date', 'item':'Product_Code', 'sales':'Order_Demand'}, inplace=True)
df = df[['Date', 'Product_Code', 'Order_Demand']]
df['Date'] = pd.to_datetime(df['Date'])
df['Product_Code'] = df['Product_Code'].astype('str')

In [17]:
print(df.info())
print('-------------------------')
print("")
print("The Number of unique")
print('-------------------------')
print('Product code:\t', df.Product_Code.nunique())
print('Category:\t', df.Product_Category.nunique())
print('-------------------------')
print("The Product Code:")
print("")
for i, code in enumerate(df['Product_Code'].unique()):
    print(i+1, code)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91300 entries, 0 to 91299
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          91300 non-null  datetime64[ns]
 1   Product_Code  91300 non-null  object        
 2   Order_Demand  91300 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 2.1+ MB
None
-------------------------

The Number of unique
-------------------------
Product code:	 50


AttributeError: 'DataFrame' object has no attribute 'Product_Category'

---

## Split the train and test set
- Input
    - data: dataframe with dates and Demand data
     
- output
    - train:  2012-01-01 ~ 2015-12/31 
    - test :  2016-01-01 ~ 2017-01-06 

In [103]:
# train과 test로 데이터 split
def split_data(df):
    train_size = int(len(df) * 0.9)
    train_df = df[:train_size].copy()
    test_df = df[train_size:].copy()
    return train_df, test_df

## ARIMA

In [19]:
# '''
# auto_arima로 (p,d,q)의 최적값을 찾고,
# best_model로 예측 후 Result DataFrame을 Return
# '''
# def ARIMA_single(product_df):
#     product_df.rename(columns={'Order_Demand': 'y'}, inplace=True)
#     product_df = product_df[['Date', 'y']]

#     train_df, test_df = split_data(product_df)
#     # Search the propper (p,d,q)
#     best_model = auto_arima(train_df['y'], 
#                             start_p=0, start_q=0,
#                             max_p=5, max_q=5, 
#                             max_d=2, trace=True,
#                             suppress_warnings=True)
    
#     best_model_fit = best_model.fit(train_df['y'])
    
#     predictions = best_model_fit.predict(n_periods=len(test_df))
#     res_df = test_df.copy()
#     res_df['Pred'] = predictions
    
#     # 'y'와 'Pred' 열을 정규화
#     scaler = MinMaxScaler()
#     res_df[['y_norm', 'Pred_norm']] = scaler.fit_transform(res_df[['y', 'Pred']])
#     res_df.set_index('Date', inplace=True)
#     # 원 데이터와 비교
#     res_df = res_df.resample('D').first()
#     # 음수인 예측값 0으로 대치
#     res_df.loc[res_df['Pred']<0, 'Pred']=0
#     # res_df: ['y'','Pred','y_norm','Pred_norm'] index='Date'
#     return best_model, res_df

In [104]:
# Rolling Forecasting
def ARIMA_single(product_df):
    product_df.rename(columns={'Order_Demand': 'y'}, inplace=True)
    product_df = product_df[['Date', 'y']]
    
    train_df, test_df = split_data(product_df)

    predictions = []
    best_model = auto_arima(train_df['y'], 
                            start_p=0, start_q=0,
                            max_p=5, max_q=5, 
                            max_d=2, trace=True,
                            suppress_warnings=True)
    
    for test_date in test_df['Date']:
        test_date -= timedelta(days=1)
        # Use data up to the day before the test date to train the model
        train_until_test_df = product_df[product_df['Date'] < test_date]
    
        best_model_fit = best_model.fit(train_until_test_df['y'])
    
        # Predict the next day's value and add it to the predictions list
        prediction = best_model_fit.predict(n_periods=1).reset_index(drop=True)[0]
        predictions.append(max(0, prediction))  # replace negative predictions with 0

    # Create a DataFrame to hold the result
    res_df = test_df.copy()
    res_df['Pred'] = predictions
    # 'y'와 'Pred' 열을 정규화
    scaler = MinMaxScaler()
    res_df[['y_norm', 'Pred_norm']] = scaler.fit_transform(res_df[['y', 'Pred']])
    res_df.set_index('Date', inplace=True)
    # 음수인 예측값 0으로 대치
    res_df.loc[res_df['Pred']<0, 'Pred']=0
    # res_df: ['y'','Pred','y_norm','Pred_norm'] index='Date'
    return best_model, res_df

## Save & Load the model

In [105]:
def save_model(product_code, best_model):
    today = date.today()
    folder_path = 'Result/Single_ARIMA_Result/Model'
    file_name = f'{product_code}_{today.month:02d}{today.day:02d}.pkl'
    save_path = os.path.join(folder_path, file_name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    # 객체를 pickle 파일로 저장
    with open(save_path, 'wb') as f:
        pickle.dump(best_model, f)
    return best_model

In [106]:
# 학습된 모델 pickle파일에서 불러오기
def load_model(file_name):
    file_path = f'Result/Single_ARIMA_Result/Model/{file_name}'
    
    with open(file_path, 'rb') as file:
        model_dict= pickle.load(file)
    
    return best_model

## Plot the result

In [107]:
def actual_pred_plot(product_code, res_df, metric_df, normalize):
    today = date.today()
    """
    Plot the actual vs predition and save the figure in the given directory
    """
    
    save_path = os.path.join("Result", "Single_ARIMA_Result", product_code)
    save_name = f'{product_code}_all_result'
    
    title = f"Pred Actual Plot - {product_code}"
    actual = res_df['y']
    pred = res_df['Pred']
    # 정규화 된 경우 actual, pred 값 달라짐
    if normalize: 
        title += "(Normalized)"
        actual = res_df['y_norm']
        pred = res_df['Pred_norm']
        save_name += "_normalized"
    # Plot   
    plt.figure(figsize=(16, 8))
    plt.title(title, fontsize=20)
    plt.xlabel("Time", fontsize=14)
    plt.ylabel("Order Demand", fontsize=14)
    plt.plot(actual, label ='Actual', marker='o', ms=3)
    plt.plot(pred, label='Prediction', marker='o', ms=3)
    plt.legend(loc="upper right")
        
    # Plot 결과 저장
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # save the figure
    today_date = f'_{today.month:02d}{today.day:02d}'
    plt.savefig(os.path.join(save_path, save_name+'.png'))
    # 전체 결과에 대한 Metric도 함께 저장
    metric_df.to_csv(os.path.join(save_path, save_name+'.csv'))
        
    plt.close('all') # close all figures to free up memory

## Metrics

In [108]:
# Model Metric
def mase(training_series, testing_series, prediction_series):
    n = training_series.shape[0]
    d = np.abs(np.diff(training_series)).sum() / (n-1)
    
    errors = np.abs(testing_series - prediction_series)
    return errors.mean() / d

def mape(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.mean(np.abs((actual - pred) / (actual+1)))

# 정규화 된 지표
def nrmse(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred, squared=False)
    target_mean = np.mean(y_true)
    nrmse = mse / target_mean
    return nrmse

# 정규화 된 지표
def nmae(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    target_mean = np.mean(y_true)
    nmae = mae / target_mean
    return nmae

In [109]:
def calculate_metrics(product_code, res_df, normalize):
    # 정규화 옵션이 True인 경우 정규화된 데이터 사용, 그렇지 않으면 원래 데이터 사용
    if normalize:
        actual = res_df['y_norm']
        pred = res_df['Pred_norm']
    else:
        actual = res_df['y']
        pred = res_df['Pred']

    # 메트릭 계산
    # MASE = mase(np.array(train_series), np.array(actual), pred) 
    MAPE = mape(actual, pred) 
    RMSE = mean_squared_error(actual, pred)**0.5 
    MAE = mean_absolute_error(actual,pred) 
    NRMSE = nrmse(actual,pred) 
    NMAE = nmae(actual,pred) 
    R2 = r2_score(actual, pred)
    # RMSLE = mean_squared_log_error(actual, pred)**0.5 

    # 계산된 메트릭을 데이터프레임에 추가
    metric_df = pd.DataFrame({'MAPE':[round(MAPE, 4)],
                           'RMSE':[round(RMSE, 4)],
                           'MAE':[round(MAE, 4)],
                           'NRMSE':[round(NRMSE, 4)],
                           'NMAE':[round(NMAE, 4)],
                           'R2': [round(R2, 4)]},
                            index= [product_code])

    return metric_df

---

## Check the Result

In [110]:
def execute_single_ARIMA(product_code):
    start_time = time.time()
    
    product_code = product_code # 예측하고자 하는 코드 입력
    product_df = df[df['Product_Code']== product_code].reset_index(drop=True)

    # Prophet 단일 모델
    best_model, res_df = ARIMA_single(product_df) 
    save_model(product_code, best_model)
    # 모델 Metric과 Pred_Actual Plot 저장
    metric_df_norm = calculate_metrics(product_code, res_df, True)
    metric_df= calculate_metrics(product_code, res_df, False)
    
    actual_pred_plot(product_code, res_df, metric_df_norm, True)
    actual_pred_plot(product_code, res_df, metric_df, False)
    
    # 실행시간 확인
    elapsed_time_seconds = time.time() - start_time
    elapsed_time_minutes = elapsed_time_seconds / 60
    print("실행 시간: {:.2f} 분".format(elapsed_time_minutes))
    return metric_df

---

In [111]:
execute_single_ARIMA('4')

Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=11187.998, Time=0.03 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=10773.097, Time=0.09 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=10201.960, Time=0.14 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=11186.006, Time=0.02 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=10203.658, Time=0.24 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=10203.634, Time=0.27 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=10196.266, Time=0.79 sec
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=10198.261, Time=1.08 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=10198.261, Time=0.89 sec
 ARIMA(0,1,3)(0,0,0)[0] intercept   : AIC=10203.125, Time=0.34 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=10203.492, Time=0.32 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=10183.479, Time=1.49 sec
 ARIMA(3,1,3)(0,0,0)[0] intercept   : AIC=10183.955, Time=1.88 sec
 ARIMA(2,1,4)(0,0,0)[0] intercept   : AIC=10044.347, Time=2.34 sec
 ARIMA(1,1,4)(0,0,0

Unnamed: 0,MAPE,RMSE,MAE,NRMSE,NMAE,R2
4,0.2336,6.0456,4.9267,0.2642,0.2153,0.1738


## Whole Process
    - product_code에 str으로 예측하고자 하는 코드를 입력
    - ['Product_0025', 'Product_0739', 'Product_0901', 'Product_1154',
       'Product_1248', 'Product_1295', 'Product_1378', 'Product_2004']

In [99]:
for code in ['Product_0025', 'Product_0739', 'Product_0901', 'Product_1154',
             'Product_1248', 'Product_1295', 'Product_1378', 'Product_2004']:
    print("==================================")
    print(f"========== { code } ==========")
    print("==================================")
    execute_single_ARIMA(code)

실행 시간: 0.03 분
                  MAPE      RMSE       MAE   NRMSE
Product_0025  291.0962  965.0509  641.5268  1.9693
실행 시간: 0.11 분
                MAPE     RMSE      MAE   NRMSE
Product_0739  0.2932  46.9636  19.2149  2.4441
실행 시간: 0.49 분
                 MAPE     RMSE      MAE   NRMSE
Product_0901  12.5931  89.6329  45.8805  2.2913
실행 시간: 0.02 분
                  MAPE       RMSE        MAE   NRMSE
Product_1154  944.7914  2806.3135  1845.0757  2.2603
실행 시간: 0.02 분
                    MAPE         RMSE          MAE   NRMSE
Product_1248  55738.2977  233395.9957  167441.9966  1.5898
실행 시간: 0.58 분
                   MAPE        RMSE        MAE   NRMSE
Product_1295  4108.0797  68438.9556  44018.159  0.9371
실행 시간: 0.11 분
                    MAPE        RMSE         MAE   NRMSE
Product_1378  16514.9994  49624.6385  40610.7466  1.0371
실행 시간: 0.02 분
                  MAPE       RMSE       MAE   NRMSE
Product_2004  514.0343  1319.0387  978.7391  1.8652


In [None]:
#load_model('Product_0739_0503.pkl')