# Prophet Single Model

## Data Description

    - Raw data: Historical Product Demand.csv

    - Input data: Demand records of 8 selected representative items, augmented 8x

    - Product code: 'Product_0025', 'Product_0739', 'Product_0901', 'Product_1154',
                    'Product_1248', 'Product_1295', 'Product_1378', 'Product_2004'
            

    - Size of Data: 116392 rows × 4 columns

    - Features: Date, Product_Code, Product_Category, Order_Demand

    - Period: 2012-01-01 ~ 2017-01-09

---

In [1]:
# DataFrame
import pandas as pd
import numpy as np
import random
import time
from datetime import datetime, date

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
import itertools

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Save the log
import os
import pickle

# Prophet 
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

# Metric
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score

Importing plotly failed. Interactive plots will not work.


## Data Explore

In [2]:
# Load the raw CSV and keep only the rows of store 1.
# The path is built with os.path.join so the cell also runs on non-Windows systems.
df = pd.read_csv(os.path.join('Data', 'train.csv'))
df = df[df['store'] == 1].reset_index(drop=True)

# Rename to the column names the rest of the notebook expects and keep only those columns.
# .copy() makes the subset an independent frame so the assignments below do not
# trigger pandas' chained-assignment warning.
df = df.rename(columns={'date': 'Date', 'item': 'Product_Code', 'sales': 'Order_Demand'})
df = df[['Date', 'Product_Code', 'Order_Demand']].copy()

# Parse dates and store product codes as strings (codes are compared against str values later).
df['Date'] = pd.to_datetime(df['Date'])
df['Product_Code'] = df['Product_Code'].astype('str')

In [4]:
# # Data Loading
# df = pd.read_csv('Data\HPD_0416.csv')
# # convert the string to the datetype
# df['Date'] = pd.to_datetime(df['Date'])
# # Unify the date range
# start_date = pd.to_datetime('2012-01-10')
# end_date = pd.to_datetime('2016-12-21')

# df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
# df = df.reset_index(drop=True)

In [7]:
# Print a structural summary of df followed by the unique product codes.
# df.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
df.info()
print('-------------------------')
print("")
print("The Number of unique")
print('-------------------------')
print('Product code:\t', df['Product_Code'].nunique())
# The category column is only present in some input files; the frame built
# from train.csv keeps just Date / Product_Code / Order_Demand.
if 'Product_Category' in df.columns:
    print('Category:\t', df['Product_Category'].nunique())
print('-------------------------')
print("The Product Code:")
print("")
for i, code in enumerate(df['Product_Code'].unique()):
    print(i+1, code)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14464 entries, 0 to 14463
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              14464 non-null  datetime64[ns]
 1   Product_Code      14464 non-null  object        
 2   Product_Category  14464 non-null  object        
 3   Order_Demand      14464 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 452.1+ KB
None
-------------------------

The Number of unique
-------------------------
Product code:	 8
Category:	 5
-------------------------
The Product Code:

1 Product_0025
2 Product_0739
3 Product_0901
4 Product_1154
5 Product_1248
6 Product_1295
7 Product_1378
8 Product_2004


---

### Split the train and test set
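- Input
    - data: DataFrame with a date column (`ds`) and the demand values (`y`)

- Output
    - train: all remaining rows (everything before valid and test)
    - valid: the `forecast_period` rows immediately before the test set
    - test : the last `forecast_period` rows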

In [3]:
def split_data(df, forecast_period):
    """Split a frame into train / validation / test parts by row position.

    The frame is sorted by its ``ds`` column. The last ``forecast_period``
    rows become the test set, the ``forecast_period`` rows before them the
    validation set, and all earlier rows the training set.

    Args:
        df: DataFrame containing a ``ds`` column to sort by.
        forecast_period: Number of rows in each of the validation and test
            sets. ``0`` yields empty validation/test sets and the full frame
            as training data.

    Returns:
        Tuple ``(train_df, valid_df, test_df)``.

    Raises:
        ValueError: If ``forecast_period`` is negative.
    """
    if forecast_period < 0:
        raise ValueError("forecast_period must be non-negative")

    df = df.sort_values('ds')

    # Use explicit start positions instead of negative slices: ``df[-0:]``
    # would select the whole frame rather than nothing.
    n_rows = len(df)
    test_start = max(n_rows - forecast_period, 0)
    valid_start = max(n_rows - 2 * forecast_period, 0)

    train_df = df.iloc[:valid_start]
    valid_df = df.iloc[valid_start:test_start]
    test_df = df.iloc[test_start:]

    return train_df, valid_df, test_df

# Prophet

### Parameter optimization
    - Random Search

In [4]:
def optimize_prophet(product_df, valid_df, optimize_trials, forecast_period):
    """Choose Prophet hyper-parameters by random search scored on the validation window.

    A random subset of a fixed parameter grid is evaluated. Each candidate is
    scored with a single-cutoff ``cross_validation`` run and the candidate
    with the lowest RMSE is returned.

    Args:
        product_df: Full history with ``ds`` and ``y`` columns; it must contain
            the rows of ``valid_df`` and at least one earlier row.
        valid_df: Validation rows; only the earliest ``ds`` is used, to locate
            the cutoff.
        optimize_trials: Number of parameter combinations to try. Values above
            the grid size (120) are capped to the grid size.
        forecast_period: Evaluation horizon in days.

    Returns:
        dict of keyword arguments for ``Prophet(...)``.

    Raises:
        ValueError: If ``optimize_trials`` is smaller than 1 or no observation
            precedes the validation window.
    """
    if optimize_trials < 1:
        raise ValueError("optimize_trials must be at least 1")

    # Candidate values for each hyper-parameter.
    param_grid = {
        'changepoint_prior_scale': [0.01, 0.1, 1, 10],
        'seasonality_prior_scale': [0.01, 0.1, 1, 10, 100],
        'seasonality_mode': ['additive', 'multiplicative'],
        'changepoint_range': [0.8, 0.9, 1.0],
    }

    # Every combination of the grid values.
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

    # The cutoff is the last observation BEFORE the validation window, so the
    # horizon that follows it covers the validation rows. Using the end of the
    # validation window instead would score the candidates on the test rows.
    validation_start = pd.to_datetime(valid_df['ds'].min())
    history_dates = pd.to_datetime(product_df['ds'])
    cutoff = history_dates[history_dates < validation_start].max()
    if pd.isna(cutoff):
        raise ValueError("No observations precede the validation window; cannot tune parameters")

    # Random sampling without replacement; never ask for more than the grid holds.
    n_trials = min(optimize_trials, len(all_params))
    selected_params = random.sample(all_params, n_trials)

    horizon = str(forecast_period) + ' days'
    rmses = []
    for params in selected_params:
        m = Prophet(**params)
        m.add_country_holidays(country_name='US')  # add US public holidays
        # cross_validation needs a fitted model; it evaluates at the given cutoff.
        m.fit(product_df)
        df_cv = cross_validation(m, cutoffs=[cutoff], horizon=horizon)
        # rolling_window=1 aggregates the metric over the whole horizon into one row.
        df_p = performance_metrics(df_cv, rolling_window=1)
        rmses.append(df_p['rmse'].values[0])

    best_params = selected_params[int(np.argmin(rmses))]
    return best_params

In [5]:
def Prophet_single(product_df, optimize_trials, forecast_period):
    """Tune, fit and evaluate a single Prophet model for one product.

    Args:
        product_df: History with ``ds`` (date) and ``y`` (demand) columns.
        optimize_trials: Number of random-search trials for the parameter search.
        forecast_period: Number of rows in each of the validation and test sets.

    Returns:
        Tuple ``(best_model, res_df)``. ``res_df`` is indexed by ``ds`` and has
        the columns ``y``, ``Pred``, ``y_norm`` and ``Pred_norm`` for the test rows.
    """
    train_df, valid_df, test_df = split_data(product_df, forecast_period)

    # Hyper-parameter search.
    best_params = optimize_prophet(product_df, valid_df, optimize_trials, forecast_period)

    # Fit the final model with the best parameters.
    # NOTE(review): the model is fitted on train_df only, so the test forecast
    # also has to bridge the validation window — confirm this is intended.
    best_model = Prophet(**best_params)
    best_model.add_country_holidays(country_name='US')
    best_model.fit(train_df)

    # Forecast from the end of the training data through the last test date.
    periods = (test_df['ds'].max() - train_df['ds'].max()).days
    future = best_model.make_future_dataframe(periods=periods)
    forecast = best_model.predict(future)

    # Keep actual and predicted values for the test dates only.
    res_df = pd.merge(test_df, forecast[['ds', 'yhat']], on='ds')
    res_df = res_df[['ds', 'y', 'yhat']].rename(columns={'yhat': 'Pred'})

    # Negative predictions are replaced with 0.
    res_df['Pred'] = res_df['Pred'].clip(lower=0)

    # Normalize both series with ONE scaler fitted on the actual values.
    # Scaling each column with its own min/max would put actuals and
    # predictions on different scales and hide level errors in the normalized
    # metrics. Pred_norm may therefore fall outside [0, 1].
    scaler = MinMaxScaler()
    scaler.fit(res_df[['y']].to_numpy())
    res_df['y_norm'] = scaler.transform(res_df[['y']].to_numpy()).ravel()
    res_df['Pred_norm'] = scaler.transform(res_df[['Pred']].to_numpy()).ravel()
    res_df = res_df.set_index('ds')

    return best_model, res_df

## Plot the result

In [8]:
def actual_pred_plot(product_code, res_df, metric_df, normalize):
    """Plot actual vs. predicted demand and save the figure and metrics to disk.

    Files are written to ``Result/Single_Prophet_Result/<product_code>/`` as
    ``<product_code>_all_result[_normalized].png`` and ``.csv``.

    Args:
        product_code: Product identifier, used in the title and file names.
        res_df: Frame with columns ``y`` and ``Pred`` (and ``y_norm`` /
            ``Pred_norm`` when ``normalize`` is true).
        metric_df: Metrics frame saved next to the figure as CSV.
        normalize: If true, plot the normalized columns and add a
            ``_normalized`` suffix to the file names.
    """
    save_path = os.path.join("Result", "Single_Prophet_Result", product_code)
    save_name = f'{product_code}_all_result'
    title = f"Pred Actual Plot - {product_code}"

    if normalize:
        title += "(Normalized)"
        save_name += "_normalized"
        actual = res_df['y_norm']
        pred = res_df['Pred_norm']
    else:
        actual = res_df['y']
        pred = res_df['Pred']

    # Create the output directory up front; exist_ok avoids the check-then-create race.
    os.makedirs(save_path, exist_ok=True)

    plt.figure(figsize=(16, 8))
    try:
        plt.title(title, fontsize=20)
        plt.xlabel("Time", fontsize=14)
        plt.ylabel("Order Demand", fontsize=14)
        plt.plot(actual, label='Actual', marker='o', ms=3)
        plt.plot(pred, label='Prediction', marker='o', ms=3)
        plt.legend(loc="upper right")

        # Save the figure and, alongside it, the metrics.
        plt.savefig(os.path.join(save_path, save_name + '.png'))
        metric_df.to_csv(os.path.join(save_path, save_name + '.csv'))
    finally:
        # Close all figures to free memory, even when saving fails.
        plt.close('all')

## Save & Load the model

In [9]:
def save_model(product_code, best_model):
    """Pickle ``best_model`` into the model result folder and return it.

    The file is written to ``Result/Single_Prophet_Result/Model`` and named
    ``<product_code>_<MMDD>.pkl`` using today's month and day. The folder is
    created when it does not exist yet.
    """
    model_dir = 'Result/Single_Prophet_Result/Model'
    stamp = date.today().strftime('%m%d')
    target = os.path.join(model_dir, f'{product_code}_{stamp}.pkl')

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Serialize the object to the pickle file.
    with open(target, 'wb') as handle:
        pickle.dump(best_model, handle)

    return best_model

In [10]:
# Load a trained model back from its pickle file.
def load_model(file_name):
    """Return the object stored in ``Result/Single_Prophet_Result/Model/<file_name>``.

    Args:
        file_name: Name of the pickle file inside the model folder.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    file_path = f'Result/Single_Prophet_Result/Model/{file_name}'

    # NOTE: unpickling can execute arbitrary code — only load files this
    # notebook wrote itself, never files from an untrusted source.
    with open(file_path, 'rb') as file:
        # Return the loaded object directly instead of an undefined name.
        return pickle.load(file)

## Metrics

In [11]:
# Model Metric
def mase(training_series, testing_series, prediction_series):
    """Mean absolute scaled error.

    The mean absolute forecast error is divided by the mean absolute
    one-step difference of the training series.
    """
    step_changes = np.abs(np.diff(training_series))
    scale = step_changes.sum() / (training_series.shape[0] - 1)

    abs_forecast_error = np.abs(testing_series - prediction_series)
    return abs_forecast_error.mean() / scale

def mape(actual, pred):
    """Mean absolute percentage error with ``actual + 1`` as the denominator.

    The +1 shift keeps the ratio defined when an actual value is 0.
    """
    truth = np.asarray(actual)
    forecast = np.asarray(pred)
    ratio = (truth - forecast) / (truth + 1)
    return np.abs(ratio).mean()

# Normalized metric
def nrmse(y_true, y_pred):
    """Root mean squared error divided by the mean of ``y_true``.

    The square root is taken explicitly rather than through the ``squared``
    keyword of ``mean_squared_error``, which newer scikit-learn releases no
    longer accept.
    """
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    target_mean = np.mean(y_true)
    return rmse / target_mean

# Normalized metric
def nmae(y_true, y_pred):
    """Mean absolute error divided by the mean of ``y_true``."""
    abs_error = mean_absolute_error(y_true, y_pred)
    return abs_error / np.mean(y_true)

In [12]:
def calculate_metrics(product_code, res_df, normalize):
    """Build a one-row metrics frame for a product's forecast.

    Args:
        product_code: Used as the index label of the returned row.
        res_df: Frame with ``y`` / ``Pred`` and ``y_norm`` / ``Pred_norm`` columns.
        normalize: If true, the metrics are computed on the normalized
            columns; otherwise on the raw ones.

    Returns:
        DataFrame with the columns MAPE, RMSE, MAE, NRMSE, NMAE and R2, each
        rounded to 4 decimals.
    """
    # Pick the column pair that matches the normalization option.
    actual_col, pred_col = ('y_norm', 'Pred_norm') if normalize else ('y', 'Pred')
    actual = res_df[actual_col]
    pred = res_df[pred_col]

    # MASE and RMSLE are not computed at the moment.
    scores = {
        'MAPE': mape(actual, pred),
        'RMSE': mean_squared_error(actual, pred)**0.5,
        'MAE': mean_absolute_error(actual, pred),
        'NRMSE': nrmse(actual, pred),
        'NMAE': nmae(actual, pred),
        'R2': r2_score(actual, pred),
    }

    # One single-element column per metric, labelled with the product code.
    rounded = {name: [round(score, 4)] for name, score in scores.items()}
    return pd.DataFrame(rounded, index=[product_code])

---

## Main function

In [13]:
def execute_single_Prophet(product_code, optimize_trials=30, forecast_period=90):
    """Run the whole single-Prophet pipeline for one product code.

    The rows of the module-level ``df`` for ``product_code`` are used to tune
    and fit the model. The model is saved, then metrics and actual-vs-predicted
    plots are saved for both normalized and raw values.

    Args:
        product_code: Product code (str) to forecast.
        optimize_trials: Number of random-search trials for the parameter search.
        forecast_period: Number of rows in each of the validation and test sets.

    Returns:
        Metrics DataFrame computed on the raw (non-normalized) values.

    Raises:
        ValueError: If ``df`` has no rows for ``product_code``.
    """
    start_time = time.time()

    # Select this product's rows and rename the columns to Prophet's ds / y.
    # The non-inplace rename returns a new frame and leaves df untouched.
    product_df = (
        df.loc[df['Product_Code'] == product_code, ['Date', 'Order_Demand']]
        .reset_index(drop=True)
        .rename(columns={'Date': 'ds', 'Order_Demand': 'y'})
    )
    if product_df.empty:
        # Fail here with a clear message instead of deep inside the model fit.
        raise ValueError(f"No rows found for product code {product_code!r}")

    # Single Prophet model.
    best_model, res_df = Prophet_single(product_df, optimize_trials, forecast_period)
    save_model(product_code, best_model)

    # Save the metrics and the actual-vs-predicted plots.
    metric_df_norm = calculate_metrics(product_code, res_df, True)
    metric_df = calculate_metrics(product_code, res_df, False)

    actual_pred_plot(product_code, res_df, metric_df_norm, True)
    actual_pred_plot(product_code, res_df, metric_df, False)

    # Report the elapsed time in minutes.
    elapsed_time_seconds = time.time() - start_time
    elapsed_time_minutes = elapsed_time_seconds / 60
    print("실행 시간: {:.2f} 분".format(elapsed_time_minutes))
    return metric_df

---

## Whole Process
    - Pass the product code to forecast to `product_code` as a str
    - ['Product_0025', 'Product_0739', 'Product_0901', 'Product_1154',
       'Product_1248', 'Product_1295', 'Product_1378', 'Product_2004']

In [15]:
# Product codes to run the single-Prophet pipeline for. They are strings
# because df['Product_Code'] is cast to str when the data is loaded.
codes = ['1', '2', '3', '4', '5']

In [16]:
# codes = ['Product_0025', 'Product_0739', 'Product_0901', 'Product_1154',
#          'Product_1248', 'Product_1295', 'Product_1378', 'Product_2004']

In [17]:
# Run the full pipeline once per product code, printing a banner before each run.
for code in codes:
    print(
        "==================================",
        f"========== {code} ==========",
        "==================================",
        sep="\n",
    )
    execute_single_Prophet(code)



21:45:28 - cmdstanpy - INFO - Chain [1] start processing
21:45:29 - cmdstanpy - INFO - Chain [1] done processing


  0%|          | 0/1 [00:00<?, ?it/s]

21:45:29 - cmdstanpy - INFO - Chain [1] start processing
21:45:29 - cmdstanpy - INFO - Chain [1] done processing
21:45:30 - cmdstanpy - INFO - Chain [1] start processing
21:45:31 - cmdstanpy - INFO - Chain [1] done processing


  0%|          | 0/1 [00:00<?, ?it/s]

21:45:31 - cmdstanpy - INFO - Chain [1] start processing
21:45:32 - cmdstanpy - INFO - Chain [1] done processing
21:45:32 - cmdstanpy - INFO - Chain [1] start processing
21:45:33 - cmdstanpy - INFO - Chain [1] done processing


  0%|          | 0/1 [00:00<?, ?it/s]

21:45:34 - cmdstanpy - INFO - Chain [1] start processing
21:45:34 - cmdstanpy - INFO - Chain [1] done processing


KeyboardInterrupt: 