In [9]:
import sys

sys.path.append(f"/home/npanj/personal_works/m5-forecasting-accuracy")

from src.utils.import_downcasting import import_downcasting
from src.utils.plotting import PdfFile
import os
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import copy

root = os.path.join(
    "/home/npanj/personal_works/m5-forecasting-accuracy/assets/data/data_CA_1"
)

In [8]:
def get_wmape(data: pd.DataFrame, resid_column: str) -> pd.DataFrame:
    """
    Function to get WMAPE
    """

    data["abs_residual"] = data[resid_column].abs()
    sum_abs_residual = (
        data[["store_id", "item_id", "abs_residual"]]
        .groupby(["store_id", "item_id"])
        .abs_residual.sum()
        .reset_index()
    )
    sum_actual = (
        data[["store_id", "item_id", "sales"]]
        .groupby(["store_id", "item_id"])
        .sales.sum()
        .reset_index()
    )
    sum_abs_residual.rename(columns={"abs_residual": "sum_abs_residual"}, inplace=True)
    wmape_df = pd.merge(
        left=sum_abs_residual, right=sum_actual, on=["store_id", "item_id"], how="left"
    )
    wmape_df["wmape"] = wmape_df["sum_abs_residual"] / wmape_df["sales"]

    return wmape_df


def plotting_sales_forecast(
    data: pd.DataFrame, score: pd.DataFrame, file_name: str, product_cat: str = None
):
    """
    Function to plot prediction and actual
    """

    pd.options.mode.chained_assignment = None
    data_df = data.copy()
    score_df = score.copy()

    if os.path.exists(file_name):
        os.remove(file_name)

    if product_cat is not None:
        data_df = data_df.loc[data_df.cat_id == product_cat]
        score_df = score_df[score_df.item_id.isin(data_df.item_id.tolist())]
        score_df.sort_values(by="wmape", ascending=False, inplace=True)

    item_ids = list(score_df.item_id.unique())

    pdf = PdfFile(file_name)
    for item_id in item_ids:
        df = data_df.loc[data_df.item_id == item_id]
        eval_df = score_df.loc[score_df.item_id == item_id]

        store_id = str(df.store_id.unique()[0])
        item_id = str(df.item_id.unique()[0])
        wmape = str(eval_df.wmape.unique()[0])

        title = f"graph of store_id:{store_id} on item_id:{item_id} with WMAPE:{wmape}"
        pdf.save_fig(df, title)

    pdf.close()


def get_wmape_custom_groupby(data: pd.DataFrame, resid_column: str, groupby_keys: list):
    """
    Function to get WMAPE wigh customize in groupby key
    """

    data["abs_residual"] = data[resid_column].abs()
    residual_columns = copy.deepcopy(groupby_keys)
    residual_columns.append("abs_residual")

    actual_columns = copy.deepcopy(groupby_keys)
    actual_columns.append("sales")

    sum_abs_residual = (
        data[residual_columns].groupby(groupby_keys).abs_residual.sum().reset_index()
    )
    sum_actual = data[actual_columns].groupby(groupby_keys).sales.sum().reset_index()
    sum_abs_residual.rename(columns={"abs_residual": "sum_abs_residual"}, inplace=True)
    wmape_df = pd.merge(
        left=sum_abs_residual, right=sum_actual, on=groupby_keys, how="left"
    )
    wmape_df["wmape"] = wmape_df["sum_abs_residual"] / wmape_df["sales"]

    return wmape_df

In [3]:
data_ca1 = import_downcasting(os.path.join(root, "data_with_arima_resid.csv"))
score_ca1 = import_downcasting(os.path.join(root, "arima_model_score.csv"))

In [4]:
print("data ca1 list columns")
print(data_ca1.info(verbose=True))

print("--------------------------------------------------")
print("data ca1 score list columns")
print(score_ca1.info())

data_ca1 = data_ca1.set_index("date")
data_ca1["event_name_1"] = (
    data_ca1["event_name_1"].cat.add_categories("none").fillna("none")
)
data_ca1["event_type_1"] = (
    data_ca1["event_type_1"].cat.add_categories("none").fillna("none")
)

data ca1 list columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5918109 entries, 0 to 5918108
Data columns (total 25 columns):
 #   Column          Dtype         
---  ------          -----         
 0   date            datetime64[ns]
 1   id              category      
 2   item_id         category      
 3   dept_id         category      
 4   cat_id          category      
 5   store_id        category      
 6   state_id        category      
 7   date_code       category      
 8   sales           int16         
 9   wm_yr_wk        int16         
 10  weekday         category      
 11  wday            int8          
 12  month           int8          
 13  year            int16         
 14  d               category      
 15  event_name_1    category      
 16  event_type_1    category      
 17  event_name_2    category      
 18  event_type_2    category      
 19  snap_CA         int8          
 20  snap_TX         int8          
 21  snap_WI         int8          
 

In [5]:
wmape = get_wmape(data_ca1, "arima_residual")
score_ca1 = pd.merge(
    left=score_ca1,
    right=wmape[["store_id", "item_id", "wmape"]],
    on=["store_id", "item_id"],
    how="left",
)

score_ca1.sort_values(by="wmape", ascending=False, inplace=True)
score_ca1["rank_pct_wmape"] = score_ca1["wmape"].rank(pct=True)
score_ca1 = score_ca1.reset_index(drop=True)

data_ca1["root_square_resid"] = data_ca1["arima_residual"].apply(
    lambda x: math.pow(x, 2)
)

In [6]:
item_id_cat_unq = data_ca1.groupby(["item_id", "cat_id"]).store_id.count().reset_index()
score_ca1 = pd.merge(
    left=score_ca1,
    right=item_id_cat_unq[["item_id", "cat_id"]],
    left_on=["item_id"],
    right_on=["item_id"],
    how="left",
)

score_ca1.info()
##########

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9147 entries, 0 to 9146
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   store_id        9147 non-null   category
 1   item_id         9147 non-null   category
 2   mse             9147 non-null   float16 
 3   mae             9147 non-null   float16 
 4   sse             9147 non-null   float32 
 5   wmape           9147 non-null   float64 
 6   rank_pct_wmape  9147 non-null   float64 
 7   cat_id          9147 non-null   category
dtypes: category(3), float16(2), float32(1), float64(2)
memory usage: 410.2 KB


In [8]:
# plotting_sales_forecast(data=data_ca1, score=score_ca1, file_name="plotting_store_ca1.pdf")
# plotting_sales_forecast(data=data_ca1, score=score_ca1, file_name="plotting_hobbies_ca1.pdf", product_cat="HOBBIES")
# plotting_sales_forecast(data=data_ca1, score=score_ca1, file_name="plotting_foods_ca1.pdf", product_cat="FOODS")
# plotting_sales_forecast(data=data_ca1, score=score_ca1, file_name="plotting_household_ca1.pdf",\
#                         product_cat="HOUSEHOLD")

# Analysis residual from ARIMA model in the group last 20 percentile

In [8]:
resid_score_worst_20pct = score_ca1[score_ca1.rank_pct_wmape > 0.8]
item_resid_worst_20pct = list(resid_score_worst_20pct.item_id.unique())

data_worst_20pct = data_ca1[data_ca1.item_id.isin(item_resid_worst_20pct)]
data_worst_20pct.to_csv(os.path.join(root, "data_arima_worst_30pct.csv"))

data_worst_20pct = import_downcasting(os.path.join(root, "data_arima_worst_30pct.csv"))

<span style="color:blue">Explore event_name_1 and event_type_1</span>

Finding from rank in event_name_1 that "christmas" event tend to have the most impact on residual

In [9]:
event_name_1 = (
    data_worst_20pct.groupby(["event_name_1"]).root_square_resid.mean().reset_index()
)
event_name_1.rename(columns={"root_square_resid": "mean_rss"}, inplace=True)
event_name_1.sort_values(by="mean_rss", ascending=False, inplace=True)


event_type_1 = (
    data_worst_20pct.groupby(["event_type_1"]).root_square_resid.mean().reset_index()
)
event_type_1.rename(columns={"root_square_resid": "mean_rss"}, inplace=True)
event_type_1.sort_values(by="mean_rss", ascending=False, inplace=True)

In [10]:
event_name_1.head(10)

Unnamed: 0,event_name_1,mean_rss
1,Christmas,80.474136
10,LaborDay,35.132126
27,Thanksgiving,26.928791
20,OrthodoxEaster,25.622793
4,Easter,24.203529
3,ColumbusDay,22.735889
5,Eid al-Fitr,22.226715
24,Ramadan starts,21.743202
30,none,21.446226
15,Mother's day,21.347801


In [11]:
<span style="color:blue">Explore event_name_1 and event_type_1</span>

SyntaxError: invalid syntax (483449040.py, line 1)