# Import necessary packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from random import sample

import re

from sklearn.metrics import mean_absolute_error, mean_squared_error

from timeit import default_timer as timer

# Import original data and preprocessed data

In [2]:
INPUT_DIR = r'C:\Users\ASUS\Desktop\Academic Work\Academic Research\DEMAND PATTERN\M5 Forecasting\Estimate the unit sales of Walmart retail goods\m5-forecasting-accuracy_unzip files'
sales_train_val = pd.read_csv(f'{INPUT_DIR}/sales_train_validation.csv')
calendar = pd.read_csv(f'{INPUT_DIR}/calendar.csv')

### Note: This Step is for the extraction of California State and Household category

In [3]:
sales_train_val = sales_train_val[(sales_train_val['state_id'] == 'CA') & (sales_train_val['cat_id'] == 'HOUSEHOLD')]

In [4]:
INPUT_DIR_2 = r'C:\Users\ASUS\Desktop\Academic Work\Academic Research\DEMAND PATTERN\CRUNCH TIME (WEEKLY AGGREGATION)\California\Household\Demand Pattern Categorization'
list_intermittent = pd.read_csv(f'{INPUT_DIR_2}/Intermittent_ID.csv')
list_lumpy = pd.read_csv(f'{INPUT_DIR_2}/Lumpy_ID.csv')
list_erratic = pd.read_csv(f'{INPUT_DIR_2}/Erratic_ID.csv')
list_smooth = pd.read_csv(f'{INPUT_DIR_2}/Smooth_ID.csv')

list_intermittent = list_intermittent['0'].values.tolist()
list_lumpy = list_lumpy['0'].values.tolist()
list_erratic = list_erratic['0'].values.tolist()
list_smooth = list_smooth['0'].values.tolist()

sales_intermittent = sales_train_val[sales_train_val.id.isin(list_intermittent)]
sales_lumpy = sales_train_val[sales_train_val.id.isin(list_lumpy)]
sales_erratic = sales_train_val[sales_train_val.id.isin(list_erratic)]
sales_smooth = sales_train_val[sales_train_val.id.isin(list_smooth)]

**Check by looking at column "id" to verify if we choose the right product (FOODS or HOBBIES or HOUSEHOLD)**

In [5]:
sales_erratic

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
565,HOUSEHOLD_1_001_CA_1_validation,HOUSEHOLD_1_001,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,3,0,0,0,...,1,0,1,1,1,0,0,0,0,1
593,HOUSEHOLD_1_029_CA_1_validation,HOUSEHOLD_1_029,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,0,0,0,0,...,0,0,4,2,0,2,0,0,1,0
627,HOUSEHOLD_1_065_CA_1_validation,HOUSEHOLD_1_065,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,0,0,0,0,...,0,0,0,3,2,0,0,0,1,1
673,HOUSEHOLD_1_113_CA_1_validation,HOUSEHOLD_1_113,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,0,0,0,0,...,1,2,3,7,1,1,2,2,2,2
692,HOUSEHOLD_1_132_CA_1_validation,HOUSEHOLD_1_132,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,0,0,0,0,...,0,0,0,0,2,0,3,2,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10208,HOUSEHOLD_1_506_CA_4_validation,HOUSEHOLD_1_506,HOUSEHOLD_1,HOUSEHOLD,CA_4,CA,1,0,0,0,...,3,0,0,2,1,1,0,1,0,2
10481,HOUSEHOLD_2_239_CA_4_validation,HOUSEHOLD_2_239,HOUSEHOLD_2,HOUSEHOLD,CA_4,CA,0,1,1,1,...,2,1,0,3,0,0,1,2,2,1
10543,HOUSEHOLD_2_301_CA_4_validation,HOUSEHOLD_2_301,HOUSEHOLD_2,HOUSEHOLD,CA_4,CA,1,2,1,1,...,0,0,0,3,0,0,0,0,0,3
10689,HOUSEHOLD_2_447_CA_4_validation,HOUSEHOLD_2_447,HOUSEHOLD_2,HOUSEHOLD,CA_4,CA,0,0,0,11,...,0,6,0,0,0,0,4,0,1,1


# User-defined functions to calculate Metrics and Algorithm

In [6]:
ROUNDING_DECIMAL = 4

def mase_calculation(ts, prediction):
    divisor = 0
    for i in range(1, ts.shape[0]):
        divisor = divisor + abs(ts.iloc[i] - ts.iloc[i-1])
    divisor = divisor/(ts.shape[0] - 1)
    diff    = abs(ts - prediction[:ts.shape[0]])/divisor
    mase    = diff.mean()
    return mase

def mape_calculation(actual, pred): 
    if not all([isinstance(actual, np.ndarray), isinstance(pred, np.ndarray)]):
        actual, pred = np.array(actual), np.array(pred)
    mask = (actual != 0)
    return round((np.fabs(actual - pred)/actual)[mask].mean()*100, ROUNDING_DECIMAL)

def wmape_calculation(actual, pred):
    if not all([isinstance(actual, np.ndarray), isinstance(pred, np.ndarray)]):
        actual, pred = np.array(actual), np.array(pred)
    return round((np.sum(np.absolute(actual-pred))/np.sum(actual))*100, ROUNDING_DECIMAL)

def smape_calculation(actual, predicted):
    if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]):
        actual, predicted = np.array(actual), np.array(predicted)
    return round(np.mean(np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual))/2))*100, ROUNDING_DECIMAL)

In [7]:
def Croston_TSB(ts,extra_periods=1,alpha=0.4,beta=0.4):
    d = np.array(ts) # Transform the input into a numpy array
    cols = len(d) # Historical period length
    d = np.append(d,[np.nan]*extra_periods) # Append np.nan into the demand array to cover future periods
    
    #level (a), probability(p) and forecast (f)
    a,p,f = np.full((3,cols+extra_periods),np.nan)
# Initialization
    first_occurence = np.argmax(d[:cols]>0)
    a[0] = d[first_occurence]
    p[0] = 1/(1 + first_occurence)
    f[0] = p[0]*a[0]
                 
    # Create all the t+1 forecasts
    for t in range(0,cols): 
        if d[t] > 0:
            a[t+1] = alpha*d[t] + (1-alpha)*a[t] 
            p[t+1] = beta*(1) + (1-beta)*p[t]  
        else:
            a[t+1] = a[t]
            p[t+1] = (1-beta)*p[t]       
        f[t+1] = p[t+1]*a[t+1]
        
    # Future Forecast
    a[cols+1:cols+extra_periods] = a[cols]
    p[cols+1:cols+extra_periods] = p[cols]
    f[cols+1:cols+extra_periods] = f[cols]
                      
    df = pd.DataFrame.from_dict({"Demand":d,"Forecast":f,"Period":p,"Level":a,"Error":d-f})
    return df

# Format sales data and then merge with calendar data

In [8]:
sales_pattern = sales_erratic.copy()

In [9]:
sales_pattern_py = sales_pattern.copy()

list_pattern_py = sales_pattern_py.id.unique().tolist()
list_pattern_py = [(re.sub(r'_validation', r"", x)) for x in list_pattern_py]

sales_pattern_py = sales_pattern_py.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], axis=1)
df_pattern = sales_pattern_py.melt(['id'], var_name='Date').pivot(index = ['Date'], columns = 'id', values = 'value').reset_index()
df_pattern.drop('Date', axis=1, inplace=True)
df_pattern.index = pd.date_range('2011-01-29', periods=df_pattern.shape[0], freq="D")
df_pattern.index.names = ['Date']
df_pattern = df_pattern.astype('float64')

df_pattern.iloc[:, 1:].columns = df_pattern.iloc[:, 1:].columns.str.rstrip('_validation')
df_pattern.columns = df_pattern.columns.str.replace(r'_validation', '')
df_pattern.reset_index(inplace=True)

calendar['Date'] = pd.to_datetime(calendar['date'])

dataframe = pd.merge(df_pattern,
                     calendar[['wm_yr_wk', 'Date']],
                     on = 'Date')

not_enough_seven_days = dataframe.groupby('wm_yr_wk').agg({'Date': 'count'})[dataframe.groupby('wm_yr_wk').agg({'Date': 'count'})['Date'] < 7].index.tolist()
dataframe = dataframe[~dataframe['wm_yr_wk'].isin(not_enough_seven_days)]

dataframe.set_index('wm_yr_wk', inplace=True)

dataframe.drop('Date', axis=1, inplace=True)

dataframe_weekly = dataframe.groupby('wm_yr_wk').sum().reset_index()

In [10]:
start_train_week = 11318
end_train_week = 11610
n_pred_weeks = 2
num_train_needed = 50
validation_training_ratio = 0.95
num_id_sample_for_tuning = 200

In [11]:
list_params_alpha = [round(item, 2) for item in list(np.arange(0.05, 0.95, 0.05))]
list_params_beta = [round(item, 2) for item in list(np.arange(0.05, 0.95, 0.05))]

In [20]:
len(list_pattern_py)

99

In [21]:
sample_list = sample(list_pattern_py, num_id_sample_for_tuning)

dataframe_check = dataframe_weekly[sample_list].iloc[-10:-2, :].T
dataframe_check['Check'] = np.where((dataframe_check==0).all(axis=1), 1, 0)

sample_removed = dataframe_check[dataframe_check['Check'] == 1].index.tolist()
sample_list_filtered = [x for x in sample_list if (x not in sample_removed)]

In [None]:
len(sample_list_filtered)

# Hyperparameter tuning for Croston TSB

In [12]:
train_data = dataframe_weekly[(dataframe_weekly['wm_yr_wk'] >= start_train_week) & (dataframe_weekly['wm_yr_wk'] <= end_train_week)]
test_data = dataframe_weekly.iloc[n_pred_weeks*-1:]

train_data_first_part = train_data.iloc[:int(len(train_data)*validation_training_ratio), :]
train_data_second_part = train_data.iloc[int(len(train_data)*validation_training_ratio):, :]

In [96]:
start = timer()

df_result_all_params = pd.DataFrame()

for product in sample_list_filtered:
    
    print('Currently Running: %s' % product)
    print('Progressing: {0} %'.format(round(sample_list_filtered.index(product) / len(sample_list_filtered) * 100, 2)))

    train_first = train_data_first_part[product]
    train_second = train_data_second_part[product]
    
    for i in list_params_alpha:
        for j in list_params_beta:
            predictions = list()
            history = [x for x in train_first[num_train_needed*-1:]]
            for t in train_second.index.tolist():
                yhat = Croston_TSB(history, extra_periods=1, alpha = i, beta = j)['Forecast'].iloc[-1]
                predictions.append(yhat)
                history.append(train_second[t])
            df_result_temp = pd.DataFrame({'Product': [product for count in range(len(train_second))],
                                           'Actual Data': train_second,
                                           'Forecast': predictions,
                                           'Alpha': [i for count_i in range(len(train_second))],
                                           'Beta': [j for count_j in range(len(train_second))]})
            df_result_all_params = df_result_all_params.append(df_result_temp, ignore_index=True)    
            
end = timer()
print('This line of code took {} minutes'.format((end-start) / 60))

Currently Running: FOODS_3_611_CA_3
Progressing: 0.0 %
Currently Running: FOODS_3_079_CA_1
Progressing: 0.6 %
Currently Running: FOODS_3_386_CA_3
Progressing: 1.2 %
Currently Running: FOODS_3_801_CA_1
Progressing: 1.8 %
Currently Running: FOODS_3_702_CA_3
Progressing: 2.4 %
Currently Running: FOODS_3_654_CA_3
Progressing: 2.99 %
Currently Running: FOODS_1_136_CA_3
Progressing: 3.59 %
Currently Running: FOODS_3_463_CA_1
Progressing: 4.19 %
Currently Running: FOODS_2_054_CA_3
Progressing: 4.79 %
Currently Running: FOODS_3_785_CA_1
Progressing: 5.39 %
Currently Running: FOODS_3_094_CA_4
Progressing: 5.99 %
Currently Running: FOODS_3_464_CA_3
Progressing: 6.59 %
Currently Running: FOODS_1_128_CA_1
Progressing: 7.19 %
Currently Running: FOODS_2_128_CA_3
Progressing: 7.78 %
Currently Running: FOODS_3_313_CA_4
Progressing: 8.38 %
Currently Running: FOODS_3_573_CA_3
Progressing: 8.98 %
Currently Running: FOODS_2_232_CA_1
Progressing: 9.58 %
Currently Running: FOODS_2_062_CA_3
Progressing: 10.1

Currently Running: FOODS_3_767_CA_2
Progressing: 86.83 %
Currently Running: FOODS_3_293_CA_3
Progressing: 87.43 %
Currently Running: FOODS_3_263_CA_2
Progressing: 88.02 %
Currently Running: FOODS_1_113_CA_1
Progressing: 88.62 %
Currently Running: FOODS_3_086_CA_3
Progressing: 89.22 %
Currently Running: FOODS_3_237_CA_4
Progressing: 89.82 %
Currently Running: FOODS_3_770_CA_1
Progressing: 90.42 %
Currently Running: FOODS_1_206_CA_4
Progressing: 91.02 %
Currently Running: FOODS_3_104_CA_3
Progressing: 91.62 %
Currently Running: FOODS_3_271_CA_3
Progressing: 92.22 %
Currently Running: FOODS_3_454_CA_2
Progressing: 92.81 %
Currently Running: FOODS_3_660_CA_1
Progressing: 93.41 %
Currently Running: FOODS_3_437_CA_2
Progressing: 94.01 %
Currently Running: FOODS_1_203_CA_2
Progressing: 94.61 %
Currently Running: FOODS_3_579_CA_4
Progressing: 95.21 %
Currently Running: FOODS_3_162_CA_3
Progressing: 95.81 %
Currently Running: FOODS_1_206_CA_3
Progressing: 96.41 %
Currently Running: FOODS_1_003_

In [97]:
df_result_all_params

Unnamed: 0,Product,Actual Data,Forecast,Alpha,Beta
0,FOODS_3_611_CA_3,2.0,6.343665,0.05,0.05
1,FOODS_3_611_CA_3,1.0,6.154780,0.05,0.05
2,FOODS_3_611_CA_3,6.0,5.927781,0.05,0.05
3,FOODS_3_611_CA_3,0.0,5.934435,0.05,0.05
4,FOODS_3_611_CA_3,0.0,5.637713,0.05,0.05
...,...,...,...,...,...
432859,FOODS_1_025_CA_4,5.0,5.097659,0.90,0.90
432860,FOODS_1_025_CA_4,6.0,5.009766,0.90,0.90
432861,FOODS_1_025_CA_4,6.0,5.900977,0.90,0.90
432862,FOODS_1_025_CA_4,2.0,5.990098,0.90,0.90


# Summay all metrics

In [13]:
def get_metrics_result_all_params(data):
    metrics_df = {}
    metrics_df['MASE'] = mase_calculation(data['Actual Data'], data['Forecast'])
    metrics_df['WMAPE'] = wmape_calculation(data['Actual Data'], data['Forecast'])
    metrics_df['SMAPE'] = smape_calculation(data['Actual Data'], data['Forecast'])
    metrics_df['MAPE'] = mape_calculation(data['Actual Data'], data['Forecast'])
    metrics_df['MAE'] = mean_absolute_error(data['Actual Data'], data['Forecast'])
    metrics_df['RMSE'] = np.sqrt(mean_squared_error(data['Actual Data'], data['Forecast']))
    return pd.Series(metrics_df)

In [102]:
start = timer()

df_result_metrics_all_params = df_result_all_params.groupby(['Product', 'Alpha', 'Beta']).apply(get_metrics_result_all_params).reset_index()

end = timer()
print('This line of code took {} minutes'.format((end-start) / 60))

This line of code took 1.9689070383333274 minutes


In [103]:
df_result_metrics_all_params

Unnamed: 0,Product,Alpha,Beta,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE
0,FOODS_1_003_CA_4,0.05,0.05,0.560427,62.9445,65.9442,72.3028,4.563476,7.007761
1,FOODS_1_003_CA_4,0.05,0.10,0.562848,63.2165,66.3723,72.4344,4.583193,7.017876
2,FOODS_1_003_CA_4,0.05,0.15,0.564066,63.3532,66.5815,72.8710,4.593106,7.015064
3,FOODS_1_003_CA_4,0.05,0.20,0.564209,63.3693,66.5976,73.3494,4.594275,7.005703
4,FOODS_1_003_CA_4,0.05,0.25,0.563556,63.2960,66.4719,73.7393,4.588957,6.994132
...,...,...,...,...,...,...,...,...,...
54103,FOODS_3_822_CA_3,0.90,0.70,0.930215,18.6122,18.5345,18.5827,7.840387,8.682730
54104,FOODS_3_822_CA_3,0.90,0.75,0.930215,18.6122,18.5345,18.5827,7.840387,8.682730
54105,FOODS_3_822_CA_3,0.90,0.80,0.930215,18.6122,18.5345,18.5827,7.840387,8.682730
54106,FOODS_3_822_CA_3,0.90,0.85,0.930215,18.6122,18.5345,18.5827,7.840387,8.682730


# Check what products has unexpected metrics result

In [104]:
df_result_metrics_all_params[df_result_metrics_all_params['MASE'].isin([np.inf, -np.inf, np.nan])]

Unnamed: 0,Product,Alpha,Beta,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE


In [105]:
df_result_metrics_all_params[df_result_metrics_all_params['WMAPE'].isin([np.inf, -np.inf, np.nan])]

Unnamed: 0,Product,Alpha,Beta,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE


In [106]:
df_result_metrics_all_params[df_result_metrics_all_params['SMAPE'].isin([np.inf, -np.inf, np.nan])]

Unnamed: 0,Product,Alpha,Beta,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE
34992,FOODS_3_437_CA_2,0.05,0.05,2.253534,94.0250,,94.8563,7.404468,8.510887
34993,FOODS_3_437_CA_2,0.05,0.10,2.135506,89.1005,,90.5489,7.016663,8.043784
34994,FOODS_3_437_CA_2,0.05,0.15,2.036404,84.9656,,86.8588,6.691042,7.659924
34995,FOODS_3_437_CA_2,0.05,0.20,1.953992,81.5271,,83.7256,6.420259,7.346998
34996,FOODS_3_437_CA_2,0.05,0.25,1.885650,78.6757,,81.0689,6.195708,7.092504
...,...,...,...,...,...,...,...,...,...
40495,FOODS_3_536_CA_2,0.90,0.70,0.957375,63.8250,,64.2111,0.957375,1.694428
40496,FOODS_3_536_CA_2,0.90,0.75,0.949219,63.2812,,64.3750,0.949219,1.678542
40497,FOODS_3_536_CA_2,0.90,0.80,0.938000,62.5333,,64.2667,0.938000,1.663716
40498,FOODS_3_536_CA_2,0.90,0.85,0.923719,61.5813,,63.8861,0.923719,1.649364


In [107]:
df_result_metrics_all_params[df_result_metrics_all_params['MAPE'].isin([np.inf, -np.inf, np.nan])]

Unnamed: 0,Product,Alpha,Beta,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE


In [108]:
df_result_metrics_all_params[df_result_metrics_all_params['RMSE'].isin([np.inf, -np.inf, np.nan])]

Unnamed: 0,Product,Alpha,Beta,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE


In [109]:
df_result_metrics_all_params[df_result_metrics_all_params['MAE'].isin([np.inf, -np.inf, np.nan])]

Unnamed: 0,Product,Alpha,Beta,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE


# Metrics Statistics

In [110]:
list_metrics = ['MASE', 'WMAPE', 'SMAPE', 'MAPE', 'MAE', 'RMSE']

In [111]:
for mts in list_metrics:
    print('Percentage of unexpected values of', mts, 'is: {}'.format(df_result_metrics_all_params[df_result_metrics_all_params[mts].isin([np.nan, np.inf, -np.inf])].Product.nunique() / df_result_metrics_all_params.Product.nunique() * 100), "%")

Percentage of unexpected values of MASE is: 0.0 %
Percentage of unexpected values of WMAPE is: 0.0 %
Percentage of unexpected values of SMAPE is: 1.7964071856287425 %
Percentage of unexpected values of MAPE is: 0.0 %
Percentage of unexpected values of MAE is: 0.0 %
Percentage of unexpected values of RMSE is: 0.0 %


### Filter all rows that have unexpected metrics values

In [112]:
df_result_metrics_all_params = df_result_metrics_all_params[~df_result_metrics_all_params.isin([np.nan, np.inf, -np.inf]).any(1)]

# Get MEAN metrics value of each Alpha and Beta

In [113]:
df_result_metrics_all_params.groupby(['Alpha', 'Beta'])['MASE', 'WMAPE', 'SMAPE', 'MAPE', 'MAE', 'RMSE'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE
Alpha,Beta,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.05,0.05,1.365862,96.041423,69.275729,115.234318,9.992048,11.916284
0.05,0.10,1.375113,96.140871,68.361171,121.771712,10.072386,11.898240
0.05,0.15,1.363192,94.652420,67.851638,124.435642,10.121441,11.904544
0.05,0.20,1.350067,93.153490,67.534338,125.610676,10.157972,11.925828
0.05,0.25,1.339353,91.934638,67.356555,126.176993,10.189558,11.957434
...,...,...,...,...,...,...,...
0.90,0.70,0.978410,62.292268,66.854880,76.694509,8.579088,11.075978
0.90,0.75,0.981174,62.624054,67.307037,77.170570,8.610422,11.119230
0.90,0.80,0.984559,62.999080,67.844584,77.728450,8.646776,11.166160
0.90,0.85,0.988255,63.408966,68.525621,78.405121,8.686604,11.216888


In [119]:
for mts in list_metrics:
    print('The optimum Alpha and Beta based on Mean', mts, 'is: {}'.format(df_result_metrics_all_params.groupby(['Alpha', 'Beta'])['MASE', 'WMAPE', 'SMAPE', 'MAPE', 'MAE', 'RMSE'].mean()[mts].idxmin()))

The optimum Alpha and Beta based on Mean MASE is: (0.65, 0.55)
The optimum Alpha and Beta based on Mean WMAPE is: (0.8, 0.05)
The optimum Alpha and Beta based on Mean SMAPE is: (0.4, 0.25)
The optimum Alpha and Beta based on Mean MAPE is: (0.85, 0.05)
The optimum Alpha and Beta based on Mean MAE is: (0.6, 0.1)
The optimum Alpha and Beta based on Mean RMSE is: (0.5, 0.2)


# Get MEDIAN metrics value of each Alpha and Beta

In [116]:
df_result_metrics_all_params.groupby(['Alpha', 'Beta'])['MASE', 'WMAPE', 'SMAPE', 'MAPE', 'MAE', 'RMSE'].median()

Unnamed: 0_level_0,Unnamed: 1_level_0,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE
Alpha,Beta,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.05,0.05,0.992446,55.14105,58.87090,71.17450,6.110863,7.553461
0.05,0.10,0.995244,55.44830,58.93780,69.54540,6.056406,7.229805
0.05,0.15,0.999935,55.51825,59.09255,68.28725,6.094567,7.270941
0.05,0.20,1.004608,55.47370,58.86340,67.62945,6.097078,7.374833
0.05,0.25,0.999935,55.48865,58.86595,67.48170,6.053492,7.523149
...,...,...,...,...,...,...,...
0.90,0.70,0.945261,54.07470,58.87100,56.94805,5.326244,6.551148
0.90,0.75,0.946647,54.22745,59.38335,56.95270,5.397904,6.585305
0.90,0.80,0.953301,54.22745,60.51820,57.55680,5.469507,6.606090
0.90,0.85,0.960060,54.65235,60.51820,58.72170,5.525182,6.683523


In [118]:
for mts in list_metrics:
    print('The optimum Alpha and Beta based on Median of', mts, 'is: {}'.format(df_result_metrics_all_params.groupby(['Alpha', 'Beta'])['MASE', 'WMAPE', 'SMAPE', 'MAPE', 'MAE', 'RMSE'].median()[mts].idxmin()))

The optimum Alpha and Beta based on Median of MASE is: (0.3, 0.1)
The optimum Alpha and Beta based on Median of WMAPE is: (0.45, 0.5)
The optimum Alpha and Beta based on Median of SMAPE is: (0.5, 0.35)
The optimum Alpha and Beta based on Median of MAPE is: (0.85, 0.05)
The optimum Alpha and Beta based on Median of MAE is: (0.25, 0.5)
The optimum Alpha and Beta based on Median of RMSE is: (0.5, 0.35)


# Run Croston TSB model after decide best paramaters

In [14]:
best_alpha = 0.85
best_beta = 0.05

In [15]:
len(list_pattern_py)

126

In [16]:
start = timer()

df_result_best_params = pd.DataFrame()
for product in list_pattern_py:
    train_product = train_data[product]
    test_product = test_data[product]
    predictions = list()
    history = [x for x in train_product[num_train_needed*-1:]]
    for t in test_product.index.tolist():
        yhat = Croston_TSB(history, extra_periods=1, alpha=best_alpha, beta = best_beta)['Forecast'].iloc[-1]
        predictions.append(yhat)
        history.append(test_product[t])
    df_result_temp = pd.DataFrame({'Product': [product for count in range(len(test_product))],
                                    'Actual Data': test_product,
                                    'Forecast': predictions})
    df_result_best_params = df_result_best_params.append(df_result_temp, ignore_index=True)
    
end = timer()
print('This line of code took {} minutes'.format((end-start) / 60))

This line of code took 0.0034513649999999767 minutes


In [17]:
df_result_best_params

Unnamed: 0,Product,Actual Data,Forecast
0,HOUSEHOLD_1_001_CA_1,2.0,0.798027
1,HOUSEHOLD_1_001_CA_1,1.0,1.469929
2,HOUSEHOLD_1_029_CA_1,5.0,7.938064
3,HOUSEHOLD_1_029_CA_1,9.0,4.801012
4,HOUSEHOLD_1_065_CA_1,24.0,27.110473
...,...,...,...
247,HOUSEHOLD_2_301_CA_4,1.0,3.031549
248,HOUSEHOLD_2_447_CA_4,5.0,0.895726
249,HOUSEHOLD_2_447_CA_4,2.0,3.342817
250,HOUSEHOLD_2_512_CA_4,1.0,1.552138


In [18]:
df_result_final = df_result_best_params.groupby('Product').apply(get_metrics_result_all_params).reset_index()

In [19]:
df_result_final

Unnamed: 0,Product,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE
0,HOUSEHOLD_1_001_CA_1,0.835951,55.7301,61.9839,53.5458,0.835951,0.912571
1,HOUSEHOLD_1_002_CA_2,inf,51.0628,36.6492,51.0628,2.042513,2.580460
2,HOUSEHOLD_1_004_CA_2,inf,inf,200.0000,,9.936303,9.939569
3,HOUSEHOLD_1_008_CA_2,inf,37.4445,28.8938,37.4445,1.123335,1.430552
4,HOUSEHOLD_1_028_CA_2,1.068005,35.6002,37.3842,35.0832,2.136010,2.202467
...,...,...,...,...,...,...,...
121,HOUSEHOLD_2_498_CA_3,,,,,0.000000,0.000000
122,HOUSEHOLD_2_499_CA_2,,,,,0.000000,0.000000
123,HOUSEHOLD_2_511_CA_3,1.575730,121.2100,85.0806,172.2065,7.878652,8.982051
124,HOUSEHOLD_2_512_CA_4,0.566252,75.5002,87.6887,67.3856,2.265006,2.839748


In [20]:
df_result_final.isnull().sum()

Product     0
MASE       22
WMAPE      22
SMAPE      22
MAPE       28
MAE         0
RMSE        0
dtype: int64

In [21]:
df_result_final.MAPE.mean()

85.54015510204081

In [22]:
df_result_final.to_csv('Croston_TSB_Erratic_Metrics.csv')