# Import necessary packages

In [45]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from random import sample

import re

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import RandomizedSearchCV 

from sklearn.metrics import mean_absolute_error, mean_squared_error

from timeit import default_timer as timer

In [46]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import original data and preprocessed data

In [47]:
sales_train_val = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_train_validation.csv')
calendar = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/calendar.csv')

In [48]:
sales_train_val = sales_train_val[(sales_train_val['state_id'] == 'CA') & (sales_train_val['cat_id'] == 'HOUSEHOLD')]

In [49]:
INPUT_DIR_2 = '/content/drive/MyDrive/Colab Notebooks/List of Product ID according to 4 demand patterns/California/Household/2-year'
list_intermittent = pd.read_csv(f'{INPUT_DIR_2}/Intermittent_ID_2_Year_Data.csv')
list_lumpy = pd.read_csv(f'{INPUT_DIR_2}/Lumpy_ID_2_Year_Data.csv')
list_erratic = pd.read_csv(f'{INPUT_DIR_2}/Erratic_ID_2_Year_Data.csv')
list_smooth = pd.read_csv(f'{INPUT_DIR_2}/Smooth_ID_2_Year_Data.csv')

list_intermittent = list_intermittent['0'].values.tolist()
list_lumpy = list_lumpy['0'].values.tolist()
list_erratic = list_erratic['0'].values.tolist()
list_smooth = list_smooth['0'].values.tolist()

sales_intermittent = sales_train_val[sales_train_val.id.isin(list_intermittent)]
sales_lumpy = sales_train_val[sales_train_val.id.isin(list_lumpy)]
sales_erratic = sales_train_val[sales_train_val.id.isin(list_erratic)]
sales_smooth = sales_train_val[sales_train_val.id.isin(list_smooth)]

In [50]:
sales_lumpy

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
568,HOUSEHOLD_1_004_CA_1_validation,HOUSEHOLD_1_004,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,4,0,8,11,...,0,5,0,0,0,0,0,0,1,0
596,HOUSEHOLD_1_033_CA_1_validation,HOUSEHOLD_1_033,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,3,2,3,3,...,0,0,2,2,1,2,1,2,4,5
608,HOUSEHOLD_1_046_CA_1_validation,HOUSEHOLD_1_046,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,2,1,1,1,...,5,3,2,1,2,7,0,0,0,0
627,HOUSEHOLD_1_065_CA_1_validation,HOUSEHOLD_1_065,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,0,0,0,0,...,0,0,0,3,2,0,0,0,1,1
631,HOUSEHOLD_1_069_CA_1_validation,HOUSEHOLD_1_069,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,2,0,4,0,...,2,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10637,HOUSEHOLD_2_395_CA_4_validation,HOUSEHOLD_2_395,HOUSEHOLD_2,HOUSEHOLD,CA_4,CA,0,0,0,0,...,1,9,0,0,6,0,0,0,2,0
10660,HOUSEHOLD_2_418_CA_4_validation,HOUSEHOLD_2_418,HOUSEHOLD_2,HOUSEHOLD,CA_4,CA,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
10689,HOUSEHOLD_2_447_CA_4_validation,HOUSEHOLD_2_447,HOUSEHOLD_2,HOUSEHOLD,CA_4,CA,0,0,0,11,...,0,6,0,0,0,0,4,0,1,1
10727,HOUSEHOLD_2_485_CA_4_validation,HOUSEHOLD_2_485,HOUSEHOLD_2,HOUSEHOLD,CA_4,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# User-defined functions to calculate Metrics

In [51]:
ROUNDING_DECIMAL = 4

def mase_calculation(ts, prediction):
    divisor = 0
    for i in range(1, ts.shape[0]):
        divisor = divisor + abs(ts.iloc[i] - ts.iloc[i-1])
    divisor = divisor/(ts.shape[0] - 1)
    diff    = abs(ts - prediction[:ts.shape[0]])/divisor
    mase    = diff.mean()
    return mase

def mape_calculation(actual, pred): 
    if not all([isinstance(actual, np.ndarray), isinstance(pred, np.ndarray)]):
        actual, pred = np.array(actual), np.array(pred)
    mask = (actual != 0)
    return round((np.fabs(actual - pred)/actual)[mask].mean()*100, ROUNDING_DECIMAL)

def wmape_calculation(actual, pred):
    if not all([isinstance(actual, np.ndarray), isinstance(pred, np.ndarray)]):
        actual, pred = np.array(actual), np.array(pred)
    return round((np.sum(np.absolute(actual-pred))/np.sum(actual))*100, ROUNDING_DECIMAL)

def smape_calculation(actual, predicted):
    if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]):
        actual, predicted = np.array(actual), np.array(predicted)
    return round(np.mean(np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual))/2))*100, ROUNDING_DECIMAL)

# Format sales data and then merge with calendar data

In [52]:
def data_cleaning(sales_pattern):

    sales_pattern_py = sales_pattern.copy()
    list_pattern_py = sales_pattern_py.id.unique().tolist()

    sales_pattern_py = sales_pattern_py.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], axis=1)
    df_pattern = sales_pattern_py.melt(['id'], var_name='Date').pivot(index = ['Date'], columns = 'id', values = 'value').reset_index()
    df_pattern.drop('Date', axis=1, inplace=True)
    df_pattern.index = pd.date_range('2011-01-29', periods=df_pattern.shape[0], freq="D")
    df_pattern.index.names = ['Date']
    df_pattern = df_pattern.astype('float64')

    df_pattern.iloc[:, 1:].columns = df_pattern.iloc[:, 1:].columns.str.rstrip('_validation')
    df_pattern.columns = df_pattern.columns.str.replace(r'_validation', '')
    df_pattern.reset_index(inplace=True)

    calendar['Date'] = pd.to_datetime(calendar['date'])

    dataframe = pd.merge(df_pattern,
                         calendar[['wday', 'month', 'year', 'event_name_1', 'event_type_1', 'snap_CA', 'Date']],
                         on = 'Date')

    dataframe['weekend'] = np.where(((dataframe['wday'] == 1) | (dataframe['wday'] == 2)), 1, 0)
    dataframe.fillna('No event', inplace=True)
    dataframe['year'] = dataframe['year'].apply(str)

    dataframe['quarter'] = dataframe['Date'].dt.quarter
    dataframe['quarter'] = dataframe['quarter'].apply(str)

    dataframe = pd.get_dummies(dataframe)
    
    return dataframe

***Decide which pattern we are dealing with?:***

---
INTERMITTENT
---
LUMPY
---
ERRATIC
---
SMOOTH
---



In [53]:
data_pattern = data_cleaning(sales_smooth)

In [54]:
data_pattern

Unnamed: 0,Date,HOUSEHOLD_1_003_CA_3,HOUSEHOLD_1_004_CA_3,HOUSEHOLD_1_007_CA_1,HOUSEHOLD_1_007_CA_3,HOUSEHOLD_1_018_CA_3,HOUSEHOLD_1_019_CA_1,HOUSEHOLD_1_019_CA_2,HOUSEHOLD_1_019_CA_3,HOUSEHOLD_1_027_CA_1,...,event_name_1_VeteransDay,event_type_1_Cultural,event_type_1_National,event_type_1_No event,event_type_1_Religious,event_type_1_Sporting,quarter_1,quarter_2,quarter_3,quarter_4
0,2011-01-29,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0,0,0,1,0,0,1,0,0,0
1,2011-01-30,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0,0,0,1,0,0,1,0,0,0
2,2011-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0,0,0,1,0,0,1,0,0,0
3,2011-02-01,0.0,3.0,4.0,3.0,0.0,5.0,6.0,27.0,1.0,...,0,0,0,1,0,0,1,0,0,0
4,2011-02-02,0.0,4.0,3.0,1.0,0.0,2.0,11.0,30.0,2.0,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1908,2016-04-20,2.0,1.0,1.0,1.0,0.0,5.0,15.0,34.0,5.0,...,0,0,0,1,0,0,0,1,0,0
1909,2016-04-21,0.0,11.0,2.0,0.0,0.0,8.0,9.0,17.0,6.0,...,0,0,0,1,0,0,0,1,0,0
1910,2016-04-22,0.0,3.0,3.0,4.0,0.0,5.0,2.0,37.0,6.0,...,0,0,0,1,0,0,0,1,0,0
1911,2016-04-23,0.0,5.0,1.0,2.0,0.0,5.0,4.0,36.0,1.0,...,0,0,0,1,0,0,0,1,0,0


# Random Forest Hyperparameters to find optimum parameters

In [55]:
not_id_cols = [
 'Date',
 'wday',
 'month',
 'snap_CA',
 'weekend',
 'year_2011',
 'year_2012',
 'year_2013',
 'year_2014',
 'year_2015',
 'year_2016',
 'event_name_1_Chanukah End',
 'event_name_1_Christmas',
 'event_name_1_Cinco De Mayo',
 'event_name_1_ColumbusDay',
 'event_name_1_Easter',
 'event_name_1_Eid al-Fitr',
 'event_name_1_EidAlAdha',
 "event_name_1_Father's day",
 'event_name_1_Halloween',
 'event_name_1_IndependenceDay',
 'event_name_1_LaborDay',
 'event_name_1_LentStart',
 'event_name_1_LentWeek2',
 'event_name_1_MartinLutherKingDay',
 'event_name_1_MemorialDay',
 "event_name_1_Mother's day",
 'event_name_1_NBAFinalsEnd',
 'event_name_1_NBAFinalsStart',
 'event_name_1_NewYear',
 'event_name_1_No event',
 'event_name_1_OrthodoxChristmas',
 'event_name_1_OrthodoxEaster',
 'event_name_1_Pesach End',
 'event_name_1_PresidentsDay',
 'event_name_1_Purim End',
 'event_name_1_Ramadan starts',
 'event_name_1_StPatricksDay',
 'event_name_1_SuperBowl',
 'event_name_1_Thanksgiving',
 'event_name_1_ValentinesDay',
 'event_name_1_VeteransDay',
 'event_type_1_Cultural',
 'event_type_1_National',
 'event_type_1_No event',
 'event_type_1_Religious',
 'event_type_1_Sporting',
 'quarter_1',
 'quarter_2',
 'quarter_3',
 'quarter_4'
]

In [56]:
end_train_date = '2016-04-10'

In [57]:
lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
moving_averages = [2, 3, 4, 5, 6, 7, 14, 21, 28]  
stds = [2, 3, 4, 5, 6, 7, 14, 21, 28]  
maximums = [2, 3, 4, 5, 6, 7, 14, 21, 28]  
minimums = [2, 3, 4, 5, 6, 7, 14, 21, 28]  
totals = [2, 3, 4, 5, 6, 7, 14, 21, 28]
num_days_zeros = [2, 3, 4, 5, 6, 7, 14, 21, 28] 

In [58]:
correlation_threshold = 0.8

In [59]:
n_estimators = [50, 100, 200, 300]
max_depth = list(range(4,15)) + [None]
min_samples_leaf = range(2,12)
max_features = range(10,30)
bootstrap = [True] #We force bootstrap
max_samples = [0.6, 0.7, 0.8, 0.9, 1]

In [60]:
# User-defined funtion to find optimum parameters on training data
def Random_Forest_Pipeline(list_pattern, not_id_cols, lags, moving_averages, stds,
                          maximums, minimums, totals, num_days_zeros, correlation_threshold,
                          end_train_date, n_estimators, max_depth, min_samples_leaf,
                          max_features, bootstrap, max_samples):
    
    list_pattern_temp = list_pattern.copy()
    list_pattern_temp = [(re.sub(r'_validation', r"", x)) for x in list_pattern_temp]
    
    df_output_hyper_params = pd.DataFrame()
    
    for product in list_pattern_temp:
        print('Currently Running Product: %s' % product)
        print('Progressing: {0} %'.format(round(list_pattern_temp.index(product) / len(list_pattern_temp) * 100, 2)))
        dataframe_product = pd.concat([data_pattern[[product]], data_pattern[not_id_cols]], axis=1, ignore_index=False)

        # Create Lag variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (t-{lag})': dataframe_product[product].shift(lag)
            for lag in lags
        })

        # Create Rolling Moving Average variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (mva_{moving_average})': dataframe_product[product].shift(1).rolling(moving_average).mean()
            for moving_average in moving_averages
        })

        # Create Rolling Standard Deviation variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (std_{std})': dataframe_product[product].shift(1).rolling(std).std()
            for std in stds
        })

        # Create Rolling Maximum variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (max_{maximum})': dataframe_product[product].shift(1).rolling(maximum).max()
            for maximum in maximums
        })

        # Create Rolling Minimum variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (min_{minimum})': dataframe_product[product].shift(1).rolling(minimum).min()
            for minimum in minimums
        })

        # Create Rolling Total variables 
        dataframe_product = dataframe_product.assign(**{
            f'{product} (total_{total})': dataframe_product[product].shift(1).rolling(total).sum()
            for total in totals
        })

        # Create Zero Count variables
        count_func = lambda x: (x==0).sum()

        dataframe_product = dataframe_product.assign(**{
            f'{product} (zero_count_in_{num_days_zero})': dataframe_product[product].shift(1).rolling(num_days_zero).apply(count_func)
            for num_days_zero in num_days_zeros
        })
        
        dataframe_product.dropna(inplace=True)
        dataframe_product.set_index('Date', inplace=True)

        train_data = dataframe_product[dataframe_product.index <= end_train_date]

        corr_matrix = train_data.loc[:, train_data.columns != product].corr().abs()
        high_corr_var=np.where(corr_matrix > correlation_threshold)
        high_corr_var=[(corr_matrix.columns[x],corr_matrix.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
        
        train_data.drop([i[1] for i in high_corr_var], axis=1, inplace=True)
        
        X_train = train_data.drop(product, axis=1)
        y_train = train_data[product]

        param_dist = {'n_estimators': n_estimators,
                      'max_depth': max_depth,
                      'min_samples_leaf': min_samples_leaf,
                      'max_features': max_features,
                      'bootstrap': bootstrap,
                      'max_samples': max_samples}

        forest = RandomForestRegressor(n_jobs=1)

        forest_cv = RandomizedSearchCV(forest, param_dist, cv=5, n_jobs=-1, verbose=2, n_iter=300, scoring='neg_mean_absolute_error')
        forest_cv.fit(X_train,y_train)
            
        best_params_temp = forest_cv.best_params_
        
        df_output_temp = pd.DataFrame.from_dict(best_params_temp, orient='index').T
        df_output_temp['Product'] = product
        
        df_output_hyper_params = df_output_hyper_params.append(df_output_temp, ignore_index=True)
        
    return df_output_hyper_params

***Decide which pattern we are dealing with?:***

---
INTERMITTENT
---
LUMPY
---
ERRATIC
---
SMOOTH
---



In [61]:
list_pattern_sample = sample(list_smooth, 30)

In [62]:
list_pattern_sample

['HOUSEHOLD_1_071_CA_3_validation',
 'HOUSEHOLD_1_351_CA_3_validation',
 'HOUSEHOLD_1_180_CA_1_validation',
 'HOUSEHOLD_1_347_CA_1_validation',
 'HOUSEHOLD_1_307_CA_3_validation',
 'HOUSEHOLD_1_404_CA_3_validation',
 'HOUSEHOLD_1_107_CA_3_validation',
 'HOUSEHOLD_1_165_CA_3_validation',
 'HOUSEHOLD_1_040_CA_1_validation',
 'HOUSEHOLD_1_541_CA_2_validation',
 'HOUSEHOLD_1_179_CA_3_validation',
 'HOUSEHOLD_1_096_CA_2_validation',
 'HOUSEHOLD_1_066_CA_1_validation',
 'HOUSEHOLD_1_234_CA_3_validation',
 'HOUSEHOLD_2_239_CA_3_validation',
 'HOUSEHOLD_1_536_CA_1_validation',
 'HOUSEHOLD_1_243_CA_1_validation',
 'HOUSEHOLD_1_169_CA_2_validation',
 'HOUSEHOLD_1_537_CA_2_validation',
 'HOUSEHOLD_1_473_CA_3_validation',
 'HOUSEHOLD_1_399_CA_3_validation',
 'HOUSEHOLD_1_256_CA_3_validation',
 'HOUSEHOLD_1_410_CA_1_validation',
 'HOUSEHOLD_1_338_CA_2_validation',
 'HOUSEHOLD_1_217_CA_3_validation',
 'HOUSEHOLD_1_198_CA_1_validation',
 'HOUSEHOLD_1_334_CA_4_validation',
 'HOUSEHOLD_1_225_CA_2_valid

In [63]:
start = timer()

df_hyper_params = Random_Forest_Pipeline(list_pattern_sample, not_id_cols, lags, moving_averages, stds,
                                         maximums, minimums, totals, num_days_zeros, correlation_threshold,
                                         end_train_date, n_estimators, max_depth, min_samples_leaf,
                                         max_features, bootstrap, max_samples)

end = timer()

print('This line of code took {} minutes'.format((end-start) / 60))

Currently Running Product: HOUSEHOLD_1_071_CA_3
Progressing: 0.0 %
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Currently Running Product: HOUSEHOLD_1_351_CA_3
Progressing: 3.33 %
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Currently Running Product: HOUSEHOLD_1_180_CA_1
Progressing: 6.67 %
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Currently Running Product: HOUSEHOLD_1_347_CA_1
Progressing: 10.0 %
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Currently Running Product: HOUSEHOLD_1_307_CA_3
Progressing: 13.33 %
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Currently Running Product: HOUSEHOLD_1_404_CA_3
Progressing: 16.67 %
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Currently Running Product: HOUSEHOLD_1_107_CA_3
Progressing: 20.0 %
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Currently Running Product: HOUSEHOLD_1_165_CA_3
Progressing: 23.33 %
Fittin

In [64]:
df_hyper_params

Unnamed: 0,n_estimators,min_samples_leaf,max_samples,max_features,max_depth,bootstrap,Product
0,50,6,0.6,28,12.0,True,HOUSEHOLD_1_071_CA_3
1,50,8,0.7,11,7.0,True,HOUSEHOLD_1_351_CA_3
2,50,7,0.6,28,9.0,True,HOUSEHOLD_1_180_CA_1
3,200,11,0.6,12,9.0,True,HOUSEHOLD_1_347_CA_1
4,50,6,0.7,28,14.0,True,HOUSEHOLD_1_307_CA_3
5,100,3,0.6,18,9.0,True,HOUSEHOLD_1_404_CA_3
6,50,4,0.6,23,6.0,True,HOUSEHOLD_1_107_CA_3
7,100,11,0.6,19,7.0,True,HOUSEHOLD_1_165_CA_3
8,50,2,0.7,27,9.0,True,HOUSEHOLD_1_040_CA_1
9,50,7,0.8,20,13.0,True,HOUSEHOLD_1_541_CA_2


In [65]:
df_hyper_params.to_csv('Random_Forest_Hyperparams_Smooth.csv')

# Decide optimum set of parameters for Random Forest algorithm

In [66]:
# See if any values larger than 1 in column "Product"
# If yes, the corresponding set of parameters occur more than once and those sets should be considered first
df_hyper_params.groupby(['n_estimators', 'min_samples_leaf', 'max_samples', 
                         'max_features', 'max_depth', 'bootstrap'])[['Product']].count().sort_values(by = 'Product', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Product
n_estimators,min_samples_leaf,max_samples,max_features,max_depth,bootstrap,Unnamed: 6_level_1
50,2,0.7,27,9,True,1
50,9,0.8,25,11,True,1
300,2,0.6,20,5,True,1
200,11,0.6,12,9,True,1
200,9,0.6,25,13,True,1
100,11,0.6,28,7,True,1
100,11,0.6,19,7,True,1
100,8,0.6,27,12,True,1
100,6,0.8,29,14,True,1
100,4,0.8,27,5,True,1


In [67]:
# See what value of parameter "n_estimators" occurs the most frequently
df_hyper_params.groupby(['n_estimators'])[['Product']].count()

Unnamed: 0_level_0,Product
n_estimators,Unnamed: 1_level_1
50,18
100,8
200,2
300,2


In [68]:
# See what value of parameter "min_samples_leaf" occurs the most frequently
df_hyper_params.groupby(['min_samples_leaf'])[['Product']].count()

Unnamed: 0_level_0,Product
min_samples_leaf,Unnamed: 1_level_1
2,3
3,3
4,3
5,2
6,5
7,2
8,4
9,3
10,2
11,3


In [69]:
# See what value of parameter "max_samples" occurs the most frequently
df_hyper_params.groupby(['max_samples'])[['Product']].count()

Unnamed: 0_level_0,Product
max_samples,Unnamed: 1_level_1
0.6,14
0.7,6
0.8,7
0.9,2
1.0,1


In [70]:
# See what value of parameter "max_features" occurs the most frequently
df_hyper_params.groupby(['max_features'])[['Product']].count()

Unnamed: 0_level_0,Product
max_features,Unnamed: 1_level_1
10,1
11,1
12,2
13,1
14,2
15,1
18,1
19,2
20,3
21,2


In [71]:
# See what value of parameter "max_depth" occurs the most frequently
df_hyper_params.groupby(['max_depth'])[['Product']].count()

Unnamed: 0_level_0,Product
max_depth,Unnamed: 1_level_1
4,2
5,4
6,3
7,3
9,4
10,1
11,2
12,4
13,2
14,4


# Implement Random Forest on Test Set with optimum set of parameters

In [72]:
n_estimators_chosen = 50
min_samples_leaf_chosen = 5
max_samples_choose = 0.6
max_features_choose = 21
max_depth_choose = 6
bootstrap = [True]

In [73]:
# User-defined function to run Random Forest on Test Data

def Random_Forest_Test_Data(list_pattern, not_id_cols, lags, moving_averages, stds,
                          maximums, minimums, totals, num_days_zeros, correlation_threshold,
                          end_train_date, n_estimators_chosen, min_samples_leaf_chosen,
                          max_samples_choose, max_features_choose, max_depth_choose, bootstrap):
    
    list_pattern_temp = list_pattern.copy()
    list_pattern_temp = [(re.sub(r'_validation', r"", x)) for x in list_pattern_temp]
    
    df_output_rf_pattern = pd.DataFrame()
    
    for product in list_pattern_temp:
        print('Currently Running Product: %s' % product)
        print('Progressing: {0} %'.format(round(list_pattern_temp.index(product) / len(list_pattern_temp) * 100, 2)))

        dataframe_product = pd.concat([data_pattern[[product]], data_pattern[not_id_cols]], axis=1, ignore_index=False)

        # Create Lag variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (t-{lag})': dataframe_product[product].shift(lag)
            for lag in lags
        })

        # Create Rolling Moving Average variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (mva_{moving_average})': dataframe_product[product].shift(1).rolling(moving_average).mean()
            for moving_average in moving_averages
        })

        # Create Rolling Standard Deviation variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (std_{std})': dataframe_product[product].shift(1).rolling(std).std()
            for std in stds
        })

        # Create Rolling Maximum variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (max_{maximum})': dataframe_product[product].shift(1).rolling(maximum).max()
            for maximum in maximums
        })

        # Create Rolling Minimum variables
        dataframe_product = dataframe_product.assign(**{
            f'{product} (min_{minimum})': dataframe_product[product].shift(1).rolling(minimum).min()
            for minimum in minimums
        })

        # Create Rolling Total variables 
        dataframe_product = dataframe_product.assign(**{
            f'{product} (total_{total})': dataframe_product[product].shift(1).rolling(total).sum()
            for total in totals
        })

        # Create Zero Count variables
        count_func = lambda x: (x==0).sum()

        dataframe_product = dataframe_product.assign(**{
            f'{product} (zero_count_in_{num_days_zero})': dataframe_product[product].shift(1).rolling(num_days_zero).apply(count_func)
            for num_days_zero in num_days_zeros
        })
        
        dataframe_product.dropna(inplace=True)
        dataframe_product.set_index('Date', inplace=True)

        train_data = dataframe_product[dataframe_product.index <= end_train_date]
        test_data = dataframe_product[dataframe_product.index > end_train_date]

        corr_matrix = train_data.loc[:, train_data.columns != product].corr().abs()
        high_corr_var=np.where(corr_matrix > correlation_threshold)
        high_corr_var=[(corr_matrix.columns[x],corr_matrix.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
        
        train_data.drop([i[1] for i in high_corr_var], axis=1, inplace=True)
        test_data.drop([i[1] for i in high_corr_var], axis=1, inplace=True)
        
        X_train = train_data.drop(product, axis=1)
        y_train = train_data[product]

        X_test = test_data.drop(product, axis=1)
        y_test = test_data[product]

        forest = RandomForestRegressor(n_jobs=1,
                                      n_estimators = n_estimators_chosen,
                                      min_samples_leaf = min_samples_leaf_chosen,
                                      max_samples = max_samples_choose,
                                      max_features = max_features_choose,
                                      max_depth = max_depth_choose,
                                      bootstrap = bootstrap,
                                      random_state = 0)

        forest.fit(X_train,y_train)
        y_pred = forest.predict(X_test)
        df_output_temp = pd.DataFrame({'Actual Data': y_test, 
                                       'Forecast': y_pred, 
                                       'Product': [product for num_rows in range(len(y_pred))]
                                       })
        
        df_output_rf_pattern = df_output_rf_pattern.append(df_output_temp, ignore_index=False)
        
    return df_output_rf_pattern

***Decide which pattern we are dealing with?:***

---
INTERMITTENT
---
LUMPY
---
ERRATIC
---
SMOOTH
---



In [74]:
# Start running Random Forest on Test Set

start = timer()

df_forecast_values = Random_Forest_Test_Data(list_smooth, not_id_cols, lags, moving_averages, stds,
                                             maximums, minimums, totals, num_days_zeros, correlation_threshold,
                                             end_train_date, n_estimators_chosen, min_samples_leaf_chosen,
                                             max_samples_choose, max_features_choose, max_depth_choose, bootstrap)

end = timer()

print('This line of code took {} minutes'.format((end-start) / 60))

Currently Running Product: HOUSEHOLD_1_003_CA_3
Progressing: 0.0 %
Currently Running Product: HOUSEHOLD_1_004_CA_3
Progressing: 0.26 %
Currently Running Product: HOUSEHOLD_1_007_CA_1
Progressing: 0.52 %
Currently Running Product: HOUSEHOLD_1_007_CA_3
Progressing: 0.78 %
Currently Running Product: HOUSEHOLD_1_018_CA_3
Progressing: 1.04 %
Currently Running Product: HOUSEHOLD_1_019_CA_1
Progressing: 1.3 %
Currently Running Product: HOUSEHOLD_1_019_CA_2
Progressing: 1.55 %
Currently Running Product: HOUSEHOLD_1_019_CA_3
Progressing: 1.81 %
Currently Running Product: HOUSEHOLD_1_027_CA_1
Progressing: 2.07 %
Currently Running Product: HOUSEHOLD_1_027_CA_2
Progressing: 2.33 %
Currently Running Product: HOUSEHOLD_1_027_CA_3
Progressing: 2.59 %
Currently Running Product: HOUSEHOLD_1_028_CA_3
Progressing: 2.85 %
Currently Running Product: HOUSEHOLD_1_029_CA_3
Progressing: 3.11 %
Currently Running Product: HOUSEHOLD_1_030_CA_3
Progressing: 3.37 %
Currently Running Product: HOUSEHOLD_1_033_CA_2
Pr

In [75]:
df_forecast_values

Unnamed: 0_level_0,Actual Data,Forecast,Product
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-04-11,0.0,1.664225,HOUSEHOLD_1_003_CA_3
2016-04-12,2.0,1.619430,HOUSEHOLD_1_003_CA_3
2016-04-13,3.0,1.568051,HOUSEHOLD_1_003_CA_3
2016-04-14,0.0,1.686539,HOUSEHOLD_1_003_CA_3
2016-04-15,1.0,1.584822,HOUSEHOLD_1_003_CA_3
...,...,...,...
2016-04-20,20.0,0.474663,HOUSEHOLD_2_513_CA_3
2016-04-21,8.0,3.731621,HOUSEHOLD_2_513_CA_3
2016-04-22,2.0,4.161626,HOUSEHOLD_2_513_CA_3
2016-04-23,6.0,3.303114,HOUSEHOLD_2_513_CA_3


In [76]:
df_forecast_values.to_csv('Random Forest_Smooth_Forecast_Values.csv')

In [77]:
# User-defined funcion to calculate metrics

def get_metrics_result_all_params(data):
    metrics_df = {}
    metrics_df['MASE'] = mase_calculation(data['Actual Data'], data['Forecast'])
    metrics_df['WMAPE'] = wmape_calculation(data['Actual Data'], data['Forecast'])
    metrics_df['SMAPE'] = smape_calculation(data['Actual Data'], data['Forecast'])
    metrics_df['MAPE'] = mape_calculation(data['Actual Data'], data['Forecast'])
    metrics_df['MAE'] = mean_absolute_error(data['Actual Data'], data['Forecast'])
    metrics_df['RMSE'] = np.sqrt(mean_squared_error(data['Actual Data'], data['Forecast']))
    return pd.Series(metrics_df)

In [78]:
start = timer()

df_forecast_metrics = df_forecast_values.groupby('Product').apply(get_metrics_result_all_params).reset_index()

end = timer()
print('This line of code took {} minutes'.format((end-start) / 60))

This line of code took 0.014251464066668025 minutes


In [79]:
df_forecast_metrics

Unnamed: 0,Product,MASE,WMAPE,SMAPE,MAPE,MAE,RMSE
0,HOUSEHOLD_1_003_CA_3,0.973009,167.6569,144.1082,38.8569,1.197549,1.296732
1,HOUSEHOLD_1_004_CA_3,0.773650,60.4450,74.2493,58.6793,2.201927,2.916559
2,HOUSEHOLD_1_007_CA_1,1.254479,60.7940,56.9646,69.4241,0.868486,1.034870
3,HOUSEHOLD_1_007_CA_3,0.784855,94.7680,106.4419,70.9550,2.233818,2.454202
4,HOUSEHOLD_1_018_CA_3,inf,inf,200.0000,,0.027458,0.033641
...,...,...,...,...,...,...,...
381,HOUSEHOLD_2_483_CA_3,0.582124,52.2419,55.2642,46.5724,1.791152,2.663307
382,HOUSEHOLD_2_490_CA_2,inf,inf,200.0000,,0.003603,0.003718
383,HOUSEHOLD_2_490_CA_3,inf,inf,200.0000,,0.000795,0.001204
384,HOUSEHOLD_2_510_CA_3,0.665290,42.1450,48.9147,39.2371,1.023522,1.283685


In [80]:
df_forecast_metrics.to_csv('Random Forest_Smooth_Forecast_Metrics.csv')