In [8]:
import pandas as pd
import pickle
import numpy as np
import ast

from itertools import product
from joblib import Parallel
from multiprocessing import cpu_count
from joblib import delayed
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.stattools import adfuller
# from arch.unitroot import PhillipsPerron


epsilon = 0.1

In [9]:
# PATH WHERE THE FILES ARE SAVED
azure_path = None #'/mnt/ddf/' # Azure Blob Storage
path = r'./'

# IMPORT OF DATABASE
ddf = pd.read_excel(path + r'/2_cleaned_data/clean_ddf.xlsx')
# ddf = pd.read_csv(path + r'clean_ddf.csv',sep=',')

# IMPORT OF DATA FROM FRAMEWORK CONFIGURATION
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# .sav files that were produced by a trusted run of this framework.
# The files are opened with `with` so the handles are closed deterministically.
frameworkconfig1 = (path + r'/3_variables/var1_framework_config.sav')
with open(frameworkconfig1, 'rb') as config_file:
    ddf_config_var = pickle.load(config_file)

# frameworkconfig2 = (path + r'/3_variables/var2_framework_config.sav')
# ddf_config_kpi = pickle.load(open(frameworkconfig2, 'rb'))

frameworkconfig3 = (path + r'/3_variables/var3_framework_config.sav')
with open(frameworkconfig3, 'rb') as config_file:
    ddf_config_par = pickle.load(config_file)

In [10]:
# Shorthand for the two configuration columns every lookup below filters on.
var_usage = ddf_config_var["VariableUsage"]
var_type = ddf_config_var["VariableType"]

# PRIMARY KEY (name of the first variable flagged as PRIMARY)
primary_row = np.where(var_usage=='PRIMARY')[0][0]
primary_key = ddf_config_var.loc[primary_row,'VariableName']

# FILTER BY COLUMN / FILTER BY VALUE (rows 0 and 1 of the parameter table)
filter_by = ddf_config_par.loc[0,'Value']
filter_value = ddf_config_par.loc[1,'Value']

# PRIMARY KEY LIST (restricted to the filter when one is configured)
if str(filter_by)=='nan':
    pk_list = ddf[primary_key].unique()
else:
    pk_list = ddf[primary_key][ddf[filter_by]==filter_value].unique()

# DATETIME VARIABLE
datetime_row = np.where(var_type=='DATETIME')[0][0]
date_column = ddf_config_var.loc[datetime_row,'VariableName']

# TARGET VARIABLE
target_row = np.where(var_usage=='TARGET')[0][0]
target = ddf_config_var.loc[target_row,'VariableName']

# EXOGENOUS VARIABLES (numeric ones first, then categorical ones)
exogenous = []
for exog_kind in ('NUMERIC', 'CATEGORICAL'):
    exog_rows = np.where(np.logical_and(var_usage=='EXOGENOUS', var_type==exog_kind))[0]
    exogenous.extend(ddf_config_var.loc[exog_rows,'VariableName'])

# TRAINING PERCENTAGE (row 2 of the parameter table)
ptrain = pd.to_numeric(ddf_config_par.loc[2,'Value'])

In [11]:
# ROW OF THE DATETIME VARIABLE IN THE VARIABLE CONFIGURATION
# Resolved once and positionally (first DATETIME row), the same way date_column
# is resolved, so the lookups below do not depend on that row carrying label 0.
datetime_cfg_row = np.where(ddf_config_var["VariableType"]=='DATETIME')[0][0]

# FORMAT OF DATETIME
form = ddf_config_var.loc[datetime_cfg_row,'Obs']

# DATETIME FREQUENCY
freq_codes = {'days': 'D', 'weeks': 'W', 'months': 'M', 'years': 'Y'}
update_frequency = ddf_config_var.loc[datetime_cfg_row,'UpdateFrequency']
if update_frequency not in freq_codes:
    # Fail here with a clear message instead of a NameError on `freq` further down.
    raise ValueError('Unsupported UpdateFrequency for the datetime variable: %r' % (update_frequency,))
freq = freq_codes[update_frequency]

# SET OF PRIMARY KEYS WITH COMPLETENESS
# .copy() makes the two-column frame independent of ddf, so the assignment below
# neither triggers SettingWithCopyWarning nor risks writing through to ddf.
ddf_completeness = ddf[[date_column,primary_key]].copy()
ddf_completeness[date_column] = pd.to_datetime(ddf_completeness[date_column], format=form)
pk_completeness = ddf_completeness.groupby(primary_key).agg(['min','max','count'])
pk_completeness.columns = ['FirstDate','LastDate','Total']
# Number of periods expected between the first and last date (inclusive).
# NOTE(review): recent pandas versions may reject the 'M' and 'Y' units in this
# division -- confirm against the installed pandas before using monthly/yearly data.
pk_completeness['ExpectedLength'] = pk_completeness['LastDate']-pk_completeness['FirstDate']
pk_completeness['ExpectedLength'] = round(pk_completeness['ExpectedLength']/np.timedelta64(1,freq)+1,0)
is_complete = pk_completeness['Total']==pk_completeness['ExpectedLength']
pk_completeness2 = set(pk_completeness.index[is_complete])
non_completeness = set(pk_list).difference(pk_completeness2)
# Keep only the primary keys whose series has no gaps.
pk_list = np.array(list(set(pk_list).intersection(pk_completeness2)))
pk_list.sort()

if len(non_completeness)==0:
    print('There are no series with problems of completeness')
else:
    print('The following primary keys have completeness problems:', non_completeness)

There are no series with problems of completeness


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Sort values by primary key and date (required: the train/test split below is positional)

In [12]:
# Order every series chronologically within its primary key.
# NOTE(review): if the date column is still stored as text, this sorts
# lexicographically -- confirm the column is datetime or ISO-formatted.
ddf = ddf.sort_values(by=[primary_key, date_column])

In [13]:
# MULTI-STEP SARIMAX FORECAST
def sarima_forecast(history, exog, config, forecast):
    """Fit a SARIMAX model on `history` and forecast `forecast` steps ahead.

    Parameters
    ----------
    history : array-like
        Observed values of the target series used for fitting.
    exog : pandas.DataFrame
        Exogenous regressors, one row per element of `history`. A frame with
        zero columns means "no exogenous regressors".
    config : tuple
        (p, d, q, P, D, Q, m, t): order, seasonal order and trend for SARIMAX.
    forecast : int
        Number of out-of-sample steps to predict.

    Returns
    -------
    tuple
        (model_fit, yhat): the fitted results object and the forecast values.

    Raises
    ------
    Exception
        Whatever SARIMAX construction, fitting or forecasting raises. In the
        no-regressor branch the error is printed and then re-raised, so callers
        see the real cause instead of an UnboundLocalError on the return line.
    """
    p,d,q,P,D,Q,m,t = config
    if exog.shape[1]==0:
        try:
            # define model
            model = SARIMAX(endog=np.array(history), order=(p,d,q), seasonal_order=(P,D,Q,m), trend=t, enforce_stationarity=False, enforce_invertibility=False)
            # fit model
            model_fit = model.fit(disp=False)
            # make multi-step forecast
            yhat = model_fit.forecast(steps=forecast)
        except Exception as err:
            print(err)
            raise
    else:
        # define model
        model = SARIMAX(endog=np.array(history), exog=np.array(exog), order=(p,d,q), seasonal_order=(P,D,Q,m), trend=t, enforce_stationarity=False, enforce_invertibility=False)
        # fit model
        model_fit = model.fit(disp=False)
        # Future regressor values are unknown, so each column is held constant:
        # the historical mean when its first value is a float, otherwise the
        # (first) mode. Columns are addressed by position on `exog` itself so the
        # function does not depend on a notebook-level list of column names.
        future_values = []
        for i in range(exog.shape[1]):
            column = exog.iloc[:,i]
            if isinstance(column.iloc[0], float):
                future_values.append(column.mean())
            else:
                future_values.append(column.mode().iloc[0])
        # one identical row per forecast step -> shape (forecast, n_regressors)
        exog_post = np.tile(np.array(future_values), (forecast, 1))
        # make multi-step forecast
        yhat = model_fit.forecast(steps=forecast, exog=exog_post)
    return model_fit, yhat

# SPLIT A DATASET INTO TRAIN/TEST SETS
def train_test_split(data, p_train):
    """Split a series positionally into a leading train part and a trailing test part.

    Parameters
    ----------
    data : array-like
        Observations in chronological order.
    p_train : float
        Fraction of observations assigned to the train part; the split index
        is round(p_train * len(data)).

    Returns
    -------
    tuple of numpy.ndarray
        (train, test), where train holds the observations before the split
        index and test holds the rest. The two parts do not overlap, so a
        forecast of len(test) steps made from `train` lines up with `test`
        element by element.
    """
    values = np.asarray(data)
    split = int(round(p_train*len(values),0))
    # Previously train ran one element further and contained test[0]; that leaked
    # the first test value into the fit and shifted every forecast by one step.
    return values[:split], values[split:]


# MAPE ERROR CALCULATION
def mean_absolute_percentage_error(y_true, y_pred, zero_substitute=None):
    """Return the mean absolute percentage error (in percent, 2 decimals).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as `y_true`.
    zero_substitute : float, optional
        Value that replaces exact zeros in `y_true` to avoid dividing by zero.
        Defaults to the notebook-level `epsilon`.

    Returns
    -------
    float
        100 * mean(|y_true - y_pred| / |y_true|), rounded to two decimals.
    """
    if zero_substitute is None:
        zero_substitute = epsilon
    # Work on float copies: with an integer array the substitute would be
    # truncated back to 0 and the division by zero would remain. Copying also
    # leaves the caller's data untouched.
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    y_true[y_true==0] = zero_substitute
    return round(np.mean(np.abs((y_true-y_pred)/y_true))*100,2)

# WALK-FORWARD VALIDATION
def walk_forward_validation_sarimax(data, p_train, cfg):
    """Score one SARIMAX configuration on a single train/test split.

    The model is fitted on the train part of `data` and forecasts as many
    steps as the test part holds; the score is the MAPE between both.
    Reads the notebook-level `exog` frame, of which the first len(train)
    rows are used as regressors.
    """
    train_part, test_part = train_test_split(data, p_train)
    train_exog = exog[0:len(train_part)]
    horizon = len(test_part)
    # sarima_forecast returns (model_fit, yhat); only the forecast is needed here
    _, forecast_values = sarima_forecast(train_part, train_exog, cfg, horizon)
    return mean_absolute_percentage_error(test_part, forecast_values)

# SCORE A MODEL
def score_model_sarimax(data, p_train, cfg, debug=False):
    """Evaluate one SARIMAX configuration and return (cfg, error).

    With debug=True, warnings are shown and any exception propagates.
    Otherwise warnings are silenced and any exception turns the result into
    None, which marks the configuration as unusable.
    """
    if debug:
        outcome = walk_forward_validation_sarimax(data, p_train, cfg)
    else:
        outcome = None
        try:
            # grid searches are too noisy with warnings enabled
            with catch_warnings():
                filterwarnings("ignore")
                outcome = walk_forward_validation_sarimax(data, p_train, cfg)
        except Exception:
            # a failure during validation suggests an unstable config
            outcome = None
    # report only configurations that produced a score
    if outcome is None:
        return (cfg, outcome)
    print('Model[%s] -> %.5f' % (cfg, outcome))
    return (cfg, outcome)


# GRID SEARCH CONFIGS
def grid_search(data, cfg_list, p_train, score_model, parallel=False):
    """Score every configuration and return the usable ones, best first.

    Parameters
    ----------
    data : array-like
        Series handed unchanged to `score_model`.
    cfg_list : iterable
        Configurations to evaluate.
    p_train : float
        Train fraction handed unchanged to `score_model`.
    score_model : callable
        Called as score_model(data, p_train, cfg); must return (cfg, error).
    parallel : bool, default False
        When True the configurations are evaluated through joblib with one
        job per CPU reported by cpu_count().

    Returns
    -------
    list of tuple
        (cfg, error) pairs whose error is neither None nor NaN, sorted by
        ascending error. Empty when no configuration produced a score.
    """
    if parallel:
        # execute configs in parallel
        executor = Parallel(n_jobs=cpu_count(), verbose = 10)
        tasks = (delayed(score_model)(data, p_train, cfg) for cfg in cfg_list)
        scores = executor(tasks)
    else:
        scores = [score_model(data, p_train, cfg) for cfg in cfg_list]
    # remove empty results (failed configs report None, degenerate ones NaN)
    scores = [r for r in scores if r[1] is not None and str(r[1]) != 'nan']
    # sort configs by error, asc
    scores.sort(key=lambda tup: tup[1])
    return scores


# CREATE SET OF SARIMAX CONFIGS TO TRY
def _parse_order_grid(raw):
    """Turn one configured order value into the list of values to try.

    An integer n (plain, numpy, or an integral float) means "every order from
    0 to n"; a list or tuple is used as given. Strings are first parsed with
    ast.literal_eval, so '2' and '[0, 12]' are both accepted.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw)
    if isinstance(raw, (int, np.integer)):
        return list(range(int(raw)+1))
    if isinstance(raw, float) and raw.is_integer():
        return list(range(int(raw)+1))
    if isinstance(raw, (list, tuple)):
        return list(raw)
    raise ValueError('Cannot interpret %r as a maximum order or a list of orders' % (raw,))


def sarima_configs(config_par=None):
    """Build the set of SARIMAX configurations to try.

    Parameters
    ----------
    config_par : pandas.DataFrame, optional
        Parameter table with a 'Value' column; rows 3-9 hold p, d, q, P, D, Q
        and m, row 10 holds the list of trend options. Defaults to the
        notebook-level `ddf_config_par`.

    Returns
    -------
    set of tuple
        Every combination (p, d, q, P, D, Q, m, t).

    Raises
    ------
    ValueError
        If one of rows 3-9 is neither an integer nor a list.
    """
    if config_par is None:
        config_par = ddf_config_par
    # rows 3..9 -> p, d, q, P, D, Q, m (same parsing rule for each)
    order_grids = [_parse_order_grid(config_par.loc[row,'Value']) for row in range(3, 10)]
    t_params = ast.literal_eval(config_par.loc[10,'Value'])
    # create config instances
    models = set(product(*order_grids, t_params))
    return models

# DATE OF TEST
def date_test(prod, p_train):
    """Return, as a tuple, the dates of primary key `prod` that fall in the test part.

    The test part starts at position round(p_train * n) of the n dates that
    the notebook-level `ddf` holds for `prod`.
    """
    series_dates = ddf[date_column][ddf[primary_key]==prod]
    total = len(series_dates)
    first_test = int(round(p_train*total,0))
    result = tuple(np.array(series_dates)[range(first_test,total)])
    return result
    
# FORECAST OF TEST
def forecast_test(data, p_train):
    """Forecast the test part of `data` with the best SARIMAX configuration.

    Uses the notebook-level `scores` (its first entry is the best config) and
    `exog`, and returns the forecast values as a tuple.
    """
    train_part, test_part = train_test_split(data, p_train)
    best_cfg = scores[0][0]
    train_exog = exog[0:len(train_part)]
    # element [1] of sarima_forecast's return value is the forecast
    result = tuple(sarima_forecast(train_part, train_exog, best_cfg, len(test_part))[1])
    return result

# HETEROSCEDASTICITY TEST
def test_heteroscedasticity(data,p_train):
    """Classify the residuals of the best SARIMAX model as hetero- or homoscedastic.

    The best configuration (first entry of the notebook-level `scores`) is
    refitted on the train part of `data`, and the fitted model's
    test_heteroskedasticity(None) result is compared against 0.05.
    """
    train_part = train_test_split(data, p_train)[0]
    fitted = sarima_forecast(train_part, exog[0:len(train_part)], scores[0][0], 1)[0]
    # NOTE(review): element [0][1] is presumably the p-value -- confirm with statsmodels docs
    checked_value = fitted.test_heteroskedasticity(None)[0][1]
    return 'heteroscedastic' if checked_value<0.05 else 'homoscedastic'

# AUGMENTED DICKEY-FULLER TEST (NULL HYPOTHESIS: THE SERIES IS NON-STATIONARY)
def dickey_fuller(data,p_train):
    """Label the train part of `data` as 'stationary' or 'non-stationary'.

    The series is called stationary when element [1] of adfuller's result
    is below 0.05.
    """
    train_part = train_test_split(data, p_train)[0]
    p_value = adfuller(train_part)[1]
    return 'stationary' if p_value<0.05 else 'non-stationary'

# PHILLIPS-PERRON TEST
def phillips_perron(data,p_train):
    """Label the train part of `data` using the Phillips-Perron test.

    Returns 'stationary' when the test's p-value is below 0.05,
    'non-stationary' otherwise, and 'not-applicable' when the test cannot be
    run. The PhillipsPerron import at the top of the notebook is commented
    out, so as the notebook stands this always yields 'not-applicable'.
    """
    try:
        if PhillipsPerron(train_test_split(data, p_train)[0]).pvalue<0.05:
            result = 'stationary'
        else:
            result = 'non-stationary'
    except Exception:
        # Covers a missing PhillipsPerron as well as failures of the test itself,
        # while still letting KeyboardInterrupt / SystemExit through.
        result = 'not-applicable'
    return result

In [14]:
# CONFIGURATIONS TO TEST
cfg_list = sarima_configs()
# cfg_list = set(list(cfg_list)[:4])
sarimax_grid_search = list()

for pro in pk_list:
    # HISTORICAL DATA
    data = ddf[target][ddf[primary_key]==pro]
    # EXOGENOUS VARIABLE DATA
    exog = ddf[exogenous][ddf[primary_key]==pro]
    # GRID SEARCH
    scores = grid_search(data, cfg_list, ptrain, score_model_sarimax, True)
    print('finished primary key',pro)
    # SAVE TOP CONFIG BY PK
    # sarimax_grid_search.append((pro,)+scores[0]+(date_test(pro,ptrain),)+(forecast_test(data,ptrain),)+(test_heteroscedasticity(data,ptrain),)+(dickey_fuller(data,ptrain),)+(phillips_perron(data,ptrain),))
    sarimax_grid_search.append((pro,)+scores[0]+(date_test(pro,ptrain),)+(forecast_test(data,ptrain),)+(test_heteroscedasticity(data,ptrain),)+(dickey_fuller(data,ptrain),))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    6.8s remaining:    6.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    6.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    6.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


finished primary key Retail1


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0936s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


finished primary key Retail10


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0862s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0650s.) Setting batch_size=6.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail11


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


finished primary key Retail12


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1434s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished


finished primary key Retail13


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1092s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


finished primary key Retail14


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1366s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


finished primary key Retail15


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0936s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


finished primary key Retail16


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0830s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s


finished primary key Retail17


[Parallel(n_jobs=4)]: Batch computation too fast (0.1084s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


finished primary key Retail18


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1100s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1248s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail19


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


finished primary key Retail2


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1128s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


finished primary key Retail3


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1096s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


finished primary key Retail4


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1414s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished


finished primary key Retail5


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1258s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


finished primary key Retail6


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1050s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0936s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail7
finished primary key Retail8


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished


In [32]:
# MULTI-STEP HOLT WINTER'S EXPONENTIAL SMOOTHING FORECAST
def ets_forecast(history, config, forecast):
    """Fit Holt-Winters exponential smoothing on `history` and forecast ahead.

    `config` is (trend, damped, seasonal, seasonal_periods, use_boxcox,
    remove_bias); `forecast` is the number of steps to predict. Returns the
    forecast values.
    """
    trend, damped, seasonal, periods, use_boxcox, remove_bias = config
    # NOTE(review): newer statsmodels releases rename `damped` to `damped_trend`
    # and move `use_boxcox` to the constructor -- confirm against the installed version.
    smoother = ExponentialSmoothing(
        np.array(history),
        trend=trend,
        damped=damped,
        seasonal=seasonal,
        seasonal_periods=periods,
    )
    fitted = smoother.fit(optimized=True, use_boxcox=use_boxcox, remove_bias=remove_bias)
    return fitted.forecast(steps=forecast)

# WALK-FORWARD VALIDATION
def walk_forward_validation_ets(data, p_train, cfg):
    """Score one exponential-smoothing configuration on a single train/test split.

    The model is fitted on the train part of `data` and forecasts as many
    steps as the test part holds; the score is the MAPE between both.
    """
    train_part, test_part = train_test_split(data, p_train)
    forecast_values = ets_forecast(train_part, cfg, len(test_part))
    return mean_absolute_percentage_error(test_part, forecast_values)

# SCORE A MODEL
def score_model_ets(data, p_train, cfg, debug=False):
    """Evaluate one exponential-smoothing configuration and return (cfg, error).

    With debug=True, warnings are shown and any exception propagates.
    Otherwise warnings are silenced and any exception turns the result into
    None, which marks the configuration as unusable.
    """
    # show all warnings and fail on exception if debugging
    if debug:
        result = walk_forward_validation_ets(data, p_train, cfg)
    else:
        # one failure during model validation suggests an unstable config
        try:
            # never show warnings when grid searching, too noisy
            with catch_warnings():
                filterwarnings("ignore")
                result = walk_forward_validation_ets(data, p_train, cfg)
        except Exception:
            # `except Exception` (not a bare except) so that KeyboardInterrupt
            # can still stop a long grid search
            result = None
    # check for an interesting result
    if result is not None:
        print('Model[%s] -> %.5f' % (cfg, result))
    return (cfg, result)


# CREATE SET OF SARIMA CONFIGS TO TRY
def ets_configs(config_par=None):
    """Build the set of exponential-smoothing configurations to try.

    Parameters
    ----------
    config_par : pandas.DataFrame, optional
        Parameter table with a 'Value' column; rows 11-16 hold the trend,
        damped, seasonal, seasonal-period, Box-Cox and remove-bias options.
        Defaults to the notebook-level `ddf_config_par`.

    Returns
    -------
    set of tuple
        Every combination (trend, damped, seasonal, seasonal_periods,
        use_boxcox, remove_bias).

    Raises
    ------
    ValueError
        If the seasonal-period entry (row 14) is neither an integer nor a list.
    """
    if config_par is None:
        config_par = ddf_config_par
    # define config lists
    t_params = ast.literal_eval(config_par.loc[11,'Value'])
    d_params = ast.literal_eval(config_par.loc[12,'Value'])
    s_params = ast.literal_eval(config_par.loc[13,'Value'])
    # Seasonal periods: an integer n means "every period from 0 to n", a list is
    # used as given. Strings are parsed first, so '2' and '[4, 12]' both work, and
    # numpy integers are accepted as well as plain ints.
    raw_periods = config_par.loc[14,'Value']
    if isinstance(raw_periods, str):
        raw_periods = ast.literal_eval(raw_periods)
    if isinstance(raw_periods, (int, np.integer)):
        p_params = list(range(int(raw_periods)+1))
    elif isinstance(raw_periods, (list, tuple)):
        p_params = list(raw_periods)
    else:
        raise ValueError('Cannot interpret %r as a maximum period or a list of periods' % (raw_periods,))
    b_params = ast.literal_eval(config_par.loc[15,'Value'])
    r_params = ast.literal_eval(config_par.loc[16,'Value'])
    # create config instances
    models = set(product(t_params,d_params,s_params,p_params,b_params,r_params))
    return models

# FORECAST OF TEST
def forecast_test(data, p_train):
    """Forecast the test part of `data` with the best exponential-smoothing configuration.

    Uses the first entry of the notebook-level `scores` as the best config and
    returns the forecast values as a tuple.

    NOTE: this definition replaces the SARIMAX `forecast_test` defined earlier
    in the notebook; re-running the SARIMAX grid-search cell after this one
    would call this version.
    """
    train_part, test_part = train_test_split(data, p_train)
    best_cfg = scores[0][0]
    result = tuple(ets_forecast(train_part, best_cfg, len(test_part)))
    return result

In [33]:
# CONFIGURATIONS TO TEST
cfg_list = ets_configs()
# cfg_list = set(list(ets_configs())[:5])
ets_grid_search = list()

for pro in pk_list:
    # HISTORICAL DATA
    data = ddf[target][ddf[primary_key]==pro]
    # GRID SEARCH
    # (`scores` must keep this name: forecast_test reads it as a notebook-level variable)
    scores = grid_search(data, cfg_list, ptrain, score_model_ets, True)
    print('finished primary key',pro)
    if not scores:
        # every configuration failed or scored NaN: there is no best model to
        # report, so skip this key instead of crashing on scores[0]
        print('no valid ETS configuration for primary key',pro)
        continue
    # SAVE TOP CONFIG BY PK
    ets_grid_search.append((pro,)+scores[0]+(date_test(pro,ptrain),)+(forecast_test(data,ptrain),))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0970s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   10.3s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   10.3s finished


finished primary key Retail1


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0810s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.1092s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail10


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.1248s.) Setting batch_size=2.


finished primary key Retail11


[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    8.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    8.9s finished


finished primary key Retail12


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.1092s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0312s.) Setting batch_size=12.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail13


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    0.6s finished


finished primary key Retail14


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0936s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0936s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail15


[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0780s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail16


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.1s finished


finished primary key Retail17


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0936s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.1s finished


finished primary key Retail18


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0624s.) Setting batch_size=6.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0834s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail19


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    8.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    8.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0936s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.1s remaining:    0.0s


finished primary key Retail2


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.6s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.6s finished


finished primary key Retail3


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.1092s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.4s finished


finished primary key Retail4


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0780s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.0s finished


finished primary key Retail5


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0680s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.8s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.8s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0936s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s


finished primary key Retail6


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    9.0s finished


finished primary key Retail7


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.1092s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    8.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    8.9s finished


finished primary key Retail8


In [34]:
# SARIMAX GRID-SEARCH SUMMARY
# (the Phillips-Perron test column is currently disabled, so only seven columns are named)
sarimax_results = pd.DataFrame(sarimax_grid_search)
sarimax_results.columns = [
    primary_key,
    'parameters_sarimax',
    'mape_sarimax',
    date_column,
    'forecast_sarimax',
    'heteroscedasticity test',
    'dickey-fuller test',
]

# ETS GRID-SEARCH SUMMARY
ets_results = pd.DataFrame(ets_grid_search)
ets_results.columns = [
    primary_key,
    'parameters_ets',
    'mape_ets',
    date_column,
    'forecast_ets',
]

# LEFT JOIN: EVERY SARIMAX ROW IS KEPT, ETS COLUMNS ARE ATTACHED ON (PRIMARY KEY, DATE)
join_keys = [primary_key, date_column]
merged_results = sarimax_results.merge(
    ets_results, how='left', left_on=join_keys, right_on=join_keys
)

# LONG FORMAT: ONE OUTPUT ROW PER ELEMENT OF EACH 'forecast_sarimax' SEQUENCE
# Per-row values (key, parameters, MAPE, test results) are repeated once per forecast element.
prod, par_sar, sar_mape, for_sar = [], [], [], []
par_es, es_mape, het_test, dickey_test = [], [], [], []

per_row_fields = zip(
    merged_results[primary_key],
    merged_results['parameters_sarimax'],
    merged_results['mape_sarimax'],
    merged_results['parameters_ets'],
    merged_results['mape_ets'],
    merged_results['heteroscedasticity test'],
    merged_results['dickey-fuller test'],
    merged_results['forecast_sarimax'],
)
for pk_value, sar_params, sar_error, ets_params, ets_error, het_value, dickey_value, sar_forecast in per_row_fields:
    forecast_values = list(np.array(sar_forecast))
    repeats = len(forecast_values)

    prod.extend([pk_value] * repeats)
    par_sar.extend([sar_params] * repeats)
    sar_mape.extend([sar_error] * repeats)
    es_mape.extend([ets_error] * repeats)
    par_es.extend([ets_params] * repeats)
    het_test.extend([het_value] * repeats)
    dickey_test.extend([dickey_value] * repeats)
    for_sar.extend(forecast_values)

# Dates and ETS forecasts are flattened independently of the SARIMAX forecasts; they must
# total the same number of elements, otherwise the column assignments below fail.
date = [stamp for stamps in merged_results[date_column] for stamp in np.array(stamps)]
for_es = [value for values in merged_results['forecast_ets'] for value in np.array(values)]

# FINAL DATAFRAME (columns are added in this exact order)
results = pd.DataFrame(prod)
results.columns = [primary_key]
ordered_columns = (
    (date_column, date),
    ('parameters_sarimax', par_sar),
    ('mape_sarimax', sar_mape),
    ('heteroscedasticity test', het_test),
    ('dickey-fuller test', dickey_test),
    ('forecast_sarimax', for_sar),
    ('parameters_ets', par_es),
    ('mape_ets', es_mape),
    ('forecast_ets', for_es),
)
for column_name, column_values in ordered_columns:
    results[column_name] = column_values

# MERGE PREDICTIONS RESULT WITH DDF DATAFRAME (left join keeps every row of ddf)
result_final = ddf.merge(results, how='left', left_on=join_keys, right_on=join_keys)

In [35]:
# EXCEL WITH DATA OF MODELS
# The context manager saves and closes the workbook on exit, replacing the explicit
# writer.save() call (removed in pandas 2.0). sheet_name is passed by keyword because
# passing it positionally is deprecated.
with pd.ExcelWriter(path + r'/4_outputs/ddf_predictions.xlsx') as writer:
    result_final.to_excel(writer, sheet_name='Sheet1')

# OUTPUT MODEL
# Pickle the long-format predictions; the with-block guarantees the file handle is
# flushed and closed even if pickling fails.
modeling = (path + r'/4_outputs/var_results_modeling.sav')
with open(modeling, 'wb') as modeling_file:
    pickle.dump(results, modeling_file)

In [36]:
# COPY OUTPUTS TO AZURE BLOB STORAGE (skipped when azure_path is None)
# NOTE(review): dbutils is not imported in this notebook; presumably it is the global
# provided by the Databricks runtime - confirm before running elsewhere.
if azure_path is not None:
    # Rewrite a local '/dbfs/' mount prefix into the 'dbfs:/' scheme form.
    dbfs_root = path.replace('/dbfs/', 'dbfs:/')

    # Both files are written under '<path>/4_outputs/' by the export cell, so the copy
    # sources must point there as well (previously '/4_outputs' was missing).
    dbutils.fs.cp(dbfs_root + '/4_outputs/ddf_predictions.xlsx', azure_path + '/4_outputs/ddf_predictions.xlsx')
    # NOTE(review): the pickle is written to 4_outputs but copied into 3_variables on
    # the Azure side; destination left unchanged - confirm this is intended.
    dbutils.fs.cp(dbfs_root + '/4_outputs/var_results_modeling.sav', azure_path + '/3_variables/var_results_modeling.sav')