In [14]:
import pandas as pd
import itertools 
import numpy as np
import pickle

# Destination root for the optional copy performed in the last cell.
# Leave as None to skip that copy.
azure_path = None #'/mnt/ddf/' # Azure Blob Storage
# Root folder that every input/output path below is appended to.
path = './' #r'/dbfs/FileStore/tables/' # databricks filesystem

# Model identifiers; combined with `model_variables` to build per-model
# column names of the form '<variable>_<model>' (e.g. 'forecast_sarimax').
model_names = ['sarimax', 'ets']
model_variables = ['forecast','mape','parameters']
# Metric prefix used to pick the best model per row (lowest value wins).
model_selection_metric = 'mape'

# Output file, relative to `path`. CSV alternative kept for reference:
#   output_file = '/4_outputs/ddf_dashboard.csv'
output_file = '/4_outputs/ddf_dashboard.xlsx'

# Load Prediction File

In [15]:
# Databricks only (kept for reference):
# dbutils.fs.cp("dbfs:/FileStore/tables/ddf_forecast.xlsx", "/mnt/ddf/")

# Read the predictions workbook into `ddf`; the location is relative to `path`.
predictions_file = path + r'/4_outputs/ddf_predictions.xlsx'
ddf = pd.read_excel(predictions_file)


# Load Configuration File

In [16]:
def _load_pickle(file_path):
    """Return the object unpickled from ``file_path``.

    The file is opened inside a ``with`` block so the handle is always
    closed, even when unpickling raises.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only point this at .sav files produced by trusted notebooks.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


# IMPORT OF VARIABLES FROM MODELING NOTEBOOK
# NOTE(review): `results` is reassigned further down (results = ddf.copy()),
# so the object loaded here is not used by the cells visible in this notebook.
modeling = (path + r'/4_outputs/var_results_modeling.sav')
results = _load_pickle(modeling)

# IMPORT OF DATA FROM FRAMEWORK CONFIGURATION
frameworkconfig1 = (path + r'/3_variables/var1_framework_config.sav')
ddf_config_var = _load_pickle(frameworkconfig1)

frameworkconfig2 = (path + r'/3_variables/var2_framework_config.sav')
ddf_config_kpi = _load_pickle(frameworkconfig2)

frameworkconfig3 = (path + r'/3_variables/var3_framework_config.sav')
ddf_config_par = _load_pickle(frameworkconfig3)


# Output Structure

TODO: build a generic way of adding new models to the output, instead of changing this cell every time a new model is implemented in the solution.

In [17]:
# Number of hierarchy levels supported by the output layout (Level_1 .. Level_n).
n_levels = 5
granularity_cols = ['Level_%i'%n for n in range(1, n_levels+1)]
# One column per (variable, model) pair, e.g. 'forecast_sarimax', 'mape_ets'.
model_cols = [x+"_"+y for y in model_names for x in model_variables]
model_selection_cols = ['BestModelName', 'Forecast','MAPE_Forecast', 'BestModelParameters', 'MAPE_Reference', 'Reference']
mandatory_cols = ['DFU']

# Full list of columns the exported file may contain.
output_cols = ['Date'] + mandatory_cols + granularity_cols + ['Target'] + model_cols + model_selection_cols

# Get the Date columns from the configuration file:
# exactly one variable typed 'datetime' and used for 'analysis' or 'time_series'.
date_column = ddf_config_var.VariableName[(ddf_config_var.VariableType.str.lower() == 'datetime') & \
                                          (ddf_config_var.VariableUsage.str.lower().isin(['analysis','time_series']))].values.tolist()
if len(date_column) != 1:
    raise Exception('Could not find the date column in the configuration file')
print('Configured date column: ', date_column)

# Get the primary key definition (granularity): variables with PK > 0, ordered by PK.
pk_cols = ddf_config_var[ddf_config_var.PK > 0].sort_values('PK').VariableName.values.tolist()
if len(pk_cols) == 0:
    raise Exception('Could not find the PK columns in the configuration file')
elif len(pk_cols) > n_levels:
    raise Exception('Number of PK levels is greater than support by the tool (%i)'%n_levels)
print('Configured PK columns: ', pk_cols)

# Get the target definition: exactly one variable with usage 'target'.
target_column = ddf_config_var.VariableName[ddf_config_var.VariableUsage.str.lower() == 'target'].values.tolist()
if len(target_column) != 1:
    # The original message was copy-pasted from the PK check and named the wrong column.
    raise Exception('Could not find the target column in the configuration file')
print('Configured target column: ', target_column)

# Get reference definition: a numeric variable with usage 'reference'.
ref_column = ddf_config_var.VariableName[(ddf_config_var.VariableType.str.lower() == 'numeric') & \
                                          (ddf_config_var.VariableUsage.str.lower() == 'reference')].values.tolist()
if len(ref_column) != 1:
    # Zero or several candidates: fall back to a placeholder so the later
    # renaming step always finds a reference column and its MAPE column.
    # Note that this adds two columns to `ddf` itself.
    print('Could not find a reference column for the forecast. Adding a dummy one')
    ref_column = ['DUMMY_REFERENCE']
    ddf[ref_column[0]] = 0
    ddf['MAPE_'+ref_column[0]] = np.nan
else:
    print('Configured forecast reference column: ', ref_column[0])



Configured date column:  ['Date']
Configured PK columns:  ['Unidade de Negocio', 'Product']
Configured target column:  ['Volume (Un)']
Could not find a reference column for the forecast. Adding a dummy one


# Convert the Output to the Expected Structure

In [18]:
# Renaming mapping: configured source column names -> fixed output names.
renaming= {
    date_column[0]: 'Date',
    target_column[0] : 'Target',
    ref_column[0]: 'Reference',
    'MAPE_'+ref_column[0]: 'MAPE_Reference'
}
# PK columns become Level_1, Level_2, ... in PK order.
renaming.update(dict(zip(pk_cols, granularity_cols[:len(pk_cols)])))

# rename() returns a new frame, so `ddf` is left untouched.
results = ddf.rename(columns=renaming)
# Keep only the columns that belong to the output layout. The explicit copy
# avoids SettingWithCopyWarning on the assignments below.
# NOTE(review): np.intersect1d returns the names sorted alphabetically, so the
# exported column order does not follow `output_cols` - confirm this is intended.
results = results[np.intersect1d(output_cols, results.columns)].copy()
results[mandatory_cols] = ddf[mandatory_cols]

# default levels: unused hierarchy levels are filled with a placeholder
for cname in granularity_cols[len(pk_cols):]:
    results[cname] = 'N/A'

# Best models
scores = np.array([model_selection_metric + "_" + x for x in model_names])
predictions = np.array(["forecast_" + x for x in model_names])

score_matrix = results[scores].values.astype(float)
has_forecast = results[predictions].notnull().values
# A model can win on score only if it has both a forecast and a score.
# np.argmin alone would return the position of a NaN score, i.e. it would
# select a model that has no score at all.
usable = has_forecast & ~np.isnan(score_matrix)
# Clip usable scores below +inf so an infinite score still beats an unusable model.
masked_scores = np.where(usable, np.minimum(score_matrix, np.finfo(float).max), np.inf)
best_scored = masked_scores.argmin(axis=1)
# When no model has a score, fall back to the first model that has a forecast.
first_with_forecast = has_forecast.argmax(axis=1)
best_idx = np.where(usable.any(axis=1), best_scored, first_with_forecast)
# A row is 'N/A' only when no model produced a forecast (previously only the
# first model's forecast was checked, hiding valid forecasts of other models).
has_best = has_forecast.any(axis=1)
row_idx = np.arange(len(results))

results['BestModelName'] = np.where(has_best, np.array(model_names)[best_idx], 'N/A')

# Copy the winning model's forecast, MAPE and parameters; NaN where there is no winner.
for out_col, prefix in [('Forecast', 'forecast_'),
                        ('MAPE_Forecast', 'mape_'),
                        ('BestModelParameters', 'parameters_')]:
    candidates = results[[prefix + name for name in model_names]].values
    picked = candidates[row_idx, best_idx]
    # tolist() lets pandas infer the column dtype as it did with the original list.
    results[out_col] = np.where(has_best, picked, np.nan).tolist()

# Fix date format: parse with the format stored in the `Obs` field of the
# configured date variable, then export as ISO 'YYYY-MM-DD' strings.
date_format = ddf_config_var[ddf_config_var.VariableName == date_column[0]].Obs.values[0]
results['Date'] = pd.to_datetime(results.Date, format=date_format)
results['Date'] = results.Date.dt.strftime('%Y-%m-%d')

# Export Results

In [19]:
# Write the dashboard table to `path + output_file` without the index column.
# The `encoding` argument was dropped: DataFrame.to_excel no longer accepts it
# (removed in pandas 2.0), so passing it raises TypeError on current pandas.
# CSV alternative kept for reference:
#   results.to_csv(path + output_file, index=None, sep=';', encoding='latin1')
results.to_excel(path + output_file, index=False)

In [20]:
# Optional copy of the exported file to the Azure location (Databricks only;
# `dbutils` is provided by the Databricks runtime, not imported here).
if azure_path is not None:
    # `output_file` already starts with '/4_outputs/'. The previous destination
    # added '/4_outputs/' a second time, producing '<azure_path>/4_outputs//4_outputs/...'.
    # rstrip avoids a doubled slash when azure_path ends with '/'.
    dbutils.fs.cp(path.replace('/dbfs/','dbfs:/') + output_file, azure_path.rstrip('/') + output_file)