## Meteo Bakery - Presentation Figures
In this notebook, we will generate figures for presentation

### import libraries

In [None]:
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
import os

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit

from lightgbm import LGBMRegressor
import shap

### load data

In [None]:
# read the combined dataset, parse the 'date' column and use it as the index
df = pd.read_csv('../data/data_combined.csv')
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
df.head()

### transform periodic month feature using sine and cosine functions

In [None]:
# encode the month as a point on a circle with a period of 12 (angle = month * pi / 6);
# computed on the whole column at once instead of with a per-row apply
df['month_sin'] = np.sin(df['month'] * np.pi / 6)
df['month_cos'] = np.cos(df['month'] * np.pi / 6)

### select only years before 2020 (2020 itself is excluded)

In [None]:
# keep only observations whose year is smaller than 2020
df = df.loc[df['year'] < 2020]

### generate lag features
We will use sales with a lag of 7 and 365 days, since these lags showed peaks in the partial autocorrelation plots.

In [None]:
# utility function for generating lagged features
def get_lag_features(df, grouping_vars, feature, lags):
    """Add lagged copies of a feature to a stacked time series dataframe.

    The dataframe is split into individual time series, one per combination of the unique values
    of the grouping variables. Within each series, the value of `feature` found `lag` days before
    a row's index timestamp is stored in a new column named ``{feature}_lag_{lag}``. Rows without
    an observation `lag` days earlier receive NaN.

    Args:
        df (pd.DataFrame): Stacked time series dataframe indexed by timestamps.
        grouping_vars (list): Names of the columns identifying the individual time series.
            Any number of grouping variables is accepted.
        feature (str): Name of the feature, for which lags should be generated.
        lags (list): A list of lags (in days) to generate lag features for.

    Returns:
        pd.DataFrame: A new dataframe (the input is not modified) containing the lag features as
        additional columns. Rows are ordered series by series.
    """
    # collect the per-series frames and concatenate once at the end instead of growing
    # a dataframe inside the loop
    series_frames = []
    group_values = [df[var].unique() for var in grouping_vars]

    for group in product(*group_values):
        # boolean mask selecting the rows of the current series
        mask = np.ones(len(df), dtype=bool)
        for var, value in zip(grouping_vars, group):
            mask &= (df[var] == value).to_numpy()
        # combinations of grouping values that never occur together have no rows
        if not mask.any():
            continue

        ts = df[mask].copy()
        # lookup table: timestamp -> feature value within this series
        target_map = ts[feature].to_dict()
        # shift the timestamps back by each lag and look up the value observed on that day
        for lag in lags:
            ts[f'{feature}_lag_{lag}'] = (ts.index - pd.Timedelta(f'{lag} days')).map(target_map)

        series_frames.append(ts)

    if not series_frames:
        # nothing to lag: return an empty frame that still exposes the expected columns
        df_lag = df.iloc[0:0].copy()
        for lag in lags:
            df_lag[f'{feature}_lag_{lag}'] = np.nan
        return df_lag

    return pd.concat(series_frames, axis=0)

### function for generating lead features

In [None]:
# utility function for generating lead features
def get_lead_features(df, grouping_vars, feature, leads):
    """Add lead (future) copies of a feature to a stacked time series dataframe.

    The dataframe is split into individual time series, one per combination of the unique values
    of the grouping variables. Within each series, the value of `feature` found `lead` days after
    a row's index timestamp is stored in a new column named ``{feature}_lead_{lead}``. Rows without
    an observation `lead` days later receive NaN.

    Args:
        df (pd.DataFrame): Stacked time series dataframe indexed by timestamps.
        grouping_vars (list): Names of the columns identifying the individual time series.
            Any number of grouping variables is accepted.
        feature (str): Name of the feature, for which leads should be generated.
        leads (list): A list of leads (in days) to generate lead features for.

    Returns:
        pd.DataFrame: A new dataframe (the input is not modified) containing the lead features as
        additional columns. Rows are ordered series by series.
    """
    # collect the per-series frames and concatenate once at the end instead of growing
    # a dataframe inside the loop
    series_frames = []
    group_values = [df[var].unique() for var in grouping_vars]

    for group in product(*group_values):
        # boolean mask selecting the rows of the current series
        mask = np.ones(len(df), dtype=bool)
        for var, value in zip(grouping_vars, group):
            mask &= (df[var] == value).to_numpy()
        # combinations of grouping values that never occur together have no rows
        if not mask.any():
            continue

        ts = df[mask].copy()
        # lookup table: timestamp -> feature value within this series
        target_map = ts[feature].to_dict()
        # shift the timestamps forward by each lead and look up the value observed on that day
        for lead in leads:
            ts[f'{feature}_lead_{lead}'] = (ts.index + pd.Timedelta(f'{lead} days')).map(target_map)

        series_frames.append(ts)

    if not series_frames:
        # nothing to shift: return an empty frame that still exposes the expected columns
        df_lead = df.iloc[0:0].copy()
        for lead in leads:
            df_lead[f'{feature}_lead_{lead}'] = np.nan
        return df_lead

    return pd.concat(series_frames, axis=0)

### generate lag features for turnover (lag 7, lag 365)

In [None]:
# add turnover lagged by 7 and by 365 days for every branch/product series
df_lag = get_lag_features(
    df,
    grouping_vars=['branch', 'product'],
    feature='turnover',
    lags=[7, 365],
)

In [None]:
# inspect turnover and its lag features for one example series from 2012-12-20 onwards
df_lag.loc[
    (df_lag.index >= pd.to_datetime('2012-12-20'))
    & (df_lag['branch'] == 'Metro')
    & (df_lag['product'] == 'Brown Bread'),
    ['turnover', 'turnover_lag_7', 'turnover_lag_365'],
].head(10)

### generate lead features for weather (lead 1; temperature, rain, snow)

In [None]:
# add the next-day (lead 1) value of each weather variable, one feature at a time
for weather_var in ['temp_mean', 'rain_1h_mean', 'snow_1h_mean']:
    df_lag = get_lead_features(df_lag, ['branch', 'product'], weather_var, [1])

### replace missing values
Previous analyses showed that a couple of days are missing from the sales data. For the branches located at the Metro and Train Station, there is a total of 4 missing days. By contrast, 69 days are missing for the Center branch in the years 2012-2019. They frequently fall on a public holiday, thus indicating that this branch was probably closed on these days.
We will first replace NaNs at the Center branch by 1 if they occur on a public holiday. Remaining NaNs will be replaced with the turnover of the corresponding day of the preceding week; where that is also missing, a forward fill will be used.

In [None]:
# number of non-missing values per series; several columns must be selected with a list,
# selecting them with a bare tuple after groupby is rejected by recent pandas versions
df_lag.groupby(['branch', 'product'])[['turnover', 'month']].count()

In [None]:
df_repl = df_lag.copy()

# replace NaN at Center branch by 1 if occurring on a public holiday
# (the value 1 is what the cross-validation helpers further down treat as "missing on a holiday")
closed_on_holiday = (
    (df_repl['branch'] == 'Center')
    & (df_repl['public_holiday'] == True)
    & df_repl['turnover'].isna()
)
df_repl.loc[closed_on_holiday, 'turnover'] = 1

# fill NaN with sales from the same day of the preceding week
df_repl['turnover'] = df_repl['turnover'].fillna(df_repl['turnover_lag_7'])

# fill remaining NaN using forward fill within each branch/product series.
# The former loop called ffill(inplace=True) on a filtered copy of df_repl, which left
# df_repl itself unchanged; the grouped ffill result is assigned back explicitly instead.
# .to_numpy() assigns by position because the stacked index contains repeated dates.
df_repl['turnover'] = df_repl.groupby(['branch', 'product'])['turnover'].ffill().to_numpy()

In [None]:
# spot check: an example date with a missing value at the Train Station branch
df_repl.loc[
    (df_repl.index == pd.to_datetime('2012-02-22')) & (df_repl['branch'] == 'Train_Station'),
    ['turnover', 'turnover_lag_7'],
]

In [None]:
# number of non-missing values per series after the replacement step
count_cols = ['turnover', 'turnover_lag_7', 'month']
df_repl.groupby(['branch', 'product'])[count_cols].count()

### generate train and test df

In [None]:
# temporal split: years before 2018 for training, 2018 and later for testing
df_train = df_repl.loc[df_repl['year'] < 2018]
df_test = df_repl.loc[df_repl['year'] >= 2018]

### Naive Seasonal baseline
We will first define a utility function to perform cross-validation on naive seasonal baseline.

#### use sales 14 days ago if prediction contains missing from closed Center branch on public holidays
It is possible that a prediction for Naive Seasonal baseline includes values == 1 as prediction, which reflected NaN from public holidays in the preceding 7 day-interval used for prediction in case of Center branch.

In such a case, replace any NaN (currently encoded as 1) by the turnover from exactly 7 days earlier. Thus, if a holiday is contained in the week preceding the prediction, that holiday is replaced by the sales data from the same weekday of the week before. In other words, when the day 7 days before the predicted day was a holiday, the sales from 14 days ago are used as the prediction instead.

Similarly, it is possible that observations in the validation set contain NaNs. We will extract index positions for such events and then delete them from both validation set and predicted values before computing MAPE score.

In [None]:
# utility function for naive seasonal baseline corrected for holiday effects
def crossval_naive(df_train, grouping_vars, target, splits=52, test_size=7, gap=0):
    """Cross-validate a naive seasonal baseline on every series of a stacked dataframe.

    For each combination of the unique values of the two grouping variables, the series is
    split with TimeSeriesSplit. In every fold the forecast consists of the last 7 target values
    of the training part (the window is fixed at 7 rows, independent of `test_size`). Target
    values equal to 1 are treated as holiday placeholders: inside the forecast window they are
    replaced by the value 7 rows earlier, and validation positions equal to 1 are removed from
    both the observed and the predicted values before the MAPE is computed.

    Args:
        df_train (pd.DataFrame): A training dataframe containing stacked time series data
        grouping_vars (list): A list of exactly two grouping variables identifying the series.
        target (str): Prediction target
        splits (int, optional): Number of cross-validation splits. Defaults to 52.
        test_size (int, optional): Size of each validation set. Defaults to 7.
        gap (int, optional): Number of rows between end of training and start of validation set.
            Defaults to 0.

    Returns:
        pd.DataFrame: One row per series with the mean and the standard deviation of the fold-wise
        MAPE scores, followed by a final row labelled 'mean' that averages both columns over
        all series.
    """
    # result table, filled row by row
    cval = pd.DataFrame({'group': [], 'MAPE_mean': [], 'MAPE_std': []})

    first_values = df_train[grouping_vars[0]].unique()
    second_values = df_train[grouping_vars[1]].unique()

    for i, group in enumerate(product(first_values, second_values)):

        # rows belonging to the current series
        in_group = (df_train[grouping_vars[0]]==group[0]) & (df_train[grouping_vars[1]]==group[1])
        ts = df_train[in_group].copy()

        splitter = TimeSeriesSplit(n_splits=splits, test_size=test_size, gap=gap)
        # fold-wise MAPE scores of this series
        scores = []
        for train_i, val_i in splitter.split(ts):

            y_train = ts.iloc[train_i][target]
            y_val = ts.iloc[val_i][target]

            # positions inside the last 7 training values that carry the holiday placeholder 1
            last_week = y_train[-7:]
            holiday_pos = [pos for pos, value in enumerate(last_week.tolist()) if value == 1]

            if holiday_pos:
                # express the positions relative to the end of the training series ...
                from_end = [pos - 7 for pos in holiday_pos]
                # ... and locate the values another 7 rows earlier
                from_end_prev_week = [pos - 7 for pos in from_end]
                y_train_repl = y_train.copy()
                y_train_repl.iloc[from_end] = list(y_train.iloc[from_end_prev_week])
                y_pred = y_train_repl[-7:]
            else:
                y_pred = last_week

            # validation positions carrying the placeholder 1 are removed from both series
            val_holiday_pos = [pos for pos, value in enumerate(y_val.tolist()) if value == 1]
            if val_holiday_pos:
                y_val = y_val.drop(y_val.index[val_holiday_pos])
                y_pred = y_pred.drop(y_pred.index[val_holiday_pos])

            scores.append(mean_absolute_percentage_error(y_val, y_pred))

        # mean and standard deviation of the fold scores for this series
        cval.loc[i, 'group'] = f'{group[0]} | {group[1]}'
        cval.loc[i, 'MAPE_mean'] = np.mean(scores)
        cval.loc[i, 'MAPE_std'] = np.std(scores)

    # summary row: averages over all series, computed before the row is appended
    overall_mean = cval['MAPE_mean'].mean()
    overall_std = cval['MAPE_std'].mean()
    summary_row = i + 1
    cval.loc[summary_row, 'group'] = 'mean'
    cval.loc[summary_row, 'MAPE_mean'] = overall_mean
    cval.loc[summary_row, 'MAPE_std'] = overall_std

    return cval

In [None]:
# cross-validate the naive seasonal baseline on every branch/product series
naive = crossval_naive(
    df_train,
    grouping_vars=['branch', 'product'],
    target='turnover',
)

In [None]:
# display the cross-validation scores of the naive seasonal baseline
naive

### LightGBM
We will iterate over every time series and evaluate LightGBM performance with only temporal and additional weather features using TimeSeriesSplit Cross-Validation.

First, we will define a utility function performing Cross-Validation.

In [None]:
# utility function for LightGBM
def crossval_lgbm(df_train, grouping_vars, target, features, lgbm_kwargs=None, splits=52, test_size=7, gap=0):
    """Cross-validate a LightGBM regressor on every series of a stacked dataframe.

    For each combination of the unique values of the two grouping variables, the series is
    split with TimeSeriesSplit; a fresh LGBMRegressor is fitted on the training part of every
    fold and scored on the validation part with MAPE. Validation targets equal to 1 (the
    placeholder for values missing on a holiday) are removed, together with the matching
    predictions, before scoring.

    Args:
        df_train (pd.DataFrame): A training dataframe containing stacked time series data
        grouping_vars (list): A list of exactly two grouping variables identifying the series.
        target (str): Prediction target
        features (list): List of feature names to be used for training the model
        lgbm_kwargs (dict, optional): LightGBM hyperparameters passed on to LGBMRegressor.
            Defaults to None, in which case only the fixed objective and random_state are set.
        splits (int, optional): Number of cross-validation splits. Defaults to 52.
        test_size (int, optional): Size of each validation set. Defaults to 7.
        gap (int, optional): Number of rows between end of training and start of validation set.
            Defaults to 0.

    Returns:
        pd.DataFrame: One row per series with the mean and the standard deviation of the fold-wise
        MAPE scores, followed by a final row labelled 'mean' that averages both columns over
        all series.
    """
    # imported locally: a later cell of this notebook rebinds the module-level name `product`
    from itertools import product

    extra_params = {} if lgbm_kwargs is None else lgbm_kwargs

    # result table, filled row by row
    cval = pd.DataFrame({'group': [], 'MAPE_mean': [], 'MAPE_std': []})

    first_values = df_train[grouping_vars[0]].unique()
    second_values = df_train[grouping_vars[1]].unique()

    for i, group in enumerate(product(first_values, second_values)):

        # rows belonging to the current series
        in_group = (df_train[grouping_vars[0]]==group[0]) & (df_train[grouping_vars[1]]==group[1])
        ts = df_train[in_group].copy()

        splitter = TimeSeriesSplit(n_splits=splits, test_size=test_size, gap=gap)
        # fold-wise MAPE scores of this series
        scores = []
        for train_i, val_i in splitter.split(ts):

            train, val = ts.iloc[train_i], ts.iloc[val_i]
            X_train, y_train = train[features], train[target]
            X_val, y_val = val[features], val[target]

            # a new model per fold, always with the same seed
            lgbm = LGBMRegressor(objective='regression', random_state=42, **extra_params)
            lgbm.fit(X_train, y_train)
            # predictions are positionally aligned with y_val (fresh 0..n-1 index)
            y_pred = pd.Series(lgbm.predict(X_val))

            # validation positions carrying the placeholder 1 are removed from both series
            val_holiday_pos = [pos for pos, value in enumerate(y_val.tolist()) if value == 1]
            if val_holiday_pos:
                y_val = y_val.drop(y_val.index[val_holiday_pos])
                y_pred = y_pred.drop(y_pred.index[val_holiday_pos])

            scores.append(mean_absolute_percentage_error(y_val, y_pred))

        # mean and standard deviation of the fold scores for this series
        cval.loc[i, 'group'] = f'{group[0]} | {group[1]}'
        cval.loc[i, 'MAPE_mean'] = np.mean(scores)
        cval.loc[i, 'MAPE_std'] = np.std(scores)

    # summary row: averages over all series, computed before the row is appended
    overall_mean = cval['MAPE_mean'].mean()
    overall_std = cval['MAPE_std'].mean()
    summary_row = i + 1
    cval.loc[summary_row, 'group'] = 'mean'
    cval.loc[summary_row, 'MAPE_mean'] = overall_mean
    cval.loc[summary_row, 'MAPE_std'] = overall_std

    return cval

In [None]:
# LightGBM hyperparameters taken from the previous grid search
params_optimal = dict(
    boosting_type='dart',
    n_estimators=100,
    learning_rate=0.1,
)

In [None]:
# feature sets: temporal features only, and temporal features extended by weather features
time_features = [
    'turnover_lag_7', 'turnover_lag_365',
    'month_sin', 'month_cos', 'day_of_week',
    'school_holiday', 'public_holiday',
]

weather_features = time_features + [
    # daily weather values
    'temp_mean', 'humidity_mean', 'rain_1h_mean', 'snow_1h_mean',
    # day-type indicators
    'day_frosty', 'day_thunder', 'day_clear', 'day_hazy', 'day_summer',
    # *_dev columns
    'temp_mean_dev', 'humidity_mean_dev', 'pressure_mean_dev', 'rain_1h_mean_dev', 'snow_1h_mean_dev',
    # *_change columns
    'temp_mean_change', 'pressure_mean_change', 'humidity_mean_change',
    # next-day values (lead 1)
    'temp_mean_lead_1', 'rain_1h_mean_lead_1', 'snow_1h_mean_lead_1',
]

In [None]:
# cross-validate LightGBM using the temporal features only
lgbm_time = crossval_lgbm(
    df_train,
    grouping_vars=['branch', 'product'],
    target='turnover',
    features=time_features,
    lgbm_kwargs=params_optimal,
)

In [None]:
# cross-validate LightGBM using temporal plus weather features
lgbm_weather = crossval_lgbm(
    df_train,
    grouping_vars=['branch', 'product'],
    target='turnover',
    features=weather_features,
    lgbm_kwargs=params_optimal,
)

In [None]:
# display the cross-validation scores of the LightGBM model trained on temporal features
lgbm_time

In [None]:
# display the cross-validation scores of the LightGBM model trained on temporal and weather features
lgbm_weather

### merge cross-validation results

In [None]:
# put the scores of the three models side by side; each model gets its own column suffix
scores_merged = pd.concat(
    [
        naive.rename(columns={'MAPE_mean': 'MAPE_mean_naive', 'MAPE_std': 'MAPE_std_naive'}),
        lgbm_time[['MAPE_mean', 'MAPE_std']].add_suffix('_lgbm_time'),
        lgbm_weather[['MAPE_mean', 'MAPE_std']].add_suffix('_lgbm_weather'),
    ],
    axis=1,
)

scores_merged

In [None]:
# save cross-validation results; the target directory is created first so that
# the cell also runs when the directory does not exist yet
cv_results_path = '../models/lgbm_optimized/CV_baseline_lgbm.csv'
os.makedirs(os.path.dirname(cv_results_path), exist_ok=True)
scores_merged.to_csv(cv_results_path, index=False)

In [None]:
# reshape the merged scores to long format: one row per (group, model)
mean_cols = ['MAPE_mean_naive', 'MAPE_mean_lgbm_time', 'MAPE_mean_lgbm_weather']
std_cols = ['MAPE_std_naive', 'MAPE_std_lgbm_time', 'MAPE_std_lgbm_weather']

# the last 3 stacked rows (one per model) belong to the trailing 'mean' summary group and are cut off
scores_grouped = scores_merged[['group'] + mean_cols].set_index('group').stack().reset_index().iloc[:-3, :].copy()
scores_grouped.columns = ['group', 'model', 'MAPE_mean']

# same reshaping for the standard deviations; the stacked values end up in the column labelled 0
std_long = scores_merged[['group'] + std_cols].set_index('group').stack().reset_index().iloc[:-3, :]
scores_grouped['MAPE_std'] = std_long[0]

# keep only the last part of the column name as model label, e.g. 'MAPE_mean_lgbm_time' -> 'time'
scores_grouped['model'] = scores_grouped['model'].map(lambda name: name.split('_')[-1])
# express both scores in percent
scores_grouped[['MAPE_mean', 'MAPE_std']] = scores_grouped[['MAPE_mean', 'MAPE_std']] * 100

In [None]:
# show the last rows of the long-format score table
scores_grouped.tail()

In [None]:
# split the 'branch | product' group label into separate branch and product columns
scores_grouped['branch'] = scores_grouped['group'].map(lambda label: label.split(' | ')[0])
scores_grouped['product'] = scores_grouped['group'].map(lambda label: label.split(' | ')[1])
scores_grouped.head()

### plot mean MAPE and standard deviation from cross-validation over all groups

In [None]:
# bar chart of MAPE_mean per model; one bar per value of the 'model' column, coloured by the palette
fig = plt.figure(figsize=(3,3))
#fig.patch.set_visible(False)
sns.barplot(data=scores_grouped, x='model', y='MAPE_mean', palette=['#96ed89', '#4192d9', '#2c1dff'], edgecolor='black', errwidth=0)
plt.ylabel('Average prediction error [%]', fontsize=12)
# fixed y-axis ticks from 0 to 25 in steps of 5
plt.yticks(np.arange(0, 26, 5), fontsize=11)
plt.xlabel(None)
# replace the raw model labels on the x-axis by presentation labels
plt.xticks(ticks=np.arange(0, 3), labels=['Baseline', 'LightGBM time', 'LightGBM weather'], fontsize=12, rotation=45, ha='right');
plt.title('Model Comparison', fontsize=14)
plt.show()

In [None]:
# bar chart of MAPE_std per model; one bar per value of the 'model' column, coloured by the palette
fig = plt.figure(figsize=(3,3))
#fig.patch.set_visible(False)
sns.barplot(data=scores_grouped, x='model', y='MAPE_std', palette=['#96ed89', '#4192d9', '#2c1dff'], edgecolor='black', errwidth=0)
plt.ylabel('Variability of prediction error [%]', fontsize=12)
# fixed y-axis ticks from 0 to 25 in steps of 5
plt.yticks(np.arange(0, 26, 5), fontsize=11)
plt.xlabel(None)
# replace the raw model labels on the x-axis by presentation labels
plt.xticks(ticks=np.arange(0, 3), labels=['Baseline', 'LightGBM time', 'LightGBM weather'], fontsize=12, rotation=45, ha='right');
plt.title('Model Comparison', fontsize=14)
plt.show()

### plot scores separately for each branch

In [None]:
# grouped bar chart of MAPE_mean: one group of bars per branch (fixed order), one bar per model
fig = plt.figure(figsize=(4,3))
#fig.patch.set_visible(False)
sns.barplot(data=scores_grouped, x='branch', y='MAPE_mean', edgecolor='black', errwidth=0, hue='model', palette=['#96ed89', '#4192d9', '#2c1dff'],
                    order=['Metro', 'Train_Station', 'Center'])
plt.ylabel('Average prediction error [%]', fontsize=12)
# fixed y-axis ticks from 0 to 25 in steps of 5
plt.yticks(np.arange(0, 26, 5), fontsize=11)
plt.xlabel(None)
# presentation labels for the branches, in the same order as `order` above
plt.xticks(ticks=np.arange(0, 3), labels=['Metro', 'Train Station', 'Center'], fontsize=12, rotation=45, ha='right');
plt.title('Model Comparison', fontsize=14)
# legend placed to the right of the axes
leg= plt.legend(bbox_to_anchor=(1.01, 0.4), loc='upper left', frameon=False, fontsize=11)
# overwrite the first three legend entries with presentation labels
leg.get_texts()[0].set_text('Baseline')
leg.get_texts()[1].set_text('LightGBM time')
leg.get_texts()[2].set_text('LightGBM weather')

plt.show()

In [None]:
# grouped bar chart of MAPE_std: one group of bars per branch (fixed order), one bar per model
fig = plt.figure(figsize=(4,3))
# hide the figure background patch
fig.patch.set_visible(False)
sns.barplot(data=scores_grouped, x='branch', y='MAPE_std', edgecolor='black', errwidth=0, hue='model', palette=['#96ed89', '#4192d9', '#2c1dff'],
                    order=['Metro', 'Train_Station', 'Center'])
plt.ylabel('Variability of prediction error [%]', fontsize=12)
# fixed y-axis ticks from 0 to 25 in steps of 5
plt.yticks(np.arange(0, 26, 5), fontsize=11)
plt.xlabel(None)
# presentation labels for the branches, in the same order as `order` above
plt.xticks(ticks=np.arange(0, 3), labels=['Metro', 'Train Station', 'Center'], fontsize=12, rotation=45, ha='right');
plt.title('Model Comparison', fontsize=14)
# legend placed to the right of the axes
leg= plt.legend(bbox_to_anchor=(1.01, 0.4), loc='upper left', frameon=False, fontsize=11)
# overwrite the first three legend entries with presentation labels
leg.get_texts()[0].set_text('Baseline')
leg.get_texts()[1].set_text('LightGBM time')
leg.get_texts()[2].set_text('LightGBM weather')

plt.show()

### calculate feature importance for optimized LGBM model

In [None]:
def get_lgbm_feature_importance(df_train, grouping_vars, target, features, lgbm_kwargs=None, filepath=None):
    """Fit one LightGBM model per time series and collect the feature importances.

    For each combination of the unique values of the two grouping variables, an LGBMRegressor
    with importance_type='gain' is fitted on all rows of that series. Its feature importances
    are stored as one row of the returned dataframe; a final row labelled 'mean' holds the
    column-wise average over all series.

    Args:
        df_train (pd.DataFrame): A training dataframe containing stacked time series data
        grouping_vars (list): A list of exactly two grouping variables identifying the series.
        target (str): Prediction target
        features (list): List of feature names to be used for training the model
        lgbm_kwargs (dict, optional): LightGBM hyperparameters passed on to LGBMRegressor.
            Defaults to None, in which case only the fixed settings are used.
        filepath (str, optional): Directory in which each fitted booster is saved as
            'lgbm_<first group value>_<second group value>.txt'. Defaults to None (nothing is saved).

    Returns:
        pd.DataFrame: Columns 'group' plus one column per feature; one row per series and a
        final 'mean' row.
    """
    # result table with a 'group' column followed by one column per feature
    fimportance = pd.DataFrame({}, columns=['group']+features)
    feature_cols = fimportance.columns[1:]

    model_params = lgbm_kwargs if lgbm_kwargs is not None else {}

    first_values = df_train[grouping_vars[0]].unique()
    second_values = df_train[grouping_vars[1]].unique()

    for i, group in enumerate(product(first_values, second_values)):

        # rows belonging to the current series
        in_group = (df_train[grouping_vars[0]]==group[0]) & (df_train[grouping_vars[1]]==group[1])
        ts_train = df_train[in_group].copy()

        # fit the model on the complete series
        lgbm = LGBMRegressor(objective='regression', random_state=42, importance_type='gain', **model_params)
        lgbm.fit(ts_train[features], ts_train[target])

        # optionally persist the fitted booster
        if filepath is not None:
            model_file = os.path.join(filepath, f'lgbm_{group[0]}_{group[1]}.txt')
            lgbm.booster_.save_model(filename=model_file)

        # one result row per series
        fimportance.loc[i, 'group'] = f'{group[0]} | {group[1]}'
        fimportance.loc[i, feature_cols] = lgbm.feature_importances_.tolist()

    # summary row: column-wise mean over all series
    mean_row = i + 1
    fimportance.loc[mean_row, 'group'] = 'mean'
    fimportance.loc[mean_row, feature_cols] = [fimportance[col].mean() for col in feature_cols]

    return fimportance

In [None]:
# gain-based feature importances of the weather model for every branch/product series
lgbm_fimportance = get_lgbm_feature_importance(
    df_train,
    grouping_vars=['branch', 'product'],
    target='turnover',
    features=weather_features,
    lgbm_kwargs=params_optimal,
    filepath=None,
)

In [None]:
# express each feature's gain in percent of the total gain of its row (i.e. of its time series)
lgbm_fimportance_rel = lgbm_fimportance.copy()
feature_cols = lgbm_fimportance_rel.columns[1:]
row_totals = lgbm_fimportance_rel[feature_cols].sum(axis=1)

lgbm_fimportance_rel[feature_cols] = lgbm_fimportance_rel[feature_cols].div(row_totals, axis=0) * 100

In [None]:
# replace the 'mean' row with the relative importances averaged over all branch/product
# combinations. The 'mean' row is located by its label instead of a hard-coded position and
# is itself left out of the average, so re-running this cell gives the same result.
is_mean_row = lgbm_fimportance_rel['group'] == 'mean'
mean_row_label = lgbm_fimportance_rel.index[is_mean_row][0]
feature_cols = lgbm_fimportance_rel.columns[1:]
lgbm_fimportance_rel.loc[mean_row_label, feature_cols] = [
    np.mean(lgbm_fimportance_rel.loc[~is_mean_row, col]) for col in feature_cols
]

In [None]:
# long format: one row per (group, feature) with its relative importance; the 'mean' group is excluded
lgbm_fi_stacked = lgbm_fimportance_rel.set_index('group').stack().reset_index()
lgbm_fi_stacked.columns = ['group', 'features', 'importance']
lgbm_fi_stacked = lgbm_fi_stacked[lgbm_fi_stacked['group'].ne('mean')]

In [None]:
# split the 'branch | product' group label into separate branch and product columns
lgbm_fi_stacked['branch'] = lgbm_fi_stacked['group'].map(lambda label: label.split(' | ')[0])
lgbm_fi_stacked['product'] = lgbm_fi_stacked['group'].map(lambda label: label.split(' | ')[1])

In [None]:
# feature names ordered by their value in the 'mean' row, largest first
col_order_mean = (
    lgbm_fimportance_rel
    .set_index('group')
    .sort_values(by='mean', axis=1, ascending=False)
    .columns
)
col_order_mean

In [None]:
# rename features during plotting using feature map:
# keys are the feature column names, values are the labels shown on the plot axes
feature_map = {'turnover_lag_7':'turnover [lag 7]', 
                'day_of_week':'day of week', 
                'public_holiday': 'public holiday', 
                'turnover_lag_365': 'turnover [lag 365]',
                'temp_mean': 'temperature [daily mean]', 
                'snow_1h_mean_dev': 'snowfall [season. dev.]', 
                'month_cos': 'month [cosine-t.]', 
                'school_holiday': 'school holiday',
                'temp_mean_dev': 'temperature [season. dev.]', 
                'temp_mean_lead_1': 'temperature [next day]', 
                'month_sin': 'month [sine-t.]', 
                'rain_1h_mean_dev': 'rainfall [season. dev.]',
                'humidity_mean': 'humidity [daily mean]', 
                'pressure_mean_dev': 'atm. pressure [season. dev.]', 
                'humidity_mean_dev': 'humidity [season. dev.]',
                'pressure_mean_change': 'atm. pressure [change]', 
                'temp_mean_change': 'temperature [change]', 
                'humidity_mean_change': 'humidity [change]',
                'rain_1h_mean': 'rainfall [daily mean]', 
                'rain_1h_mean_lead_1': 'rainfall [next day]', 
                'day_hazy': 'hazy day', 
                'day_clear': 'clear day',
                'day_frosty': 'frosty day', 
                'day_summer': 'summer day', 
                'snow_1h_mean_lead_1': 'snowfall [next day]', 
                'snow_1h_mean': 'snowfall [daily mean]',
                'day_thunder': 'thunder day'
                }

### plot global feature importance averaged over all groups

In [None]:
# horizontal bar chart of the relative feature importance, features ordered by col_order_mean
fig = plt.figure(figsize=(7, 6))
sns.barplot(data=lgbm_fi_stacked, y='features', x='importance', color='#2c1dff', edgecolor='black', errwidth=0, order=col_order_mean)
plt.xlabel('Relative Importance [%]', fontsize=12)
plt.xticks(ticks=np.arange(0, 81, 20), labels=np.arange(0, 81, 20), fontsize=11)
plt.ylabel(None)
# one tick per feature; the count is derived from the ordering instead of being hard-coded,
# so ticks and labels stay in sync if the feature set changes
plt.yticks(ticks=np.arange(len(col_order_mean)), labels=col_order_mean.map(feature_map), fontsize=10)
plt.title('Feature Importance', fontsize=14)
plt.show()


### plot feature importance of weather features only separately for each product

In [None]:
# features that are not weather-related and are therefore left out of these plots
non_weather_features = ['turnover_lag_7', 'turnover_lag_365', 'month_sin', 'month_cos', 'day_of_week', 'school_holiday', 'public_holiday']

# per-series relative importances of the weather features only ('mean' summary row removed)
fimp_series = lgbm_fimportance_rel[lgbm_fimportance_rel['group'] != 'mean'].drop(columns=non_weather_features)
weather_cols = fimp_series.columns[1:]
# product part of each 'branch | product' group label, aligned with the rows of fimp_series
series_product = pd.Series([label.split(' | ')[1] for label in fimp_series['group']], index=fimp_series.index)

# the loop variable is deliberately not called `product`: that name would rebind
# itertools.product, which the helper functions of this notebook rely on
for product_name in lgbm_fi_stacked['product'].unique():
    # average importance over all series of this product, largest first
    mean_importance = fimp_series.loc[series_product == product_name, weather_cols].astype(float).mean(axis=0)
    temp_order = mean_importance.sort_values(ascending=False).index

    fig = plt.figure(figsize=(4, 5))
    # hide the figure background patch
    fig.patch.set_visible(False)
    sns.barplot(data=lgbm_fi_stacked[lgbm_fi_stacked['product']==product_name], y='features', x='importance', 
                    color='#2c1dff', edgecolor='blue', errwidth=0, order=temp_order)
    plt.xlabel('Relative Importance [%]', fontsize=12)
    plt.xticks(ticks=np.arange(0, 6, 1), labels=np.arange(0, 6, 1), fontsize=11)
    plt.ylabel(None)
    # one tick per plotted feature, derived from the ordering instead of a hard-coded count
    plt.yticks(ticks=np.arange(len(temp_order)), labels=temp_order.map(feature_map), fontsize=10)
    plt.title(f'Feature Importance - {product_name}', fontsize=14)
    plt.show()

### predict new cases
Here, we will generate predictions for the test set. First, we will define a utility function to perform predictions on the test set for defined time window (restricted to 7 days).

In [None]:
# utility function for predicting y-values using LightGBM for specified time window
def LGBM_predict(df_train, df_test, grouping_vars, target, features, lgbm_kwargs, start_date, end_date, compute_shap=False, plot=False, show_baseline=False):
    """Predict target values using LightGBM for a defined time window.

    Fits one LightGBM model per individual time series (every combination of the two
    grouping variables found in df_test) on the training data and predicts the rows of the
    test data that fall between start_date and end_date (both inclusive). Any window length
    is accepted, but the Seasonal Naive baseline and the plots are written for a window of
    7 days, so longer or shorter windows are not recommended.

    Values equal to 1 are treated as holiday markers: observed values of 1 inside the window
    are removed (together with the matching predictions) before the MAPE is computed.

    Args:
        df_train (pd.DataFrame): A training dataframe containing stacked time series data, indexed by date
        df_test (pd.DataFrame): A test dataframe containing stacked time series data, indexed by date.
            Must contain a 'turnover_lag_7' column, which is used as Seasonal Naive baseline.
        grouping_vars (list): A list of grouping variables, according to which the data is stacked. Currently accepts only a list of two variables.
        target (str): Prediction target
        features (list): List of feature names to be used for training the model. Does not need to contain 'turnover_lag_7'.
        lgbm_kwargs (dict or None): Dictionary of LGBM hyperparameters. Required argument; pass None to train with default hyperparameters.
        start_date (str): Start date of prediction time window
        end_date (str): End date of prediction time window
        compute_shap (bool, optional): Compute shap values for each prediction. Defaults to False.
        plot (bool, optional): Display time series plots for observed and predicted values. Defaults to False.
        show_baseline (bool, optional): Include predictions by Seasonal Naive baseline in time series plots. Defaults to False.

    Returns:
        dict: Dictionary with one entry per individual time series under each key:
            'group' (label 'value1 | value2'), 'y_pred' and 'y_true' (each a one-element list
            wrapping a pd.Series), 'MAPE' (float) and 'shap' (shap values of the predictions;
            this list stays empty unless compute_shap is True).
    """
    # Imported locally on purpose: notebook cells that use `product` as a loop variable rebind
    # the module-level name, and the call below must not depend on the cell execution history.
    from itertools import product

    # initialize container for the per-series results
    preds = {'group': [], 'y_pred': [], 'y_true': [], 'MAPE': [], 'shap': []}

    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)
    model_kwargs = {} if lgbm_kwargs is None else lgbm_kwargs

    # iterate over all individual series
    for group in product(df_test[grouping_vars[0]].unique(), df_test[grouping_vars[1]].unique()):

        # subselect time series for train and test
        ts_test = df_test[(df_test[grouping_vars[0]]==group[0]) & (df_test[grouping_vars[1]]==group[1])].copy()
        ts_train = df_train[(df_train[grouping_vars[0]]==group[0]) & (df_train[grouping_vars[1]]==group[1])].copy()

        # generate target and feature vectors
        X_train = ts_train[features]
        X_test = ts_test[features]
        y_train = ts_train[target]
        y_test = ts_test[target]

        # fit model to train data
        lgbm = LGBMRegressor(objective='regression', random_state=42, **model_kwargs)
        lgbm.fit(X_train, y_train)

        # extract prediction sample and predict
        in_window = (ts_test.index >= window_start) & (ts_test.index <= window_end)
        y_test_sample = y_test[in_window]
        X_test_sample = X_test[in_window]
        y_pred = pd.Series(lgbm.predict(X_test_sample))

        # Seasonal Naive baseline = turnover 7 days earlier. The lag column is read from the
        # full test frame (not from X_test) so that `features` need not contain it.
        # Missing lags are filled with 1, the value that marks a holiday.
        lag_7 = ts_test['turnover_lag_7']
        y_pred_naive = lag_7[in_window].fillna(1)
        # correct for holiday effects; if a holiday is in the baseline values, replace by sales 14 days ago
        holiday_pos = np.flatnonzero((y_pred_naive == 1).to_numpy()).tolist()
        if holiday_pos:
            # positions counted from the end of the series (assumes a 7-day window)
            idx_naive = [p - 7 for p in holiday_pos]
            idx_naive_lag = [p - 7 for p in idx_naive]
            y_pred_naive = lag_7[ts_test.index <= window_end].copy()
            # NOTE(review): replacement values are taken from the tail of y_train, which is only
            # the "14 days ago" value if the window directly follows the training data - confirm
            y_pred_naive.iloc[idx_naive] = y_train.iloc[idx_naive_lag].tolist()
            y_pred_naive = y_pred_naive[-7:]

        # correct for holiday effects in validation set if necessary
        # if holiday is in validation set, drop elements at corresponding index position in y_test, y_pred and baseline
        idx_test = np.flatnonzero((y_test_sample == 1).to_numpy())
        if idx_test.size > 0:
            y_test_sample = y_test_sample.drop(y_test_sample.index[idx_test])
            y_pred = y_pred.drop(y_pred.index[idx_test])
            y_pred_naive = y_pred_naive.drop(y_pred_naive.index[idx_test])

        # compute MAPE
        mape = mean_absolute_percentage_error(y_test_sample, y_pred)

        # append results
        preds['group'].append(f'{group[0]} | {group[1]}')
        preds['y_pred'].append([y_pred])
        preds['y_true'].append([y_test_sample])
        preds['MAPE'].append(mape)

        # compute shap values
        if compute_shap:
            explainer = shap.Explainer(lgbm)
            shap_values = explainer(X_test_sample)
            preds['shap'].append(shap_values)

        # plot prediction results over observed values
        if plot:
            sample_data = y_test_sample.reset_index()
            sample_data['y_pred_LGBM'] = y_pred.values
            sample_data['y_pred_naive'] = y_pred_naive.values
            sample_data.columns = ['date', 'observed', 'y_pred_LGBM', 'y_pred_naive']

            plt.figure(figsize=(6, 2))
            sns.lineplot(data=sample_data, x='date', y='observed', color='black')
            # plot also baseline predictions for reference if required
            if show_baseline:
                sns.lineplot(data=sample_data, x='date', y='y_pred_naive', color='red', marker='o')
            sns.lineplot(data=sample_data, x='date', y='y_pred_LGBM', color='blue', marker='o')
            plt.ylabel('Turnover [€]', fontsize=12)
            plt.yticks(fontsize=12)
            plt.ylim(0, np.max(sample_data['observed'])+100)
            plt.xlabel(None)
            plt.xticks(rotation=45, ha='right', fontsize=11)
            if show_baseline:
                plt.legend(labels=['observed', 'predicted by baseline', 'predicted by LGBM'], bbox_to_anchor=(1.05, 1.0), loc='upper left')
            else:
                plt.legend(labels=['observed', 'predicted'], bbox_to_anchor=(1.05, 1.0), loc='upper left')
            plt.title(f'{group[0]} | {group[1]}\n{start_date} - {end_date}', fontsize=14)
            plt.show()

    return preds

In [None]:
#lgbm_preds = LGBM_predict(df_train, df_test, grouping_vars=['branch', 'product'], target='turnover', features=weather_climat_dev_features, 
#                    lgbm_kwargs=params_optimal, start_date='2019-01-01', end_date='2019-01-07', compute_shap=True, plot=True, show_baseline=True)

### Make 1-year prediction
We will perform an error analysis based on the residuals generated by the optimized LGBM model. To this end, we will first define a utility function to extract observed values, predicted values, and residuals from cross-validation.

In [None]:
# utility functions for LightGBM
def _seasonal_naive_forecast(y_train, horizon, gap=0, season=7, holiday_marker=1):
    """Seasonal Naive forecast: repeat the latest observation from the same day of the season.

    Args:
        y_train (pd.Series): Observed target values of the training period, in time order.
        horizon (int): Number of consecutive days to forecast.
        gap (int, optional): Number of days between the last training day and the first forecast day. Defaults to 0.
        season (int, optional): Season length in days. Defaults to 7 (weekly seasonality).
        holiday_marker (int, optional): Value that marks a holiday in the data. Defaults to 1.

    Returns:
        np.ndarray: Forecast values, one per forecast day. A source value equal to
        holiday_marker is replaced by the observation one further season back.
    """
    # distance of every forecast day from the last training day: gap+1, gap+2, ...
    steps_ahead = np.arange(1, horizon + 1) + gap
    # number of whole seasons to step back so that the source lies inside the training data
    seasons_back = (steps_ahead + season - 1) // season
    # source position counted from the end of y_train, always within [-season, -1]
    source_pos = steps_ahead - 1 - season * seasons_back
    forecast = y_train.iloc[source_pos].to_numpy().copy()
    on_holiday = forecast == holiday_marker
    if on_holiday.any():
        forecast[on_holiday] = y_train.iloc[source_pos[on_holiday] - season].to_numpy()
    return forecast


def compare_models(df_train, grouping_vars, target, features, lgbm_kwargs=None, splits=52, test_size=7, gap=0):
    """Compute residuals of LightGBM and of a Seasonal Naive baseline across TimeSeriesSplit Cross-Validation folds.

    Takes in a training dataset of stacked time series and performs TimeSeriesSplit Cross-Validation
    for each of those time series. In every fold a LightGBM model is fitted on the training part and
    compared with the Seasonal Naive baseline (value one week earlier) on the validation part.
    Values equal to 1 are treated as holiday markers: validation days with an observed value of 1
    are dropped, and baseline values of 1 are replaced by the value one further week back.

    Args:
        df_train (pd.DataFrame): A training dataframe containing stacked time series data
        grouping_vars (list): A list of grouping variables, according to which training data is stacked. Currently accepts only a list of two variables.
        target (str): Prediction target
        features (list): List of feature names to be used for training the model
        lgbm_kwargs (dict, optional): Dictionary of LGBM hyperparameters. Defaults to None. If None, model is trained using default hyperparameters.
        splits (int, optional): Number of splits for Cross-Validation. Defaults to 52 (1 fold / week).
        test_size (int, optional): Size of validation set (i.e. forecasting horizon). Defaults to 7 days.
        gap (int, optional): Time gap between end of training and start of validation set. Defaults to 0.

    Returns:
        dict: 'group' holds one label ('value1 | value2') per time series, 'combined' the matching
        dataframe with the columns y_true, y_pred_lgbm, y_pred_baseline, residual_lgbm and
        residual_baseline, concatenated over all cross-validation folds.
    """
    # Imported locally on purpose: notebook cells that use `product` as a loop variable rebind
    # the module-level name, and this function must not depend on the cell execution history.
    from itertools import product

    # initialize container for the per-series results
    resids = {'group': [], 'combined': []}
    model_kwargs = {} if lgbm_kwargs is None else lgbm_kwargs

    # iterate over all individual series and perform cross-validation
    for group in product(df_train[grouping_vars[0]].unique(), df_train[grouping_vars[1]].unique()):

        # subselect time series
        ts = df_train[(df_train[grouping_vars[0]]==group[0]) & (df_train[grouping_vars[1]]==group[1])].copy()

        # perform cross validation
        tss = TimeSeriesSplit(n_splits=splits, test_size=test_size, gap=gap)
        # results of the individual folds, concatenated once after the loop
        fold_frames = []
        for train_i, val_i in tss.split(ts):

            train = ts.iloc[train_i]
            val = ts.iloc[val_i]

            # generate target and feature vectors
            X_train, y_train = train[features], train[target]
            X_val, y_val = val[features], val[target]

            # train LGBM model and predict the validation fold
            lgbm = LGBMRegressor(objective='regression', random_state=42, **model_kwargs)
            lgbm.fit(X_train, y_train)
            y_pred_lgbm = np.asarray(lgbm.predict(X_val))

            # Seasonal Naive baseline, sized to the validation fold so that test_size and gap
            # other than the defaults work as well
            y_pred_naive = _seasonal_naive_forecast(y_train, horizon=len(y_val), gap=gap)

            # correct for holiday effects in validation set:
            # keep only the days whose observed value is not the holiday marker
            keep = (y_val != 1).to_numpy()

            combined = pd.DataFrame({'y_true': y_val[keep]})
            combined['y_pred_lgbm'] = y_pred_lgbm[keep]
            combined['y_pred_baseline'] = y_pred_naive[keep]
            # here, we will subtract observed from predicted values, such that positive residuals correspond to over-estimation and negative residuals to underestimation
            combined['residual_lgbm'] = combined['y_pred_lgbm'] - combined['y_true']
            combined['residual_baseline'] = combined['y_pred_baseline'] - combined['y_true']

            fold_frames.append(combined)

        # append results of this series
        resids['group'].append(f'{group[0]} | {group[1]}')
        resids['combined'].append(pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame({}))

    return resids

In [None]:
# Cross-validate every branch/product series with 52 folds (test_size is left at the
# function's 7-day default), training on the weather features with `params_optimal`
# (presumably the tuned hyperparameters defined earlier in the notebook - confirm).
model_preds = compare_models(df_train, grouping_vars=['branch', 'product'], target='turnover', features=weather_features,
                              lgbm_kwargs=params_optimal, splits=52)

In [None]:
# inspect the results of the first series: observed values, both predictions and both residuals
model_preds['combined'][0]

In [None]:
# Two stacked panels for the first series: turnover curves on top, model errors below.
first_series_results = model_preds['combined'][0]
first_series_dates = first_series_results.index

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 4))
fig.suptitle(model_preds['group'][0])

# upper panel: observed turnover and the predictions of both models as lines
turnover_lines = [
    ('y_true', 'grey', 'observed'),
    ('y_pred_baseline', '#96ed89', 'predicted by Baseline'),
    ('y_pred_lgbm', '#2c1dff', 'predicted by LightGBM'),
]
for line_column, line_colour, line_label in turnover_lines:
    sns.lineplot(data=first_series_results, x=first_series_dates, y=line_column, color=line_colour,
                 label=line_label, ax=ax1)
turnover_ticks = np.arange(0, 401, 100)
ax1.set_ylabel('Turnover [€]', fontsize=12)
ax1.set_yticks(ticks=turnover_ticks)
ax1.set_yticklabels(labels=turnover_ticks, fontsize=11)
ax1.set_xlabel(None)
ax1.legend(bbox_to_anchor=(1.01, 0.4), loc='upper left', frameon=False, fontsize=10)

# lower panel: residuals (predicted minus observed) of both models as points
residual_points = [
    ('residual_baseline', '#96ed89', 'Baseline'),
    ('residual_lgbm', '#2c1dff', 'LightGBM'),
]
for point_column, point_colour, point_label in residual_points:
    sns.scatterplot(data=first_series_results, x=first_series_dates, y=point_column, color=point_colour,
                    edgecolor='black', label=point_label, ax=ax2)
error_ticks = np.arange(-200, 201, 100)
ax2.set_ylabel('Model error [€]', fontsize=12)
ax2.set_yticks(ticks=error_ticks)
ax2.set_yticklabels(labels=error_ticks, fontsize=11)
ax2.set_xlabel(None)
ax2.legend(bbox_to_anchor=(1.01, 0.4), loc='upper left', frameon=False, fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Sum the over- and underestimation errors of both models for every series.
# The rows are collected first and the frame is built once: growing an empty frame cell by
# cell via .loc is slow and does not give the error columns a numeric dtype from the start.
error_rows = []
for group_name, data_temp in zip(model_preds['group'], model_preds['combined']):
    error_row = {'group': group_name}
    for model_name in ('baseline', 'lgbm'):
        residuals = data_temp[f'residual_{model_name}']
        # non-negative residuals go into the 'pos' sum, negative ones into the 'neg' sum
        error_row[f'pos_{model_name}'] = residuals[residuals >= 0].sum()
        error_row[f'neg_{model_name}'] = residuals[residuals < 0].sum()
    error_rows.append(error_row)

summed_error = pd.DataFrame(error_rows, columns=['group', 'pos_baseline', 'neg_baseline', 'pos_lgbm', 'neg_lgbm'])

In [None]:
# Add a 'sum' row with the column totals over all individual series.
# The totals are taken from the per-series rows only, and an already existing 'sum' row is
# overwritten in place, so re-running this cell does not double-count. Without an existing
# 'sum' row the new row gets the label len(summed_error), i.e. directly after the series
# rows, instead of the hard-coded label 15.
error_cols = summed_error.columns[1:]
series_rows = summed_error[summed_error['group'] != 'sum']
existing_sum_labels = summed_error.index[summed_error['group'] == 'sum']
sum_label = existing_sum_labels[0] if len(existing_sum_labels) > 0 else len(summed_error)
summed_error.loc[sum_label, 'group'] = 'sum'
summed_error.loc[sum_label, error_cols] = [series_rows[col].sum() for col in error_cols]
summed_error

In [None]:
# Reshape to long format: one row per group and error column.
summed_error_grouped = summed_error.set_index('group').stack().reset_index()
summed_error_grouped.columns=['group', 'error_type', 'error']
# the former column names look like '<pos|neg>_<model>'; split them into their two parts
error_name_parts = summed_error_grouped['error_type'].str.split('_')
summed_error_grouped['model'] = error_name_parts.str[1]
# assign the relabelled column back explicitly: a chained `df[col].replace(..., inplace=True)`
# works on a temporary object and leaves the frame unchanged under pandas Copy-on-Write
summed_error_grouped['error_type'] = error_name_parts.str[0].replace(
    {'pos': 'overestimation', 'neg': 'underestimation'})
summed_error_grouped.tail()

In [None]:
# Bar chart of the yearly summed over-/underestimation error, one bar per model.
total_error_rows = summed_error_grouped[summed_error_grouped['group']=='sum']
model_palette = {'baseline':'#34831B', 'lgbm':'#0C2E3A'}

fig = plt.figure(figsize=(4,3))
#fig.patch.set_visible(False)
sns.barplot(data=total_error_rows, x='error_type', y='error', hue='model', palette=model_palette,
            edgecolor='black', errwidth=0)
plt.title('Financial loss\ndue to over- or underestimation', fontsize=14)
plt.xlabel(None)
plt.xticks(ticks=np.arange(0, 2), fontsize=12)
plt.ylabel('Yearly summed error [€]', fontsize=12)
plt.yticks(ticks=np.arange(-200000, 200001, 50000), fontsize=11)

# replace the raw hue values in the legend by display names (legend order: baseline, lgbm)
leg = plt.legend(bbox_to_anchor=(1.05, 0.3), loc='upper left', frameon=False, fontsize=11)
for entry_pos, display_name in enumerate(('Baseline', 'LightGBM')):
    leg.get_texts()[entry_pos].set_text(display_name)

plt.show()