In [8]:
import os
                        
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from darts import TimeSeries
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import Scaler, InvertibleMapper, StaticCovariatesTransformer
from darts.dataprocessing.transformers.missing_values_filler import MissingValuesFiller
from darts.models import LinearRegressionModel, LightGBMModel, XGBModel, CatBoostModel
from darts.models.filtering.moving_average_filter import MovingAverageFilter
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tqdm.notebook import tqdm_notebook

plt.style.use("ggplot")
plt.rcParams["font.size"] = 15
COLORS = list(sns.color_palette())

In [9]:
# helper function to print messages
def cprint(title, *args):
    """Print ``title`` framed by '=' rules, then any extra values.

    The rule printed above and below the title is exactly as long as the
    title. Each value in ``args`` is printed on its own line after the
    closing rule.
    """
    rule = "=" * len(title)
    lines = [rule, title, rule]
    lines.extend(args)
    print(*lines, sep="\n")

### Loading the datasets.

In [11]:
# Load the prepared feature table; `date` is parsed into datetimes so it can be
# compared against date strings and used as the time column later on.
data = pd.read_csv(
    "originalni_datasetovi/pre_validate.csv",
    parse_dates=["date"],
)
data.head()


Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,date,store_nbr,family,id,sales,onpromotion,transactions,...,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,wageday,day_to_nearest_holiday,day_from_nearest_holiday
0,0,0,0,2013-01-01,store_nbr_1,AUTOMOTIVE,0.0,,,0.0,...,1,0,1,0,1,0,0,0,0,0
1,1,1,1,2013-01-01,store_nbr_1,BABY CARE,1.0,,,0.0,...,1,0,1,0,1,0,0,0,0,0
2,2,2,2,2013-01-01,store_nbr_1,BEAUTY,2.0,,,0.0,...,1,0,1,0,1,0,0,0,0,0
3,3,3,3,2013-01-01,store_nbr_1,BEVERAGES,3.0,,,0.0,...,1,0,1,0,1,0,0,0,0,0
4,4,4,4,2013-01-01,store_nbr_1,BOOKS,4.0,,,0.0,...,1,0,1,0,1,0,0,0,0,0


In [12]:
def get_pipeline(static_covs_transform=False, log_transform=False):
    """Assemble the Darts preprocessing pipeline.

    The steps are chained in this order:
      1. ``MissingValuesFiller`` (always),
      2. ``StaticCovariatesTransformer`` with a one-hot encoder for the
         categorical static covariates (only if ``static_covs_transform``),
      3. ``InvertibleMapper`` applying ``np.log1p`` / ``np.expm1``
         (only if ``log_transform``),
      4. ``Scaler`` with its default settings (always).

    Returns:
        A ``Pipeline`` wrapping the selected transformers.
    """
    # missing-value filling always comes first
    steps = [MissingValuesFiller(n_jobs=-1)]

    # optional one-hot encoding of categorical static covariates
    if static_covs_transform:
        steps.append(
            StaticCovariatesTransformer(transformer_cat=OneHotEncoder(), n_jobs=-1)
        )

    # optional invertible log transformation of the values
    if log_transform:
        steps.append(
            InvertibleMapper(fn=np.log1p, inverse_fn=np.expm1, n_jobs=-1)
        )

    # rescaling is always the last step
    steps.append(Scaler())

    return Pipeline(steps)

train_end = '2017-08-15'

In [13]:
def get_target_series(static_cols, log_transform=True):
    """Build the transformed target (`sales`) series for every product family.

    For each family in the module-level ``data`` frame, the rows up to and
    including ``train_end`` are turned into one ``TimeSeries`` per store
    (grouped by ``store_nbr``), carrying ``static_cols`` as static covariates.
    Each family gets its own freshly fitted pipeline from ``get_pipeline``.

    Args:
        static_cols: names of the columns to attach as static covariates
            (``store_nbr`` is added through ``group_cols``).
        log_transform: whether the pipeline applies the log1p transformation.

    Returns:
        A tuple ``(target_dict, pipe_dict, id_dict)``, each keyed by family:
          * ``target_dict`` - list of transformed float32 target series,
          * ``pipe_dict`` - the fitted pipeline without its first two steps,
          * ``id_dict`` - list of ``{"store_nbr": ..., "family": ...}`` dicts,
            in the same order as the target series.
    """
    target_dict = {}
    pipe_dict = {}
    id_dict = {}

    for fam in tqdm_notebook(data.family.unique(), desc="Extracting target series"):
        # keep only the training rows of the current family
        df = data[(data.family.eq(fam)) & (data.date.le(train_end))]

        # target pipeline: filler, static covariates transformer, optional log, scaler
        pipe = get_pipeline(static_covs_transform=True, log_transform=log_transform)

        # extract target series together with static covariates
        target = TimeSeries.from_group_dataframe(
            df=df,
            time_col="date",
            value_cols="sales",
            group_cols="store_nbr",
            static_cols=static_cols,
        )

        # record identity of each target series; `.iloc[0]` selects the first row by
        # position instead of relying on the integer-key fallback of `Series[0]`
        id_dict[fam] = [
            {"store_nbr": t.static_covariates["store_nbr"].iloc[0], "family": fam}
            for t in target
        ]

        # apply transformations
        target = pipe.fit_transform(target)
        target_dict[fam] = [t.astype(np.float32) for t in target]

        # drop the first two steps (missing-value filler and static covariates
        # transformer); the remaining steps (optional log mapper and scaler) are
        # what is stored for later use on the forecasts
        pipe_dict[fam] = pipe[2:]

    return target_dict, pipe_dict, id_dict

In [14]:
# list of static covariates excluding 'store_nbr'; 'store_nbr' is automatically extracted using 'group_cols'
static_cols = ["city", "state", "type", "cluster"]

target_dict, pipe_dict, id_dict = get_target_series(static_cols)

Extracting target series:   0%|          | 0/33 [00:00<?, ?it/s]

### Extracting the past and future covariates.

For past covariates, we only have the `transactions` column. For future covariates, we have the `oil`, `onpromotion` columns, the holiday columns, as well as the date-related columns. To improve our models, we can extract additional covariates by computing the moving averages of our time series data using `MovingAverageFilter`. Doing so helps to smooth out the noise and capture the underlying patterns more effectively.

\* *We follow the reference notebook to use moving averages of `oil` and `onpromotion` with window sizes 7, 28. The code below supports the computation of moving averages for only the past and future covariates. To include the moving averages of the target series `sales`, some edits are needed.*

In [18]:
def _build_covariates(df, value_cols, ma_cols, window_sizes, pipe):
    """Extract one covariate group per store, transform it and add moving averages."""
    covs = TimeSeries.from_group_dataframe(
        df=df,
        time_col="date",
        value_cols=value_cols,
        group_cols="store_nbr",
    )
    # covariates do not carry static covariates
    covs = [c.with_static_covariates(None) for c in covs]
    covs = pipe.fit_transform(covs)

    # an empty or missing column list means "no moving averages"
    if ma_cols:
        for size in window_sizes:
            ma_filter = MovingAverageFilter(window=size)
            # rename the filter's output columns to the shorter `<col>_ma<size>` form
            old_names = [f"rolling_mean_{size}_{col}" for col in ma_cols]
            new_names = [f"{col}_ma{size}" for col in ma_cols]
            ma_covs = [
                ma_filter.filter(c[ma_cols]).with_columns_renamed(old_names, new_names)
                for c in covs
            ]
            # append the smoothed columns as extra components
            covs = [c.stack(c_ma) for c, c_ma in zip(covs, ma_covs)]

    return [c.astype(np.float32) for c in covs]


def get_covariates(
    past_cols,
    future_cols,
    past_ma_cols=None,
    future_ma_cols=None,
    past_window_sizes=(7, 28),
    future_window_sizes=(7, 28),
):
    """Build the past and future covariate series for every product family.

    Past covariates are taken from the rows of the module-level ``data`` frame
    up to and including ``train_end``; future covariates use all rows. Both
    groups go through ``get_pipeline()`` with its default arguments and are
    returned as float32 series, one per store.

    Args:
        past_cols: columns used as past covariates.
        future_cols: columns used as future covariates.
        past_ma_cols: past covariate columns to smooth with moving averages;
            ``None`` or an empty list adds none.
        future_ma_cols: same as ``past_ma_cols`` for the future covariates.
        past_window_sizes: moving-average window sizes for the past covariates.
        future_window_sizes: moving-average window sizes for the future covariates.

    Returns:
        A tuple ``(past_dict, future_dict)`` of dictionaries keyed by family,
        each holding a list of covariate series.
    """
    past_dict = {}
    future_dict = {}

    # one pipeline instance is shared and refitted by every `fit_transform` call
    covs_pipe = get_pipeline()

    for fam in tqdm_notebook(data.family.unique(), desc="Extracting covariates"):
        # filter data for each model
        df = data[data.family.eq(fam)]

        # past covariates only exist up to the end of the training period
        past_dict[fam] = _build_covariates(
            df[df.date.le(train_end)],
            past_cols,
            past_ma_cols,
            past_window_sizes,
            covs_pipe,
        )

        # future covariates use every available row
        future_dict[fam] = _build_covariates(
            df,
            future_cols,
            future_ma_cols,
            future_window_sizes,
            covs_pipe,
        )

    return past_dict, future_dict

selected_holidays = [
    "nat_terremoto", "nat_navidad", "nat_dia la madre", "nat_dia trabajo",
    "nat_primer dia ano", "nat_futbol", "nat_dia difuntos",
]

In [20]:
# Scores noted from earlier runs (translated from the original notes):
# 0.38067 with everything
# 0.37984 without anything
# NOTE(review): presumably "everything"/"anything" refers to the extra covariates
# added below - confirm with the author.

# past covariates
past_cols = ["transactions"]

# future covariates
future_cols = [
    "oil", "onpromotion",
    "day", "month", "year", "day_of_week", "day_of_year", "week_of_year", "date_index",
    "work_day", *selected_holidays,
]

# extra holiday columns
holidays_to_add = ['N Batalla de Pichincha', 'N Carnaval', 'N Cyber Monday', 'N Independencia de Cuenca', 'N Independencia de Guayaquil', 'N Viernes Santo']
future_cols.extend(holidays_to_add)

# extra calendar columns; 'is_quarter_end' used to be listed twice, which requested
# the same covariate column two times
time_based_to_add = ['day_of_month', 'is_wknd',
       'is_year_end', 'wageday', 'day_to_nearest_holiday', 'day_from_nearest_holiday', 'is_quarter_start', 'week_of_month',
       'is_year_start', 'is_quarter_end', 'quarter', 'season', 'is_month_start', 'is_month_end']
future_cols.extend(time_based_to_add)

# "izbaceni" means "excluded"; the list is not referenced by the code visible here
# and is kept unchanged in case a later cell uses it
izbaceni = ['is_year_start', 'is_quarter_end', 'quarter', 'season', 'is_quarter_end', 'is_month_start', 'is_month_end']

# every future covariate must be requested exactly once
duplicated_cols = sorted({col for col in future_cols if future_cols.count(col) > 1})
assert not duplicated_cols, f"Duplicate future covariates: {duplicated_cols}"

# additional past and future covariates from computing the moving averages
past_ma_cols = None
future_ma_cols = ["oil", "onpromotion"]

past_dict, future_dict = get_covariates(past_cols, future_cols, past_ma_cols, future_ma_cols)

### Setting up the model trainer.

We are now done extracting the time series data for forecasting with Darts. The complete set of extracted target series and covariates is passed to the trainer through the configuration below.

In [None]:
TRAINER_CONFIG = dict(
    # series and helper objects produced by the extraction cells above
    target_dict=target_dict,
    pipe_dict=pipe_dict,
    id_dict=id_dict,
    past_dict=past_dict,
    future_dict=future_dict,

    # rolling-origin cross-validation:
    # `forecast_horizon` is the length of each validation window and `folds` the
    # number of training sets (1 is a plain train/validation split)
    forecast_horizon=16,
    folds=1,

    # look-back window (in days) for the zero-sales check; if all values in the
    # window are zero, zero forecasts are generated
    zero_fc_window=21,

    # covariate selection: a list of names, None to use none, or 'keep_all'.
    # For static covariates choose from ['city', 'state', 'cluster', 'type',
    # 'store_nbr']; every one-hot encoded column of a chosen name is kept.
    static_covs="keep_all",
    past_covs="keep_all",
    future_covs="keep_all",
)

In [16]:
from sklearn.metrics import mean_squared_log_error
class Trainer:
    """Fit, validate and forecast with Darts regression models, one fit per family.

    All ``*_dict`` arguments are dictionaries keyed by product family. For each
    family they hold the target series (one per store), the pipeline used to
    invert the target transformations, the identity of every series, and the
    past / future covariate series.
    """

    def __init__(
        self,
        target_dict,
        pipe_dict,
        id_dict,
        past_dict,
        future_dict,
        forecast_horizon,
        folds,
        zero_fc_window,
        static_covs=None,
        past_covs=None,
        future_covs=None,
    ):
        """Store the data and settings, then filter the covariates via ``setup``.

        Args:
            target_dict: family -> list of target series.
            pipe_dict: family -> pipeline used for ``inverse_transform``.
            id_dict: family -> list of dicts identifying each target series.
            past_dict: family -> list of past covariate series.
            future_dict: family -> list of future covariate series.
            forecast_horizon: number of steps to forecast / validation length.
            folds: number of rolling-origin validation folds.
            zero_fc_window: number of trailing training values checked for zeros.
            static_covs: list of static covariate name prefixes to keep,
                ``None`` to drop all, or ``"keep_all"``.
            past_covs: list of past covariate components to keep, ``None`` to
                use none, or ``"keep_all"``.
            future_covs: same as ``past_covs`` for the future covariates.
        """
        # shallow copies: `setup` replaces entries without touching the caller's dicts
        self.target_dict = target_dict.copy()
        self.pipe_dict = pipe_dict.copy()
        self.id_dict = id_dict.copy()
        self.past_dict = past_dict.copy()
        self.future_dict = future_dict.copy()
        self.forecast_horizon = forecast_horizon
        self.folds = folds
        self.zero_fc_window = zero_fc_window
        self.static_covs = static_covs
        self.past_covs = past_covs
        self.future_covs = future_covs

        # set up time series data
        self.setup()

    def setup(self):
        """Restrict static, past and future covariates to the requested subsets."""
        for fam in tqdm_notebook(self.target_dict.keys(), desc="Setting up"):
            # keep the specified static covariates
            if self.static_covs != "keep_all":
                # fetched before branching: both branches below need the series
                target = self.target_dict[fam]
                if self.static_covs is not None:
                    # one-hot encoded columns share the original name as prefix
                    prefixes = tuple(self.static_covs)
                    keep_static = [
                        col for col in target[0].static_covariates.columns
                        if col.startswith(prefixes)
                    ]
                    static_covs_df = [t.static_covariates[keep_static] for t in target]
                    self.target_dict[fam] = [
                        t.with_static_covariates(d) for t, d in zip(target, static_covs_df)
                    ]
                else:
                    self.target_dict[fam] = [t.with_static_covariates(None) for t in target]

            # keep the specified past covariates
            if self.past_covs != "keep_all":
                if self.past_covs is not None:
                    self.past_dict[fam] = [p[self.past_covs] for p in self.past_dict[fam]]
                else:
                    self.past_dict[fam] = None

            # keep the specified future covariates
            if self.future_covs != "keep_all":
                if self.future_covs is not None:
                    self.future_dict[fam] = [f[self.future_covs] for f in self.future_dict[fam]]
                else:
                    self.future_dict[fam] = None

    def clip(self, array):
        """Return ``array`` with negative values replaced by 0."""
        return np.clip(array, a_min=0., a_max=None)

    def train_valid_split(self, target, length):
        """Split every series into a training part and a validation window.

        The last ``length`` steps are removed from the training part. The
        validation window starts where training ends and spans
        ``forecast_horizon`` steps (or runs to the end of the series when the
        window reaches it).

        Returns:
            A tuple ``(train, valid)`` of lists aligned with ``target``.
        """
        train = [t[:-length] for t in target]
        valid_end_idx = -length + self.forecast_horizon
        # a non-negative end index means the window reaches the end of the series
        if valid_end_idx >= 0:
            valid_end_idx = None
        valid = [t[-length:valid_end_idx] for t in target]

        return train, valid

    def get_models(self, model_names, model_configs):
        """Instantiate one model per ``(name, config)`` pair.

        Args:
            model_names: list of keys from ``"lr"``, ``"lgbm"``, ``"cat"``, ``"xgb"``.
            model_configs: list of keyword-argument dicts, one per model name.
                The list and its dicts are left unmodified.

        Returns:
            A list of freshly constructed models, in the order of ``model_names``.

        Raises:
            AssertionError: if the arguments are not lists, a name is unknown,
                or the two lists differ in length.
        """
        models = {
            "lr": LinearRegressionModel,
            "lgbm": LightGBMModel,
            "cat": CatBoostModel,
            "xgb": XGBModel,
        }
        assert isinstance(model_names, list) and isinstance(model_configs, list),\
        "Both the model names and model configurations must be specified in lists."
        assert all(name in models for name in model_names),\
        f"Model names '{model_names}' not recognized."
        assert len(model_names) == len(model_configs),\
        "The number of model names and the number of model configurations do not match."

        # XGBoost defaults to the histogram-based method for faster training; an
        # explicit "tree_method" in the config still wins. New dicts are built so
        # the caller's list is not altered.
        configs = [
            {"tree_method": "hist", **config} if name == "xgb" else config
            for name, config in zip(model_names, model_configs)
        ]

        return [models[name](**config) for name, config in zip(model_names, configs)]

    def generate_forecasts(self, models, train, pipe, past_covs, future_covs, drop_before):
        """Fit every model on ``train`` and forecast ``forecast_horizon`` steps.

        Args:
            models: list of unfitted models.
            train: list of (transformed) training series.
            pipe: pipeline whose ``inverse_transform`` is applied to the forecasts.
            past_covs: past covariates passed to ``fit`` / ``predict`` (or ``None``).
            future_covs: future covariates passed to ``fit`` / ``predict`` (or ``None``).
            drop_before: if not ``None``, training data before this date is dropped.

        Returns:
            A tuple ``(pred_list, ens_pred)``: the per-model lists of forecasts
            and the element-wise average of the forecasts over all models.
        """
        if drop_before is not None:
            # `drop_before` is applied to the previous day so that the given date is kept
            date = pd.Timestamp(drop_before) - pd.Timedelta(days=1)
            train = [t.drop_before(date) for t in train]
        inputs = {
            "series": train,
            "past_covariates": past_covs,
            "future_covariates": future_covs,
        }

        # all-zero forecast covering the steps right after the first training series
        zero_pred = pd.DataFrame({
            "date": pd.date_range(train[0].end_time(), periods=self.forecast_horizon+1)[1:],
            "sales": np.zeros(self.forecast_horizon),
        })
        zero_pred = TimeSeries.from_dataframe(
            df=zero_pred,
            time_col="date",
            value_cols="sales",
        )

        # series whose most recent training values sum to 0 get zero forecasts;
        # this does not depend on the model, so it is computed once
        use_zero_forecast = [
            t[-self.zero_fc_window:].values().sum() == 0 for t in train
        ]

        pred_list = []
        ens_pred = [0 for _ in range(len(train))]

        for m in models:
            # fit training data to model
            m.fit(**inputs)

            # generate forecasts and apply inverse transformations
            pred = m.predict(n=self.forecast_horizon, **inputs, show_warnings=False)
            pred = pipe.inverse_transform(pred)

            # set zero forecasts for target series where the recent observations are 0s
            for j, is_zero in enumerate(use_zero_forecast):
                if is_zero:
                    pred[j] = zero_pred

            # clip negative forecasts to 0s
            pred = [p.map(self.clip) for p in pred]
            pred_list.append(pred)

            # ensemble averaging
            for j in range(len(ens_pred)):
                ens_pred[j] += pred[j] / len(models)

        return pred_list, ens_pred

    def metric(self, valid, pred):
        """Return the mean RMSLE over all pairs of validation / forecast series.

        The RMSLE of each pair is ``sqrt(mean((log1p(pred) - log1p(valid))**2))``.
        It is computed directly with numpy, one series at a time, so the result
        does not depend on the component names of the series or on the
        ``squared`` keyword of scikit-learn's ``mean_squared_log_error``.

        Raises:
            ValueError: if any value is negative (RMSLE is undefined there).
        """
        rmsle_values = []
        for valid_ts, pred_ts in zip(valid, pred):
            actual = np.asarray(valid_ts.values(), dtype=np.float64)
            forecast = np.asarray(pred_ts.values(), dtype=np.float64)
            if (actual < 0).any() or (forecast < 0).any():
                raise ValueError(
                    "RMSLE cannot be computed when series contain negative values."
                )
            squared_log_error = (np.log1p(forecast) - np.log1p(actual)) ** 2
            rmsle_values.append(np.sqrt(np.mean(squared_log_error)))

        # average the RMSLE values of all series of that family
        return np.mean(rmsle_values)

    def validate(self, model_names, model_configs, drop_before=None):
        """Run rolling-origin validation and print the RMSLE per family and overall.

        For every family and fold, the models are rebuilt, fitted on the
        training part and scored on the inverse-transformed validation window.
        Fold scores are averaged. When more than one model is given, the
        averaged (ensemble) forecast is scored as well.
        """
        # helper value to align printed text below
        longest_len = len(max(self.target_dict.keys(), key=len))

        # an ensemble score is only reported when several models are combined
        has_ensemble = len(model_names) > 1

        # store metric values for each model
        model_metrics_history = []
        ens_metric_history = []

        for fam in tqdm_notebook(self.target_dict, desc="Performing validation"):
            target = self.target_dict[fam]
            pipe = self.pipe_dict[fam]
            past_covs = self.past_dict[fam]
            future_covs = self.future_dict[fam]

            # record average metric value over all folds
            model_metrics = []
            ens_metric = 0

            for j in range(self.folds):
                # perform train-validation split and apply transformations
                length = (self.folds - j) * self.forecast_horizon
                train, valid = self.train_valid_split(target, length)
                valid = pipe.inverse_transform(valid)

                # generate forecasts and compute metric
                models = self.get_models(model_names, model_configs)
                pred_list, ens_pred = self.generate_forecasts(
                    models, train, pipe, past_covs, future_covs, drop_before
                )
                # dividing by the number of folds turns the sum below into a mean
                metric_list = [self.metric(valid, pred) / self.folds for pred in pred_list]
                model_metrics.append(metric_list)
                if has_ensemble:
                    ens_metric += self.metric(valid, ens_pred) / self.folds

            # store final metric value for each model
            model_metrics = np.sum(model_metrics, axis=0)
            model_metrics_history.append(model_metrics)
            ens_metric_history.append(ens_metric)

            # print metric value for each family
            print(
                fam,
                " " * (longest_len - len(fam)),
                " | ",
                " - ".join([f"{model}: {metric:.5f}" for model, metric in zip(model_names, model_metrics)]),
                f" - ens: {ens_metric:.5f}" if has_ensemble else "",
                sep="",
            )

        # print overall metric value
        cprint(
            "Average RMSLE | "
            + " - ".join([f"{model}: {metric:.5f}"
                          for model, metric in zip(model_names, np.mean(model_metrics_history, axis=0))])
            + (f" - ens: {np.mean(ens_metric_history):.5f}" if has_ensemble else ""),
        )

    def ensemble_predict(self, model_names, model_configs, drop_before=None):
        """Fit on the full target series and return the ensemble forecasts.

        Returns:
            A DataFrame with a ``date`` column, the forecast values and the
            identity columns from ``id_dict`` for every series of every family.
        """
        forecasts = []
        for fam in tqdm_notebook(self.target_dict.keys(), desc="Generating forecasts"):
            target = self.target_dict[fam]
            pipe = self.pipe_dict[fam]
            target_id = self.id_dict[fam]
            past_covs = self.past_dict[fam]
            future_covs = self.future_dict[fam]

            # generate forecasts
            models = self.get_models(model_names, model_configs)
            pred_list, ens_pred = self.generate_forecasts(models, target, pipe, past_covs, future_covs, drop_before)
            # attach the identity of each series to its forecast
            ens_pred = [p.pd_dataframe().assign(**i) for p, i in zip(ens_pred, target_id)]
            ens_pred = pd.concat(ens_pred, axis=0)
            forecasts.append(ens_pred)

        # combine all forecasts into one dataframe
        forecasts = pd.concat(forecasts, axis=0)
        forecasts = forecasts.rename_axis(None, axis=1).reset_index(names="date")

        return forecasts

In [None]:
# initialize model trainer
trainer = Trainer(**TRAINER_CONFIG)

Setting up:   0%|          | 0/33 [00:00<?, ?it/s]

In [135]:
# Optimization: LightGBM configuration with 7 target lags.
# NOTE(review): in LightGBM, `alpha` appears to be the Huber/quantile loss parameter
# rather than the L1 regularization term (`reg_alpha` / `lambda_l1`), and `subsample`
# may only take effect together with a bagging frequency - confirm the intent.
BASE_CONFIG = dict(
    random_state=0,

    # the number of lag values of the target series
    lags=7,

    # lags of the past covariates, -16 down to -22 (None when they are disabled)
    lags_past_covariates=(
        None if TRAINER_CONFIG["past_covs"] is None else [-lag for lag in range(16, 23)]
    ),

    # the number of (past, future-1) lag values of the future covariates
    # (None when they are disabled)
    lags_future_covariates=(
        None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
    ),

    # the number of days ahead that the model is forecasting given today's input data
    output_chunk_length=1,

    n_estimators=300,  # original note: num_iterations 100
    learning_rate=0.05,  # original note: 0.1
    max_depth=-1,  # original note: -1
    subsample=0.7,
    alpha=0.9,  # original note: l1 0.9
    reg_lambda=0,  # original note: l2 0
    verbose=-1,
)


In [136]:
# validate a single LightGBM model (pass 'lr' instead of 'lgbm' to use linear regression)
trainer.validate(["lgbm"], [BASE_CONFIG], drop_before="2015-01-01")

Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lgbm: 0.49525
BABY CARE                  | lgbm: 0.20741
BEAUTY                     | lgbm: 0.48484
BEVERAGES                  | lgbm: 0.23130
BOOKS                      | lgbm: 0.05323
BREAD/BAKERY               | lgbm: 0.17469
CELEBRATION                | lgbm: 0.52287
CLEANING                   | lgbm: 0.43663
DAIRY                      | lgbm: 0.15432
DELI                       | lgbm: 0.16311
EGGS                       | lgbm: 0.26682
FROZEN FOODS               | lgbm: 0.26591
GROCERY I                  | lgbm: 0.17992
GROCERY II                 | lgbm: 0.53606
HARDWARE                   | lgbm: 0.52479
HOME AND KITCHEN I         | lgbm: 0.49083
HOME AND KITCHEN II        | lgbm: 0.44825
HOME APPLIANCES            | lgbm: 0.31256
HOME CARE                  | lgbm: 0.24478
LADIESWEAR                 | lgbm: 0.42616
LAWN AND GARDEN            | lgbm: 0.37309
LINGERIE                   | lgbm: 0.62348
LIQUOR,WINE,BEER           | lgbm: 0.47338
MAGAZINES  

In [159]:
# Optimization: LightGBM configuration with 63 target lags.
BASE_CONFIG = dict(
    random_state=0,

    # the number of lag values of the target series
    lags=63,

    # lags of the past covariates, -16 down to -22 (None when they are disabled)
    lags_past_covariates=(
        None if TRAINER_CONFIG["past_covs"] is None else [-lag for lag in range(16, 23)]
    ),

    # the number of (past, future-1) lag values of the future covariates
    # (None when they are disabled)
    lags_future_covariates=(
        None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
    ),

    # the number of days ahead that the model is forecasting given today's input data
    output_chunk_length=1,

    n_estimators=300,  # original note: num_iterations 100
    learning_rate=0.05,  # original note: 0.1
    # max_depth is left unset (original note: -1 useful when data is small)
    subsample=1,  # original note: 1
    lambda_l1=0,  # original note: l1 0
    lambda_l2=0,  # original note: l2 0
    verbose=-1,
)

# validate a single LightGBM model (pass 'lr' instead of 'lgbm' to use linear regression)
trainer.validate(
    model_names=["lgbm"],
    model_configs=[BASE_CONFIG],
    drop_before="2015-01-01",
)

Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lgbm: 0.49293
BABY CARE                  | lgbm: 0.21119
BEAUTY                     | lgbm: 0.44877
BEVERAGES                  | lgbm: 0.20345
BOOKS                      | lgbm: 0.06102
BREAD/BAKERY               | lgbm: 0.16036
CELEBRATION                | lgbm: 0.52644
CLEANING                   | lgbm: 0.28581
DAIRY                      | lgbm: 0.13659
DELI                       | lgbm: 0.16829
EGGS                       | lgbm: 0.25004
FROZEN FOODS               | lgbm: 0.25336
GROCERY I                  | lgbm: 0.14157
GROCERY II                 | lgbm: 0.56561
HARDWARE                   | lgbm: 0.52025
HOME AND KITCHEN I         | lgbm: 0.49171
HOME AND KITCHEN II        | lgbm: 0.42672
HOME APPLIANCES            | lgbm: 0.29148
HOME CARE                  | lgbm: 0.21383
LADIESWEAR                 | lgbm: 0.40863
LAWN AND GARDEN            | lgbm: 0.36433
LINGERIE                   | lgbm: 0.62873
LIQUOR,WINE,BEER           | lgbm: 0.39833
MAGAZINES  

In [175]:
# Optimization: LightGBM configuration with 365 target lags.
BASE_CONFIG = dict(
    random_state=0,

    # the number of lag values of the target series
    lags=365,

    # lags of the past covariates, -16 down to -22 (None when they are disabled)
    lags_past_covariates=(
        None if TRAINER_CONFIG["past_covs"] is None else [-lag for lag in range(16, 23)]
    ),

    # the number of (past, future-1) lag values of the future covariates
    # (None when they are disabled)
    lags_future_covariates=(
        None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
    ),

    # the number of days ahead that the model is forecasting given today's input data
    output_chunk_length=1,

    n_estimators=200,  # original note: num_iterations 100
    learning_rate=0.05,  # original note: 0.1
    # max_depth is left unset (original note: -1 useful when data is small)
    subsample=1,  # original note: 1
    lambda_l1=0,  # original note: l1 0
    lambda_l2=0,  # original note: l2 0
    verbose=-1,
)

# validate a single LightGBM model (pass 'lr' instead of 'lgbm' to use linear regression)
trainer.validate(
    model_names=["lgbm"],
    model_configs=[BASE_CONFIG],
    drop_before="2015-01-01",
)

Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lgbm: 0.49193
BABY CARE                  | lgbm: 0.20948
BEAUTY                     | lgbm: 0.46268
BEVERAGES                  | lgbm: 0.20868
BOOKS                      | lgbm: 0.05638
BREAD/BAKERY               | lgbm: 0.16426
CELEBRATION                | lgbm: 0.51965
CLEANING                   | lgbm: 0.31911
DAIRY                      | lgbm: 0.12816
DELI                       | lgbm: 0.16199
EGGS                       | lgbm: 0.25156
FROZEN FOODS               | lgbm: 0.25086
GROCERY I                  | lgbm: 0.14100
GROCERY II                 | lgbm: 0.53296
HARDWARE                   | lgbm: 0.51816
HOME AND KITCHEN I         | lgbm: 0.47246
HOME AND KITCHEN II        | lgbm: 0.42236
HOME APPLIANCES            | lgbm: 0.28811
HOME CARE                  | lgbm: 0.19006
LADIESWEAR                 | lgbm: 0.41226
LAWN AND GARDEN            | lgbm: 0.39436
LINGERIE                   | lgbm: 0.62238
LIQUOR,WINE,BEER           | lgbm: 0.41207
MAGAZINES  

In [155]:
# Optimization: LightGBM configuration with 14 target lags.
BASE_CONFIG = dict(
    random_state=0,

    # the number of lag values of the target series
    lags=14,

    # lags of the past covariates, -16 down to -22 (None when they are disabled)
    lags_past_covariates=(
        None if TRAINER_CONFIG["past_covs"] is None else [-lag for lag in range(16, 23)]
    ),

    # the number of (past, future-1) lag values of the future covariates
    # (None when they are disabled)
    lags_future_covariates=(
        None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
    ),

    # the number of days ahead that the model is forecasting given today's input data
    output_chunk_length=1,

    n_estimators=150,  # original note: num_iterations 100
    learning_rate=0.05,  # original note: 0.1
    # max_depth is left unset (original note: -1 useful when data is small)
    subsample=1,  # original note: 1
    lambda_l1=0,  # original note: l1 0
    lambda_l2=0,  # original note: l2 0
    verbose=-1,
)

# validate a single LightGBM model (pass 'lr' instead of 'lgbm' to use linear regression)
trainer.validate(
    model_names=["lgbm"],
    model_configs=[BASE_CONFIG],
    drop_before="2015-01-01",
)

Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lgbm: 0.49571
BABY CARE                  | lgbm: 0.20934
BEAUTY                     | lgbm: 0.46487
BEVERAGES                  | lgbm: 0.21054
BOOKS                      | lgbm: 0.06738
BREAD/BAKERY               | lgbm: 0.16465
CELEBRATION                | lgbm: 0.52257
CLEANING                   | lgbm: 0.35189
DAIRY                      | lgbm: 0.15258
DELI                       | lgbm: 0.16671
EGGS                       | lgbm: 0.26439
FROZEN FOODS               | lgbm: 0.26325
GROCERY I                  | lgbm: 0.16817
GROCERY II                 | lgbm: 0.57830
HARDWARE                   | lgbm: 0.52989
HOME AND KITCHEN I         | lgbm: 0.48423
HOME AND KITCHEN II        | lgbm: 0.43698
HOME APPLIANCES            | lgbm: 0.27383
HOME CARE                  | lgbm: 0.23687
LADIESWEAR                 | lgbm: 0.41509
LAWN AND GARDEN            | lgbm: 0.36476
LINGERIE                   | lgbm: 0.63196
LIQUOR,WINE,BEER           | lgbm: 0.44284
MAGAZINES  

In [158]:
# Optimization: LightGBM configuration with 120 target lags.
BASE_CONFIG = dict(
    random_state=0,

    # the number of lag values of the target series
    lags=120,

    # lags of the past covariates, -16 down to -22 (None when they are disabled)
    lags_past_covariates=(
        None if TRAINER_CONFIG["past_covs"] is None else [-lag for lag in range(16, 23)]
    ),

    # the number of (past, future-1) lag values of the future covariates
    # (None when they are disabled)
    lags_future_covariates=(
        None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
    ),

    # the number of days ahead that the model is forecasting given today's input data
    output_chunk_length=1,

    n_estimators=300,  # original note: num_iterations 100
    learning_rate=0.05,  # original note: 0.1
    # max_depth is left unset (original note: -1 useful when data is small)
    subsample=1,  # original note: 1
    lambda_l1=0,  # original note: l1 0
    lambda_l2=0,  # original note: l2 0
    verbose=-1,
)

# validate a single LightGBM model (pass 'lr' instead of 'lgbm' to use linear regression)
trainer.validate(
    model_names=["lgbm"],
    model_configs=[BASE_CONFIG],
    drop_before="2015-01-01",
)

Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lgbm: 0.49028
BABY CARE                  | lgbm: 0.20723
BEAUTY                     | lgbm: 0.44933
BEVERAGES                  | lgbm: 0.21386
BOOKS                      | lgbm: 0.06088
BREAD/BAKERY               | lgbm: 0.15873
CELEBRATION                | lgbm: 0.52122
CLEANING                   | lgbm: 0.28049
DAIRY                      | lgbm: 0.13595
DELI                       | lgbm: 0.16695
EGGS                       | lgbm: 0.25060
FROZEN FOODS               | lgbm: 0.25231
GROCERY I                  | lgbm: 0.14292
GROCERY II                 | lgbm: 0.54803
HARDWARE                   | lgbm: 0.51747
HOME AND KITCHEN I         | lgbm: 0.48980
HOME AND KITCHEN II        | lgbm: 0.43297
HOME APPLIANCES            | lgbm: 0.28921
HOME CARE                  | lgbm: 0.21222
LADIESWEAR                 | lgbm: 0.41669
LAWN AND GARDEN            | lgbm: 0.38105
LINGERIE                   | lgbm: 0.62556
LIQUOR,WINE,BEER           | lgbm: 0.39990
MAGAZINES  

In [44]:
# Optimization: LightGBM configuration with 730 target lags.
BASE_CONFIG = dict(
    random_state=0,

    # the number of lag values of the target series
    lags=730,

    # lags of the past covariates, -16 down to -22 (None when they are disabled)
    lags_past_covariates=(
        None if TRAINER_CONFIG["past_covs"] is None else [-lag for lag in range(16, 23)]
    ),

    # the number of (past, future-1) lag values of the future covariates
    # (None when they are disabled)
    lags_future_covariates=(
        None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
    ),

    # the number of days ahead that the model is forecasting given today's input data
    output_chunk_length=1,

    n_estimators=150,  # original note: num_iterations 100
    learning_rate=0.05,  # original note: 0.1
    # max_depth is left unset (original note: -1 useful when data is small)
    subsample=1,  # original note: 1
    lambda_l1=0,  # original note: l1 0
    lambda_l2=0,  # original note: l2 0
    verbose=-1,
)

# validate a single LightGBM model (pass 'lr' instead of 'lgbm' to use linear regression)
trainer.validate(
    model_names=["lgbm"],
    model_configs=[BASE_CONFIG],
    drop_before="2015-01-01",
)

Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lgbm: 0.50368
BABY CARE                  | lgbm: 0.20253
BEAUTY                     | lgbm: 0.46733
BEVERAGES                  | lgbm: 0.21781
BOOKS                      | lgbm: 0.04729
BREAD/BAKERY               | lgbm: 0.17144
CELEBRATION                | lgbm: 0.52002
CLEANING                   | lgbm: 0.31361
DAIRY                      | lgbm: 0.14556
DELI                       | lgbm: 0.17737
EGGS                       | lgbm: 0.26923
FROZEN FOODS               | lgbm: 0.26359
GROCERY I                  | lgbm: 0.15560
GROCERY II                 | lgbm: 0.55569
HARDWARE                   | lgbm: 0.51269
HOME AND KITCHEN I         | lgbm: 0.48498
HOME AND KITCHEN II        | lgbm: 0.42932
HOME APPLIANCES            | lgbm: 0.32267
HOME CARE                  | lgbm: 0.19316
LADIESWEAR                 | lgbm: 0.42013
LAWN AND GARDEN            | lgbm: 0.40750
LINGERIE                   | lgbm: 0.63060
LIQUOR,WINE,BEER           | lgbm: 0.40926
MAGAZINES  

### LightGBM.

We now use gradient-boosting decision tree (GBDT) models for actual forecasting. Some state-of-the-art options include LightGBM, XGBoost and CatBoost. We only focus on LightGBM because it appears to have faster training time. Additional hyperparameters can be specified but for simplicity, we use the default values. To stabilize performance, we perform ensemble averaging by training multiple models and aggregating the forecasts through averaging. This helps to average out the errors.

\* *We adopt the approach taken in the reference notebook to use different numbers of lags of the target series for each model.*

In [28]:
def _gbdt_config(lags, n_estimators):
    """Return the config dict of one LightGBM ensemble member.

    The four ensemble members share every setting except the number of
    target-series lags (`lags`) and the number of boosting rounds
    (`n_estimators`). Covariate lags are set to None when the matching
    entry of TRAINER_CONFIG is None.
    """
    return {
        "random_state": 0,

        # the number of lag values of the target series
        "lags": lags,

        # lags of the past covariates: -16, -17, ..., -22
        "lags_past_covariates": (
            None if TRAINER_CONFIG["past_covs"] is None else list(range(-16, -23, -1))
        ),

        # (past, future) lag specification of the future covariates
        "lags_future_covariates": (
            None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
        ),

        # the number of days ahead that the model is forecasting given today's input data
        "output_chunk_length": 1,

        # booster settings; the trailing notes are carried over from the
        # original cell (they appear to record the library defaults)
        "n_estimators": n_estimators,  # num_iterations 100
        "learning_rate": 0.05,  # 0.1
        # "max_depth": -1 is intentionally left out (noted as useful when data is small)
        "subsample": 1,  # 1
        "lambda_l1": 0,  # l1 0
        "lambda_l2": 0,  # l2 0
        "verbose": -1,
    }


GBDT_CONFIG1 = _gbdt_config(lags=120, n_estimators=300)
GBDT_CONFIG2 = _gbdt_config(lags=14, n_estimators=150)
GBDT_CONFIG3 = _gbdt_config(lags=365, n_estimators=200)
GBDT_CONFIG4 = _gbdt_config(lags=730, n_estimators=100)

# one model name per config: 'lgbm' for LightGBM, 'xgb' for XGBoost, 'cat' for CatBoost
ENS_CONFIGS = [GBDT_CONFIG1, GBDT_CONFIG2, GBDT_CONFIG3, GBDT_CONFIG4]
ENS_MODELS = ["lgbm"] * len(ENS_CONFIGS)

In [29]:
# Validation of models trained on the entire data is disabled in this cell.
# To re-enable it, store the result of
#     trainer.validate(model_names=ENS_MODELS, model_configs=ENS_CONFIGS)
# in `predictions1`.

# Validate the ensemble trained on a subset of the data (drop_before="2015-01-01").
predictions2 = trainer.validate(
    drop_before="2015-01-01",
    model_configs=ENS_CONFIGS,
    model_names=ENS_MODELS,
)

Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lgbm: 0.49028 - lgbm: 0.49571 - lgbm: 0.49193 - lgbm: 0.50219 - ens: 0.48910
BABY CARE                  | lgbm: 0.20723 - lgbm: 0.20934 - lgbm: 0.20948 - lgbm: 0.20212 - ens: 0.20572
BEAUTY                     | lgbm: 0.44933 - lgbm: 0.46487 - lgbm: 0.46268 - lgbm: 0.47157 - ens: 0.45610
BEVERAGES                  | lgbm: 0.21386 - lgbm: 0.21054 - lgbm: 0.20868 - lgbm: 0.22421 - ens: 0.20709
BOOKS                      | lgbm: 0.06088 - lgbm: 0.06738 - lgbm: 0.05638 - lgbm: 0.04706 - ens: 0.05787
BREAD/BAKERY               | lgbm: 0.15873 - lgbm: 0.16465 - lgbm: 0.16426 - lgbm: 0.17364 - ens: 0.15984
CELEBRATION                | lgbm: 0.52122 - lgbm: 0.52257 - lgbm: 0.51965 - lgbm: 0.51700 - ens: 0.51408
CLEANING                   | lgbm: 0.28049 - lgbm: 0.35189 - lgbm: 0.31911 - lgbm: 0.30148 - ens: 0.30655
DAIRY                      | lgbm: 0.13595 - lgbm: 0.15258 - lgbm: 0.12816 - lgbm: 0.14865 - ens: 0.13571
DELI                       | lgbm: 0.16695 - l

In [166]:
# future covariates
# NOTE: `selected_holidays` is not defined in this cell; it must already
# exist in the kernel when the cell runs.
future_cols = [
    "oil", "onpromotion",
    "day", "month", "year", "day_of_week", "day_of_year", "week_of_year", "date_index",
    "work_day", *selected_holidays,
]

# extra holiday columns; currently unused because the loop that would append
# them to `future_cols` is commented out
holidays_to_add = ['N Batalla de Pichincha', 'N Carnaval', 'N Cyber Monday', 'N Independencia de Cuenca', 'N Independencia de Guayaquil', 'N Viernes Santo']

# for new_holiday in holidays_to_add:
#     future_cols.append(new_holiday)

# extra time-based features; every name is listed exactly once so that no
# covariate column is requested twice
time_based_to_add = [
    'day_of_month', 'is_wknd',
    'is_year_end', 'wageday', 'day_to_nearest_holiday', 'day_from_nearest_holiday',
    'is_quarter_start', 'week_of_month',
    'is_year_start', 'is_quarter_end', 'quarter', 'season', 'is_month_start', 'is_month_end',
]

# "izbaceni" (presumably "excluded"): not used anywhere in this cell
izbaceni = ['is_year_start', 'is_quarter_end', 'quarter', 'season', 'is_quarter_end', 'is_month_start', 'is_month_end']

future_cols.extend(time_based_to_add)

# additional past and future covariates from computing the moving averages
past_ma_cols = None
future_ma_cols = ["oil", "onpromotion"]

past_dict, future_dict = get_covariates(past_cols, future_cols, past_ma_cols, future_ma_cols)

TRAINER_CONFIG = dict(
    # the time series data previously extracted
    target_dict=target_dict,
    pipe_dict=pipe_dict,
    id_dict=id_dict,
    past_dict=past_dict,
    future_dict=future_dict,

    # time series cross-validation using a rolling forecasting origin
    forecast_horizon=16,  # length of the validation set
    folds=1,  # number of training sets; 1 means the standard train-validation split

    # number of previous days checked for zero sales; if all of them are
    # zero, zero forecasts are generated
    zero_fc_window=21,

    # covariate selection: a list of names, None to use none, or 'keep_all'
    # to include everything. Static covariates can be picked from
    # ['city', 'state', 'cluster', 'type', 'store_nbr'] (all one-hot encoded
    # columns are extracted).
    static_covs="keep_all",
    past_covs="keep_all",
    future_covs="keep_all",
)

# initialize model trainer
trainer = Trainer(**TRAINER_CONFIG)

def _gbdt_config(lags, n_estimators):
    """Return the config dict of one LightGBM ensemble member.

    The four ensemble members share every setting except the number of
    target-series lags (`lags`) and the number of boosting rounds
    (`n_estimators`). Covariate lags are set to None when the matching
    entry of TRAINER_CONFIG is None.
    """
    return {
        "random_state": 0,

        # the number of lag values of the target series
        "lags": lags,

        # lags of the past covariates: -16, -17, ..., -22
        "lags_past_covariates": (
            None if TRAINER_CONFIG["past_covs"] is None else list(range(-16, -23, -1))
        ),

        # (past, future) lag specification of the future covariates
        "lags_future_covariates": (
            None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
        ),

        # the number of days ahead that the model is forecasting given today's input data
        "output_chunk_length": 1,

        # booster settings; the trailing notes are carried over from the
        # original cell (they appear to record the library defaults)
        "n_estimators": n_estimators,  # num_iterations 100
        "learning_rate": 0.05,  # 0.1
        # "max_depth": -1 is intentionally left out (noted as useful when data is small)
        "subsample": 1,  # 1
        "lambda_l1": 0,  # l1 0
        "lambda_l2": 0,  # l2 0
        "verbose": -1,
    }


GBDT_CONFIG1 = _gbdt_config(lags=120, n_estimators=300)
GBDT_CONFIG2 = _gbdt_config(lags=14, n_estimators=150)
GBDT_CONFIG3 = _gbdt_config(lags=365, n_estimators=200)
GBDT_CONFIG4 = _gbdt_config(lags=730, n_estimators=100)

# one model name per config: 'lgbm' for LightGBM, 'xgb' for XGBoost, 'cat' for CatBoost
ENS_CONFIGS = [GBDT_CONFIG1, GBDT_CONFIG2, GBDT_CONFIG3, GBDT_CONFIG4]
ENS_MODELS = ["lgbm"] * len(ENS_CONFIGS)

# Ensemble forecasts from models trained on the entire data.
predictions1 = trainer.ensemble_predict(model_names=ENS_MODELS, model_configs=ENS_CONFIGS)

# Ensemble forecasts from models trained on a subset of the data.
predictions2 = trainer.ensemble_predict(
    model_names=ENS_MODELS, model_configs=ENS_CONFIGS, drop_before="2015-01-01",
)

# Average the two runs: pair their rows on (date, store_nbr, family), take
# the row-wise mean of the two suffixed sales columns, then drop those columns.
final_predictions = predictions1.merge(
    predictions2, how="left", on=["date", "store_nbr", "family"],
)
final_predictions = final_predictions.assign(
    sales=final_predictions[["sales_x", "sales_y"]].mean(axis=1),
).drop(columns=["sales_x", "sales_y"])

# Test set with parsed dates; `prepare_submission` reads this global.
test = pd.read_csv("originalni_datasetovi/test.csv", parse_dates=["date"])

def prepare_submission(predictions):
    """Attach the test-set `id` to every forecast row.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Forecasts with the columns "date", "store_nbr", "family" and "sales",
        where "store_nbr" holds labels such as "store_nbr_1".

    Returns
    -------
    pandas.DataFrame
        The columns "id" and "sales", obtained by left-joining `predictions`
        onto the global `test` frame. Rows of `test` without a matching
        forecast get NaN sales.
    """
    # work on a copy so the caller's frame is left untouched
    predictions = predictions.copy()

    # process column values for merging: "store_nbr_<n>" -> integer <n>
    predictions.store_nbr = predictions.store_nbr.replace(
        "store_nbr_", "", regex=True,
    ).astype(int)

    # match with corresponding 'id'
    submission = test.merge(
        predictions, on=["date", "store_nbr", "family"], how="left",
    )[["id", "sales"]]

    return submission

# Build the submission frame from the averaged forecasts and write it to
# disk without the index column.
submission = prepare_submission(predictions=final_predictions)
submission.to_csv(path_or_buf="submission_bez_praznika.csv", index=False)

Extracting covariates:   0%|          | 0/33 [00:00<?, ?it/s]

Setting up:   0%|          | 0/33 [00:00<?, ?it/s]

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

: 

In [27]:
# future covariates
selected_holidays = [
    "nat_terremoto", "nat_navidad", "nat_dia la madre", "nat_dia trabajo",
    "nat_primer dia ano", "nat_futbol", "nat_dia difuntos", "nat_batalla_de_pichincha",
    "nat_carnaval", 'nat_cyber_monday', 'nat_independencia_de_cuenca',
    'nat_independencia_de_guayaquil', 'n_viernes_santo']

future_cols = [
    "oil", "onpromotion",
    "day", "month", "year", "day_of_week", "day_of_year", "week_of_year", "date_index",
    "work_day", *selected_holidays,
]

# extra time-based features; every name is listed exactly once so that no
# covariate column is requested twice
time_based_to_add = [
    'day_of_month', 'is_wknd',
    'is_year_end', 'wageday', 'day_to_nearest_holiday', 'day_from_nearest_holiday',
    'is_quarter_start', 'week_of_month',
    'is_year_start', 'is_quarter_end', 'quarter', 'season', 'is_month_start', 'is_month_end',
]

# "izbaceni" (presumably "excluded"): not used anywhere in this cell
izbaceni = ['is_year_start', 'is_quarter_end', 'quarter', 'season', 'is_quarter_end', 'is_month_start', 'is_month_end']

future_cols.extend(time_based_to_add)

# additional past and future covariates from computing the moving averages
past_ma_cols = None
future_ma_cols = ["oil", "onpromotion"]

past_dict, future_dict = get_covariates(past_cols, future_cols, past_ma_cols, future_ma_cols)

TRAINER_CONFIG = dict(
    # the time series data previously extracted
    target_dict=target_dict,
    pipe_dict=pipe_dict,
    id_dict=id_dict,
    past_dict=past_dict,
    future_dict=future_dict,

    # time series cross-validation using a rolling forecasting origin
    forecast_horizon=16,  # length of the validation set
    folds=1,  # number of training sets; 1 means the standard train-validation split

    # number of previous days checked for zero sales; if all of them are
    # zero, zero forecasts are generated
    zero_fc_window=21,

    # covariate selection: a list of names, None to use none, or 'keep_all'
    # to include everything. Static covariates can be picked from
    # ['city', 'state', 'cluster', 'type', 'store_nbr'] (all one-hot encoded
    # columns are extracted).
    static_covs="keep_all",
    past_covs="keep_all",
    future_covs="keep_all",
)

# initialize model trainer
trainer = Trainer(**TRAINER_CONFIG)

def _default_gbdt_config(lags):
    """Return the config dict of one ensemble member with no booster overrides.

    The four ensemble members differ only in the number of target-series
    lags (`lags`). Covariate lags are set to None when the matching entry
    of TRAINER_CONFIG is None.
    """
    return {
        "random_state": 0,

        # the number of lag values of the target series
        "lags": lags,

        # lags of the past covariates: -16, -17, ..., -22
        "lags_past_covariates": (
            None if TRAINER_CONFIG["past_covs"] is None else list(range(-16, -23, -1))
        ),

        # (past, future) lag specification of the future covariates
        "lags_future_covariates": (
            None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
        ),

        # the number of days ahead that the model is forecasting given today's input data
        "output_chunk_length": 1,
    }


GBDT_CONFIG1 = _default_gbdt_config(lags=120)
GBDT_CONFIG2 = _default_gbdt_config(lags=14)
GBDT_CONFIG3 = _default_gbdt_config(lags=365)
GBDT_CONFIG4 = _default_gbdt_config(lags=730)

# one model name per config: 'lgbm' for LightGBM, 'xgb' for XGBoost, 'cat' for CatBoost
ENS_CONFIGS = [GBDT_CONFIG1, GBDT_CONFIG2, GBDT_CONFIG3, GBDT_CONFIG4]
ENS_MODELS = ["lgbm"] * len(ENS_CONFIGS)

# Ensemble forecasts from models trained on the entire data.
predictions1 = trainer.ensemble_predict(model_names=ENS_MODELS, model_configs=ENS_CONFIGS)

# Ensemble forecasts from models trained on a subset of the data.
predictions2 = trainer.ensemble_predict(
    model_names=ENS_MODELS, model_configs=ENS_CONFIGS, drop_before="2015-01-01",
)

# Average the two runs: pair their rows on (date, store_nbr, family), take
# the row-wise mean of the two suffixed sales columns, then drop those columns.
final_predictions = predictions1.merge(
    predictions2, how="left", on=["date", "store_nbr", "family"],
)
final_predictions = final_predictions.assign(
    sales=final_predictions[["sales_x", "sales_y"]].mean(axis=1),
).drop(columns=["sales_x", "sales_y"])

# Test set with parsed dates; `prepare_submission` reads this global.
test = pd.read_csv("originalni_datasetovi/test.csv", parse_dates=["date"])

def prepare_submission(predictions):
    """Attach the test-set `id` to every forecast row.

    `predictions` needs the columns "date", "store_nbr", "family" and
    "sales", with "store_nbr" holding labels such as "store_nbr_1". The
    input frame is not modified. The result has the columns "id" and
    "sales" and comes from left-joining the forecasts onto the global
    `test` frame.
    """
    # "store_nbr_<n>" -> integer <n>, so the key matches the one in `test`
    store_ids = predictions["store_nbr"].replace("store_nbr_", "", regex=True).astype(int)
    keyed = predictions.assign(store_nbr=store_ids)

    # look up the 'id' of every (date, store_nbr, family) combination
    joined = test.merge(keyed, how="left", on=["date", "store_nbr", "family"])
    return joined[["id", "sales"]]

# Build the submission frame from the averaged forecasts and write it to
# disk without the index column.
submission = prepare_submission(predictions=final_predictions)
submission.to_csv(path_or_buf="submission_def_lgbm.csv", index=False)

Extracting covariates:   0%|          | 0/33 [00:00<?, ?it/s]

Setting up:   0%|          | 0/33 [00:00<?, ?it/s]

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.414135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 61803
[LightGBM] [Info] Number of data points in the train set: 84672, number of used features: 901
[LightGBM] [Info] Start training from score 0.463530
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 34773
[LightGBM] [Info] Number of data points in the train set: 89964, number of used features: 795
[LightGBM] [Info] Start training from score 0.457790
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.812314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 124248
[LightGBM] [Info] Number of data points in the tra

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.138730 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 61683
[LightGBM] [Info] Number of data points in the train set: 45252, number of used features: 886
[LightGBM] [Info] Start training from score 0.511778
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047214 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 34672
[LightGBM] [Info] Number of data points in the train set: 50976, number of used features: 780
[LightGBM] [Info] Start training from score 0.503837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.330276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 124254
[LightGBM] [Info] Number of data points in the tra

In [24]:
# Holiday indicator columns used as future covariates.
selected_holidays = [
    "nat_terremoto", "nat_navidad", "nat_dia la madre", "nat_dia trabajo",
    "nat_primer dia ano", "nat_futbol", "nat_dia difuntos", "nat_batalla_de_pichincha",
    "nat_carnaval", "nat_cyber_monday", "nat_independencia_de_cuenca",
    "nat_independencia_de_guayaquil", "n_viernes_santo",
]

# Extra time-based features kept for this run.
time_based_to_add = [
    "day_of_month", "is_wknd", "wageday", "day_to_nearest_holiday", "is_month_start",
    "day_from_nearest_holiday", "is_quarter_start", "week_of_month",
]

# "izbaceni" (presumably "excluded"): features left out of this run; the
# list itself is not used by the cell.
izbaceni = [
    "is_year_start", "is_quarter_end", "quarter", "season",
    "is_quarter_end", "is_month_end", "is_year_end",
]

# Future covariates: base columns, then the holidays, then the extra features.
future_cols = [
    "oil", "onpromotion",
    "day", "month", "year", "day_of_week", "day_of_year", "week_of_year", "date_index",
    "work_day",
] + selected_holidays
future_cols += time_based_to_add

# additional past and future covariates from computing the moving averages
past_ma_cols = None
future_ma_cols = ["oil", "onpromotion"]

past_dict, future_dict = get_covariates(past_cols, future_cols, past_ma_cols, future_ma_cols)

TRAINER_CONFIG = dict(
    # the time series data previously extracted
    target_dict=target_dict,
    pipe_dict=pipe_dict,
    id_dict=id_dict,
    past_dict=past_dict,
    future_dict=future_dict,

    # time series cross-validation using a rolling forecasting origin
    forecast_horizon=16,  # length of the validation set
    folds=1,  # number of training sets; 1 means the standard train-validation split

    # number of previous days checked for zero sales; if all of them are
    # zero, zero forecasts are generated
    zero_fc_window=21,

    # covariate selection: a list of names, None to use none, or 'keep_all'
    # to include everything. Static covariates can be picked from
    # ['city', 'state', 'cluster', 'type', 'store_nbr'] (all one-hot encoded
    # columns are extracted).
    static_covs="keep_all",
    past_covs="keep_all",
    future_covs="keep_all",
)

# initialize model trainer
trainer = Trainer(**TRAINER_CONFIG)

def _gbdt_config(lags, n_estimators):
    """Return the config dict of one LightGBM ensemble member.

    The four ensemble members share every setting except the number of
    target-series lags (`lags`) and the number of boosting rounds
    (`n_estimators`). Covariate lags are set to None when the matching
    entry of TRAINER_CONFIG is None.
    """
    return {
        "random_state": 0,

        # the number of lag values of the target series
        "lags": lags,

        # lags of the past covariates: -16, -17, ..., -22
        "lags_past_covariates": (
            None if TRAINER_CONFIG["past_covs"] is None else list(range(-16, -23, -1))
        ),

        # (past, future) lag specification of the future covariates
        "lags_future_covariates": (
            None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
        ),

        # the number of days ahead that the model is forecasting given today's input data
        "output_chunk_length": 1,

        # booster settings; the trailing notes are carried over from the
        # original cell (they appear to record the library defaults)
        "n_estimators": n_estimators,  # num_iterations 100
        "learning_rate": 0.05,  # 0.1
        # "max_depth": -1 is intentionally left out (noted as useful when data is small)
        "subsample": 1,  # 1
        "lambda_l1": 0,  # l1 0
        "lambda_l2": 0,  # l2 0
        "verbose": -1,
    }


GBDT_CONFIG1 = _gbdt_config(lags=120, n_estimators=300)
GBDT_CONFIG2 = _gbdt_config(lags=14, n_estimators=150)
GBDT_CONFIG3 = _gbdt_config(lags=365, n_estimators=200)
GBDT_CONFIG4 = _gbdt_config(lags=730, n_estimators=100)

# one model name per config: 'lgbm' for LightGBM, 'xgb' for XGBoost, 'cat' for CatBoost
ENS_CONFIGS = [GBDT_CONFIG1, GBDT_CONFIG2, GBDT_CONFIG3, GBDT_CONFIG4]
ENS_MODELS = ["lgbm"] * len(ENS_CONFIGS)

# Ensemble forecasts from models trained on the entire data.
predictions1 = trainer.ensemble_predict(model_names=ENS_MODELS, model_configs=ENS_CONFIGS)

# Ensemble forecasts from models trained on a subset of the data.
predictions2 = trainer.ensemble_predict(
    model_names=ENS_MODELS, model_configs=ENS_CONFIGS, drop_before="2015-01-01",
)

# Average the two runs: pair their rows on (date, store_nbr, family), take
# the row-wise mean of the two suffixed sales columns, then drop those columns.
final_predictions = predictions1.merge(
    predictions2, how="left", on=["date", "store_nbr", "family"],
)
final_predictions = final_predictions.assign(
    sales=final_predictions[["sales_x", "sales_y"]].mean(axis=1),
).drop(columns=["sales_x", "sales_y"])

# Test set with parsed dates; `prepare_submission` reads this global.
test = pd.read_csv("originalni_datasetovi/test.csv", parse_dates=["date"])

def prepare_submission(predictions):
    """Attach the test-set `id` to every forecast row.

    `predictions` needs the columns "date", "store_nbr", "family" and
    "sales", with "store_nbr" holding labels such as "store_nbr_1". The
    input frame is not modified. The result has the columns "id" and
    "sales" and comes from left-joining the forecasts onto the global
    `test` frame.
    """
    # "store_nbr_<n>" -> integer <n>, so the key matches the one in `test`
    store_ids = predictions["store_nbr"].replace("store_nbr_", "", regex=True).astype(int)
    keyed = predictions.assign(store_nbr=store_ids)

    # look up the 'id' of every (date, store_nbr, family) combination
    joined = test.merge(keyed, how="left", on=["date", "store_nbr", "family"])
    return joined[["id", "sales"]]

# Build the submission frame from the averaged forecasts and write it to
# disk without the index column.
submission = prepare_submission(predictions=final_predictions)
submission.to_csv(path_or_buf="submission_bez_time_neki.csv", index=False)

Extracting covariates:   0%|          | 0/33 [00:00<?, ?it/s]

Setting up:   0%|          | 0/33 [00:00<?, ?it/s]

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

In [25]:
# Holiday indicator columns used as future covariates.
selected_holidays = [
    "nat_terremoto", "nat_navidad", "nat_dia la madre", "nat_dia trabajo",
    "nat_primer dia ano", "nat_futbol", "nat_dia difuntos", "nat_batalla_de_pichincha",
    "nat_carnaval", "nat_cyber_monday", "nat_independencia_de_cuenca",
    "nat_independencia_de_guayaquil", "n_viernes_santo",
]

# Future covariates: base columns followed by the holiday indicators.
future_cols = [
    "oil", "onpromotion",
    "day", "month", "year", "day_of_week", "day_of_year", "week_of_year", "date_index",
    "work_day",
] + selected_holidays

# This run does not add the extra time-based features
# ('day_of_month', 'is_wknd', 'wageday', 'day_to_nearest_holiday', 'is_month_start',
#  'day_from_nearest_holiday', 'is_quarter_start', 'week_of_month');
# the code that appended them to `future_cols` is disabled here.

# additional past and future covariates from computing the moving averages
past_ma_cols = None
future_ma_cols = ["oil", "onpromotion"]

past_dict, future_dict = get_covariates(past_cols, future_cols, past_ma_cols, future_ma_cols)

TRAINER_CONFIG = dict(
    # the time series data previously extracted
    target_dict=target_dict,
    pipe_dict=pipe_dict,
    id_dict=id_dict,
    past_dict=past_dict,
    future_dict=future_dict,

    # time series cross-validation using a rolling forecasting origin
    forecast_horizon=16,  # length of the validation set
    folds=1,  # number of training sets; 1 means the standard train-validation split

    # number of previous days checked for zero sales; if all of them are
    # zero, zero forecasts are generated
    zero_fc_window=21,

    # covariate selection: a list of names, None to use none, or 'keep_all'
    # to include everything. Static covariates can be picked from
    # ['city', 'state', 'cluster', 'type', 'store_nbr'] (all one-hot encoded
    # columns are extracted).
    static_covs="keep_all",
    past_covs="keep_all",
    future_covs="keep_all",
)

# initialize model trainer
trainer = Trainer(**TRAINER_CONFIG)

def _gbdt_config(lags, n_estimators):
    """Return the config dict of one LightGBM ensemble member.

    The four ensemble members share every setting except the number of
    target-series lags (`lags`) and the number of boosting rounds
    (`n_estimators`). Covariate lags are set to None when the matching
    entry of TRAINER_CONFIG is None.
    """
    return {
        "random_state": 0,

        # the number of lag values of the target series
        "lags": lags,

        # lags of the past covariates: -16, -17, ..., -22
        "lags_past_covariates": (
            None if TRAINER_CONFIG["past_covs"] is None else list(range(-16, -23, -1))
        ),

        # (past, future) lag specification of the future covariates
        "lags_future_covariates": (
            None if TRAINER_CONFIG["future_covs"] is None else (14, 1)
        ),

        # the number of days ahead that the model is forecasting given today's input data
        "output_chunk_length": 1,

        # booster settings; the trailing notes are carried over from the
        # original cell (they appear to record the library defaults)
        "n_estimators": n_estimators,  # num_iterations 100
        "learning_rate": 0.05,  # 0.1
        # "max_depth": -1 is intentionally left out (noted as useful when data is small)
        "subsample": 1,  # 1
        "lambda_l1": 0,  # l1 0
        "lambda_l2": 0,  # l2 0
        "verbose": -1,
    }


GBDT_CONFIG1 = _gbdt_config(lags=120, n_estimators=300)
GBDT_CONFIG2 = _gbdt_config(lags=14, n_estimators=150)
GBDT_CONFIG3 = _gbdt_config(lags=365, n_estimators=200)
GBDT_CONFIG4 = _gbdt_config(lags=730, n_estimators=100)

# one model name per config: 'lgbm' for LightGBM, 'xgb' for XGBoost, 'cat' for CatBoost
ENS_CONFIGS = [GBDT_CONFIG1, GBDT_CONFIG2, GBDT_CONFIG3, GBDT_CONFIG4]
ENS_MODELS = ["lgbm"] * len(ENS_CONFIGS)

# Ensemble forecasts from models trained on the entire data.
predictions1 = trainer.ensemble_predict(model_names=ENS_MODELS, model_configs=ENS_CONFIGS)

# Ensemble forecasts from models trained on a subset of the data.
predictions2 = trainer.ensemble_predict(
    model_names=ENS_MODELS, model_configs=ENS_CONFIGS, drop_before="2015-01-01",
)

# Average the two runs: pair their rows on (date, store_nbr, family), take
# the row-wise mean of the two suffixed sales columns, then drop those columns.
final_predictions = predictions1.merge(
    predictions2, how="left", on=["date", "store_nbr", "family"],
)
final_predictions = final_predictions.assign(
    sales=final_predictions[["sales_x", "sales_y"]].mean(axis=1),
).drop(columns=["sales_x", "sales_y"])

# Test set with parsed dates; `prepare_submission` reads this global.
test = pd.read_csv("originalni_datasetovi/test.csv", parse_dates=["date"])

def prepare_submission(predictions, test_df=None):
    """Attach the test-set ``id`` to every forecast row.

    Parameters
    ----------
    predictions : pd.DataFrame
        Forecasts with ``date``, ``store_nbr``, ``family`` and ``sales``
        columns. ``store_nbr`` values may carry a ``"store_nbr_"`` prefix
        (e.g. ``"store_nbr_1"``); the prefix is stripped and the remainder
        cast to ``int`` before merging. The input frame is not modified.
    test_df : pd.DataFrame, optional
        Frame providing ``id`` plus the merge keys ``date``, ``store_nbr``
        and ``family``. When omitted, the notebook-level ``test`` frame is
        used, which keeps existing one-argument calls working.

    Returns
    -------
    pd.DataFrame
        Columns ``id`` and ``sales``, one row per row of the test frame.
        Rows without a matching prediction have ``NaN`` sales (left merge).
    """
    if test_df is None:
        # Backward-compatible default: rely on the notebook-level frame.
        test_df = test

    predictions = predictions.copy()

    # Normalise the store identifier so it can be matched against the
    # test frame's store_nbr (assumed to be integer-typed — the cast below
    # only makes sense in that case).
    predictions["store_nbr"] = (
        predictions["store_nbr"]
        .replace("store_nbr_", "", regex=True)
        .astype(int)
    )

    # Left merge keeps every test row (and its order) and pulls in the
    # corresponding forecast.
    submission = test_df.merge(
        predictions, on=["date", "store_nbr", "family"], how="left",
    )[["id", "sales"]]

    return submission

# Map the averaged forecasts onto test ids and write them out without the
# DataFrame index column.
submission = prepare_submission(predictions=final_predictions)
submission.to_csv(path_or_buf="submission_bez_nasih.csv", index=False)

Extracting covariates:   0%|          | 0/33 [00:00<?, ?it/s]

Setting up:   0%|          | 0/33 [00:00<?, ?it/s]

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

In [145]:
# Pair the two forecast frames on their key columns, then collapse the
# suffixed sales columns (sales_x / sales_y) into their row-wise mean.
final_predictions = (
    predictions1
    .merge(predictions2, how="left", on=["date", "store_nbr", "family"])
    .assign(sales=lambda paired: paired[["sales_x", "sales_y"]].mean(axis=1))
    .drop(columns=["sales_x", "sales_y"])
)

# preview the first five averaged forecasts
final_predictions.head(5)

Unnamed: 0,date,store_nbr,family,sales
0,2017-08-16,store_nbr_1,AUTOMOTIVE,3.440428
1,2017-08-17,store_nbr_1,AUTOMOTIVE,3.352807
2,2017-08-18,store_nbr_1,AUTOMOTIVE,3.785472
3,2017-08-19,store_nbr_1,AUTOMOTIVE,4.44359
4,2017-08-20,store_nbr_1,AUTOMOTIVE,2.313947


### Preparing for the submission.

In [146]:
# Test set used to look up submission ids; 'date' is parsed to datetimes
# because it is one of the merge keys in prepare_submission.
test = pd.read_csv(
    'originalni_datasetovi/test.csv',
    parse_dates=['date'],
)

def prepare_submission(predictions, test_df=None):
    """Attach the test-set ``id`` to every forecast row.

    Parameters
    ----------
    predictions : pd.DataFrame
        Forecasts with ``date``, ``store_nbr``, ``family`` and ``sales``
        columns. ``store_nbr`` values may carry a ``"store_nbr_"`` prefix
        (e.g. ``"store_nbr_1"``); the prefix is stripped and the remainder
        cast to ``int`` before merging. The input frame is not modified.
    test_df : pd.DataFrame, optional
        Frame providing ``id`` plus the merge keys ``date``, ``store_nbr``
        and ``family``. When omitted, the notebook-level ``test`` frame is
        used, which keeps existing one-argument calls working.

    Returns
    -------
    pd.DataFrame
        Columns ``id`` and ``sales``, one row per row of the test frame.
        Rows without a matching prediction have ``NaN`` sales (left merge).
    """
    if test_df is None:
        # Backward-compatible default: rely on the notebook-level frame.
        test_df = test

    predictions = predictions.copy()

    # Normalise the store identifier so it can be matched against the
    # test frame's store_nbr (assumed to be integer-typed — the cast below
    # only makes sense in that case).
    predictions["store_nbr"] = (
        predictions["store_nbr"]
        .replace("store_nbr_", "", regex=True)
        .astype(int)
    )

    # Left merge keeps every test row (and its order) and pulls in the
    # corresponding forecast.
    submission = test_df.merge(
        predictions, on=["date", "store_nbr", "family"], how="left",
    )[["id", "sales"]]

    return submission

In [147]:

# Build the (id, sales) submission frame from the averaged forecasts and
# preview its first five rows.
submission = prepare_submission(predictions=final_predictions)
submission.head(5)

Unnamed: 0,id,sales
0,3000888,3.440428
1,3000889,0.0
2,3000890,4.219843
3,3000891,2409.537119
4,3000892,0.041166


In [148]:
# Persist the submission; the DataFrame index is not written as a column.
submission.to_csv(path_or_buf="submission.csv", index=False)