In [7]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [19]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries

In [9]:
from typing import Union, Tuple, Optional, Callable
import numpy as np
import pandas as pd
import warnings
import logging
from copy import deepcopy
from joblib import Parallel, delayed, cpu_count
from tqdm.auto import tqdm
import sklearn.pipeline
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_log_error,
)
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import ParameterSampler
import optuna
from optuna.samplers import TPESampler, RandomSampler

from skforecast.exceptions import LongTrainingWarning
from skforecast.exceptions import IgnoredArgumentWarning
from skforecast.utils import check_backtesting_input
from skforecast.utils import select_n_jobs_backtesting

optuna.logging.set_verbosity(optuna.logging.WARNING) # disable optuna logs

logging.basicConfig(
    format = '%(name)-10s %(levelname)-5s %(message)s', 
    level  = logging.INFO,
)

In [10]:
# Data download
# ==============================================================================
url = (
       'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/'
       'data/simulated_items_sales.csv'
)
data = pd.read_csv(url, sep=',')

# Data preparation
# ==============================================================================
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
data = data.set_index('date')
data = data.asfreq('D')
data = data.sort_index()
data.head()

# Split data into train-val-test
# ==============================================================================
end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train, :].copy()
data_test  = data.loc[end_train:, :].copy()

print(
    f"Train dates : {data_train.index.min()} --- {data_train.index.max()}   "
    f"(n={len(data_train)})"
)
print(
    f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}   "
    f"(n={len(data_test)})"
)

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00   (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00   (n=170)


In [11]:
# Split data into train-val-test
# ==============================================================================
end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train, :].copy()
data_test  = data.loc[end_train:, :].copy()

print(
    f"Train dates : {data_train.index.min()} --- {data_train.index.max()}   "
    f"(n={len(data_train)})"
)
print(
    f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}   "
    f"(n={len(data_test)})"
)

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00   (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00   (n=170)


## Functions

In [83]:
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries

def _bayesian_search_optuna_multiseries(
    forecaster,
    series: pd.DataFrame,
    search_space: Callable,
    steps: int,
    metric: Union[str, Callable, list],
    initial_train_size: int,
    fixed_train_size: bool=True,
    gap: int=0,
    allow_incomplete_fold: bool=True,
    levels: Optional[Union[str, list]]=None,
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None,
    lags_grid: Optional[list]=None,
    refit: Optional[Union[bool, int]]=False,
    n_trials: int=10,
    random_state: int=123,
    return_best: bool=True,
    n_jobs: Optional[Union[int, str]]='auto',
    verbose: bool=True,
    show_progress: bool=True,
    kwargs_create_study: dict={},
    kwargs_study_optimize: dict={}
) -> Tuple[pd.DataFrame, object]:
    """
    """

    if return_best and exog is not None and (len(exog) != len(series)):
        raise ValueError(
            (f"`exog` must have same number of samples as `series`. "
             f"length `exog`: ({len(exog)}), length `series`: ({len(series)})")
        )

    if type(forecaster).__name__ in ['ForecasterAutoregMultiSeries', 
                                     'ForecasterAutoregMultiSeriesCustom']  \
        and levels is not None and not isinstance(levels, (str, list)):
        raise TypeError(
            ("`levels` must be a `list` of column names, a `str` of a column "
             "name or `None`.")
        )

    if type(forecaster).__name__ == 'ForecasterAutoregMultiVariate':
        if levels and levels != forecaster.level and levels != [forecaster.level]:
            warnings.warn(
                (f"`levels` argument have no use when the forecaster is of type "
                 f"ForecasterAutoregMultiVariate. The level of this forecaster "
                 f"is {forecaster.level}, to predict another level, change "
                 f"the `level` argument when initializing the forecaster. \n"),
                 IgnoredArgumentWarning
            )
        levels = [forecaster.level]
    else:
        if levels is None:
            # Forecaster can be not fitted, so cannot use self.series_col_names
            levels = list(series.columns) 
        elif isinstance(levels, str):
            levels = [levels]

    if type(forecaster).__name__ == 'ForecasterAutoregMultiSeriesCustom':
        if lags_grid is not None:
            warnings.warn(
                "`lags_grid` ignored if forecaster is an instance of `ForecasterAutoregMultiSeriesCustom`.",
                IgnoredArgumentWarning
            )
        lags_grid = ['custom predictors']
        
    elif lags_grid is None:
        lags_grid = [forecaster.lags]
   
    lags_list = []
    params_list = []
    results_opt_best = None
    if not isinstance(metric, list):
        metric = [metric] 
    metric_dict = {(m if isinstance(m, str) else m.__name__): [] 
                   for m in metric}
    
    if len(metric_dict) != len(metric):
        raise ValueError(
            "When `metric` is a `list`, each metric name must be unique."
        )

    # Objective function using backtesting_forecaster
    def _objective(
        trial,
        search_space          = search_space,
        forecaster            = forecaster,
        series                = series,
        exog                  = exog,
        steps                 = steps,
        levels                = levels,
        metric                = metric,
        initial_train_size    = initial_train_size,
        fixed_train_size      = fixed_train_size,
        gap                   = gap,
        allow_incomplete_fold = allow_incomplete_fold,
        refit                 = refit,
        n_jobs                = n_jobs,
        verbose               = verbose
    ) -> float:
        
        forecaster.set_params(search_space(trial))
        
        metrics_levels = backtesting_forecaster_multiseries(
                             forecaster            = forecaster,
                             series                = series,
                             exog                  = exog,
                             steps                 = steps,
                             levels                = levels,
                             metric                = metric,
                             initial_train_size    = initial_train_size,
                             fixed_train_size      = fixed_train_size,
                             gap                   = gap,
                             allow_incomplete_fold = allow_incomplete_fold,
                             refit                 = refit,
                             n_jobs                = n_jobs,
                             verbose               = verbose,
                             show_progress         = False
                         )[0]
        # Store metrics in the variable metric_values defined outside _objective.
        nonlocal metric_values
        metric_values.append(metrics_levels)

        return abs(metrics_levels.iloc[:, 1].mean())

    print(
        f"""Number of models compared: {n_trials*len(lags_grid)},
         {n_trials} bayesian search in each lag configuration."""
    )

    if show_progress:
        lags_grid = tqdm(lags_grid, desc='lags grid', position=0)

    for lags in lags_grid:
        metric_values = [] # This variable will be modified inside _objective function. 
        # It is a trick to extract multiple values from _objective function since
        # only the optimized value can be returned.

        if type(forecaster).__name__ != 'ForecasterAutoregMultiSeriesCustom':
            forecaster.set_lags(lags)
            lags = forecaster.lags.copy()
        
        if 'sampler' in kwargs_create_study.keys():
            kwargs_create_study['sampler']._rng = np.random.RandomState(random_state)
            kwargs_create_study['sampler']._random_sampler = RandomSampler(seed=random_state)

        study = optuna.create_study(**kwargs_create_study)

        if 'sampler' not in kwargs_create_study.keys():
            study.sampler = TPESampler(seed=random_state)

        study.optimize(_objective, n_trials=n_trials, **kwargs_study_optimize)

        best_trial = study.best_trial

        if search_space(best_trial).keys() != best_trial.params.keys():
            raise ValueError(
                f"""Some of the key values do not match the search_space key names.
                Dict keys     : {list(search_space(best_trial).keys())}
                Trial objects : {list(best_trial.params.keys())}."""
            )
        
        for i, trial in enumerate(study.get_trials()):
            params_list.append(trial.params)
            lags_list.append(lags)

            m_values = metric_values[i]
            for m in metric:
                m_name = m if isinstance(m, str) else m.__name__
                metric_dict[m_name].append(m_values[m_name].mean())
        
        if results_opt_best is None:
            results_opt_best = best_trial
        else:
            if best_trial.value < results_opt_best.value:
                results_opt_best = best_trial
        
        print(metric_dict)

    results = pd.DataFrame({
                  'levels': [levels]*len(lags_list),
                  'lags'  : lags_list,
                  'params': params_list,
                  **metric_dict
              })

    results = results.sort_values(by=list(metric_dict.keys())[0], ascending=True)
    results = pd.concat([results, results['params'].apply(pd.Series)], axis=1)
    
    if return_best:
        
        best_lags = results['lags'].iloc[0]
        best_params = results['params'].iloc[0]
        best_metric = results[list(metric_dict.keys())[0]].iloc[0]
        
        if type(forecaster).__name__ != 'ForecasterAutoregMultiSeriesCustom':
            forecaster.set_lags(best_lags)
        forecaster.set_params(best_params)
        forecaster.fit(series=series, exog=exog, store_in_sample_residuals=True)
        
        print(
            f"`Forecaster` refitted using the best-found lags and parameters, "
            f"and the whole data set: \n"
            f"  Lags: {best_lags} \n"
            f"  Parameters: {best_params}\n"
            f"  Backtesting metric: {best_metric}\n"
            f"  Levels: {results['levels'].iloc[0]}\n"
        )
            
    return results, results_opt_best

## Tests

In [80]:
# Create Forecaster multi series
# ==============================================================================
forecaster = ForecasterAutoregMultiSeries(
                 regressor          = RandomForestRegressor(random_state=123),
                 lags               = 2,
                 transformer_series = StandardScaler(),
             )

In [84]:
# Grid search Multi Series
# ==============================================================================
lags_grid = [2, 4]

# Regressor hyperparameters search space
def search_space(trial):
    search_space  = {
        'n_estimators'     : trial.suggest_int('n_estimators', 10, 15),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1., 3),
        'max_features'     : trial.suggest_categorical('max_features', ['log2', 'sqrt'])
    } 
    return search_space

levels = ['item_1', 'item_2', 'item_3']

results, _ = _bayesian_search_optuna_multiseries(
              forecaster         = forecaster,
              series             = data,
              exog               = None,
              levels             = levels, 
              lags_grid          = lags_grid,
              search_space       = search_space,
              steps              = 24,
              metric             = ['mean_absolute_error', 'mean_squared_error'],
              initial_train_size = len(data_train),
              n_trials           = 3,
              refit              = True,
              fixed_train_size   = True,
              return_best        = False,
              n_jobs             = 'auto',
              verbose            = False,
              show_progress      = True,
              kwargs_create_study   = {},
              kwargs_study_optimize = {}
          )

Number of models compared: 6,
         3 bayesian search in each lag configuration.


lags grid:   0%|          | 0/2 [00:00<?, ?it/s]

{'mean_absolute_error': [3.22344942620262, 3.6977824960678594, 3.467055410479105], 'mean_squared_error': [17.84917034121136, 23.649984681614825, 20.528829838529116]}
{'mean_absolute_error': [3.22344942620262, 3.6977824960678594, 3.467055410479105, 3.001566882551417, 2.8922785933104365, 3.2342470300487243], 'mean_squared_error': [17.84917034121136, 23.649984681614825, 20.528829838529116, 18.234133437451117, 15.190501124742214, 18.321157453862853]}


In [82]:
results

Unnamed: 0,levels,lags,params,mean_absolute_error,mean_squared_error,n_estimators,min_samples_leaf,max_features
4,"[item_1, item_2, item_3]","[1, 2, 3, 4]","{'n_estimators': 14, 'min_samples_leaf': 2, 'm...",2.892279,15.190501,14,2,log2
3,"[item_1, item_2, item_3]","[1, 2, 3, 4]","{'n_estimators': 14, 'min_samples_leaf': 1, 'm...",3.001567,18.234133,14,1,sqrt
0,"[item_1, item_2, item_3]","[1, 2]","{'n_estimators': 14, 'min_samples_leaf': 1, 'm...",3.223449,17.84917,14,1,sqrt
5,"[item_1, item_2, item_3]","[1, 2, 3, 4]","{'n_estimators': 12, 'min_samples_leaf': 2, 'm...",3.234247,18.321157,12,2,sqrt
2,"[item_1, item_2, item_3]","[1, 2]","{'n_estimators': 12, 'min_samples_leaf': 2, 'm...",3.467055,20.52883,12,2,sqrt
1,"[item_1, item_2, item_3]","[1, 2]","{'n_estimators': 14, 'min_samples_leaf': 2, 'm...",3.697782,23.649985,14,2,log2


In [33]:
print(metric)
print(metric_values)

['mean_absolute_error']
[   levels  mean_absolute_error
0  item_1                    1
1  item_2                    2
2  item_3                    3]


In [39]:
for m_values in metric_values:
    print(m_values)

   levels  mean_absolute_error
0  item_1                    1
1  item_2                    2
2  item_3                    3


In [68]:
metric = ['mean_absolute_error', 'mean_squared_error']
metric_values = [pd.DataFrame({'levels': ['item_1', 'item_2', 'item_3'],
                                'mean_absolute_error': [1, 2, 3], 
                               'mean_squared_error': [4, 5, 6]})]

In [72]:
metric_values[0]

Unnamed: 0,levels,mean_absolute_error,mean_squared_error
0,item_1,1,4
1,item_2,2,5
2,item_3,3,6


In [74]:
metric_values[0]

Unnamed: 0,levels,mean_absolute_error,mean_squared_error
0,item_1,1,4
1,item_2,2,5
2,item_3,3,6


In [70]:
for m, m_values in zip(metric, metric_values[0]):
    m_name = m if isinstance(m, str) else m.__name__
    
    print(m_name)
    print(m_values)

mean_absolute_error
levels
mean_squared_error
mean_absolute_error
