In [27]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'/home/ubuntu/varios/skforecast'

In [28]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import bayesian_search_forecaster_multiseries

In [29]:
# Download the simulated item-sales data set
# ==============================================================================
url = (
    'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/'
    'data/simulated_items_sales.csv'
)

# Prepare the data in a single chain: parse the dates, index the frame by
# date, enforce a daily frequency and sort chronologically.
# ==============================================================================
data = (
    pd.read_csv(url, sep=',')
      .assign(date=lambda raw: pd.to_datetime(raw['date'], format='%Y-%m-%d'))
      .set_index('date')
      .asfreq('D')
      .sort_index()
)
data.head()

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737


In [30]:
# Split data into train and test partitions
# ==============================================================================
# The cutoff is a timestamp late on the last training day: label-based slicing
# keeps that day in the training set and the test set starts on the next day.
end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train].copy()
data_test = data.loc[end_train:].copy()

# One summary line per partition: first date, last date and number of rows.
print(
    "\n".join(
        f"{label}{part.index.min()} --- {part.index.max()}   (n={len(part)})"
        for label, part in (
            ("Train dates : ", data_train),
            ("Test dates  : ", data_test),
        )
    )
)

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00   (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00   (n=170)


In [31]:
# Build the multi-series forecaster
# ==============================================================================
# Ridge regressor with a fixed seed, `lags=24`, and a StandardScaler applied to
# the series. No exogenous transformer and no weighting are configured.
forecaster = ForecasterAutoregMultiSeries(
    regressor=Ridge(random_state=123),
    lags=24,
    transformer_series=StandardScaler(),
    transformer_exog=None,
    weight_func=None,
    series_weights=None,
)

In [32]:
# Bayesian search Multi Series
# ==============================================================================
levels = ['item_1', 'item_2', 'item_3']

def search_space(trial):
    """
    Define the hyperparameter space sampled in each search trial.

    Parameters
    ----------
    trial : optuna trial
        Object whose `suggest_*` methods draw the candidate values.

    Returns
    -------
    dict
        Candidate values keyed by name: `alpha` (float between 0.01 and 1)
        and `lags` (either 24 or 48).
    """
    candidates = {
        'alpha' : trial.suggest_float('alpha', 0.01, 1),
        'lags'  : trial.suggest_categorical('lags', [24, 48])
    }
    return candidates

# `n_trials` and `random_state` are spelled out so that the number of trials
# and the sampled candidates do not depend on the library defaults.
results, best_trial = bayesian_search_forecaster_multiseries(
              forecaster         = forecaster,
              series             = data,
              exog               = None,
              levels             = levels, # Same as levels=None
              search_space       = search_space,
              steps              = 24,
              metric             = 'mean_absolute_error',
              initial_train_size = len(data_train),
              refit              = True,
              fixed_train_size   = True,
              n_trials           = 10,
              random_state       = 123,
              return_best        = True,
              n_jobs             = 'auto',
              verbose            = False,
              show_progress      = True
          )

results

  0%|          | 0/10 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48] 
  Parameters: {'alpha': 0.7252189487445193}
  Backtesting metric: 2.2076640698475622
  Levels: ['item_1', 'item_2', 'item_3']



Unnamed: 0,levels,lags,params,mean_absolute_error,alpha
8,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.7252189487445193},2.207664,0.725219
6,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.53623586010342},2.207675,0.536236
4,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.4441865222328282},2.20768,0.444187
3,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.398196343012209},2.207683,0.398196
9,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.23598059857016607},2.207693,0.235981
2,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.9809565564007693},2.335041,0.980957
7,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.8509374761370117},2.335057,0.850937
5,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.7406154516747153},2.335071,0.740615
0,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.6995044937418831},2.335076,0.699504
1,"[item_1, item_2, item_3]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",{'alpha': 0.5558016213920624},2.335093,0.555802


In [144]:
import re
import pytest
from sklearn.linear_model import Ridge
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate
from skforecast.model_selection_multiseries.model_selection_multiseries import _initialize_levels_model_selection_multiseries
from skforecast.exceptions import IgnoredArgumentWarning
from typing import Union, Tuple, Optional, Callable, Any
import numpy as np
import pandas as pd
import warnings
import logging
import os
from copy import deepcopy
from joblib import Parallel, delayed, cpu_count
from tqdm.auto import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import ParameterSampler
import optuna
from optuna.samplers import TPESampler
import re
import pytest
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from skforecast.exceptions import IgnoredArgumentWarning
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import backtesting_forecaster_multivariate
import os
import re
import sys
import pytest
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries.model_selection_multiseries import _bayesian_search_optuna_multiseries
from skforecast.model_selection_multiseries import bayesian_search_forecaster_multivariate
import optuna
from optuna.samplers import TPESampler
from tqdm import tqdm
from functools import partialmethod

import numpy as np
import pandas as pd

# Fixtures
# Two fixed series of 50 observations each, stored as columns 'l1' and 'l2'.
# The values are hard-coded (presumably frozen from `np.random.rand(50)` draws)
# so that every run works on exactly the same data.
series = pd.DataFrame({
    'l1': np.array([
        0.69646919, 0.28613933, 0.22685145, 0.55131477, 0.71946897,
        0.42310646, 0.9807642, 0.68482974, 0.4809319, 0.39211752,
        0.34317802, 0.72904971, 0.43857224, 0.0596779, 0.39804426,
        0.73799541, 0.18249173, 0.17545176, 0.53155137, 0.53182759,
        0.63440096, 0.84943179, 0.72445532, 0.61102351, 0.72244338,
        0.32295891, 0.36178866, 0.22826323, 0.29371405, 0.63097612,
        0.09210494, 0.43370117, 0.43086276, 0.4936851, 0.42583029,
        0.31226122, 0.42635131, 0.89338916, 0.94416002, 0.50183668,
        0.62395295, 0.1156184, 0.31728548, 0.41482621, 0.86630916,
        0.25045537, 0.48303426, 0.98555979, 0.51948512, 0.61289453,
    ]),
    'l2': np.array([
        0.12062867, 0.8263408, 0.60306013, 0.54506801, 0.34276383,
        0.30412079, 0.41702221, 0.68130077, 0.87545684, 0.51042234,
        0.66931378, 0.58593655, 0.6249035, 0.67468905, 0.84234244,
        0.08319499, 0.76368284, 0.24366637, 0.19422296, 0.57245696,
        0.09571252, 0.88532683, 0.62724897, 0.72341636, 0.01612921,
        0.59443188, 0.55678519, 0.15895964, 0.15307052, 0.69552953,
        0.31876643, 0.6919703, 0.55438325, 0.38895057, 0.92513249,
        0.84167, 0.35739757, 0.04359146, 0.30476807, 0.39818568,
        0.70495883, 0.99535848, 0.35591487, 0.76254781, 0.59317692,
        0.6917018, 0.15112745, 0.39887629, 0.2408559, 0.34345601,
    ]),
})

def create_predictors(y): # pragma: no cover
    """
    Return the last four values of `y`, most recent first.

    The result is a reversed slice of `y`, so it has the same type as the
    slice `y` produces. When `y` holds fewer than four values, all of them
    are returned in reverse order.
    """

    n_lags = 4

    # Walk backwards from the last element and stop after `n_lags` values.
    return y[-1:-(n_lags + 1):-1]

In [159]:
def test_results_output_bayesian_search_forecaster_multivariate_optuna_engine_ForecasterAutoregMultiVariate():
    """
    Check the results data frame returned by
    bayesian_search_forecaster_multivariate when it is run with the optuna
    engine on a ForecasterAutoregMultiVariate (expected values mocked in
    Skforecast v0.12.0).
    """
    steps = 3
    n_validation = 12

    forecaster = ForecasterAutoregMultiVariate(
        regressor=Ridge(random_state=123),
        level='l1',
        lags=2,
        steps=steps,
    )

    def search_space(trial):
        # `lags` candidates: a plain int, or a dict with one entry per series.
        return {
            'alpha': trial.suggest_float('alpha', 1e-2, 1.0),
            'lags': trial.suggest_categorical('lags', [2, {'l1': 4, 'l2': [2, 3]}]),
        }

    search_output = bayesian_search_forecaster_multivariate(
        forecaster=forecaster,
        series=series,
        steps=steps,
        search_space=search_space,
        metric='mean_absolute_error',
        refit=False,
        initial_train_size=len(series) - n_validation,
        n_trials=10,
        random_state=123,
        return_best=False,
        verbose=False,
        engine='optuna',
    )
    # Only the first element of the returned pair (the results frame) is checked.
    results = search_output[0]

    # The two `lags` candidates as they appear in the results frame.
    lags_per_series = {'l1': np.array([1, 2, 3, 4]), 'l2': np.array([2, 3])}
    lags_shared = np.array([1, 2])

    # (row label, lags, alpha, mean absolute error), lowest error first.
    expected_trials = [
        (9, lags_per_series, 0.23598059857016607, 0.19308110319514993),
        (3, lags_per_series, 0.398196343012209, 0.1931744420708601),
        (4, lags_per_series, 0.4441865222328282, 0.1932004954044704),
        (6, lags_per_series, 0.53623586010342, 0.19325210858832276),
        (8, lags_per_series, 0.7252189487445193, 0.19335589494249983),
        (1, lags_shared, 0.5558016213920624, 0.20131081099888368),
        (0, lags_shared, 0.6995044937418831, 0.2013710017368262),
        (5, lags_shared, 0.7406154516747153, 0.2013880862681147),
        (7, lags_shared, 0.8509374761370117, 0.20143363961627603),
        (2, lags_shared, 0.9809565564007693, 0.20148678375852938),
    ]

    expected_results = pd.DataFrame(
        np.array(
            [
                [['l1'], lags, {'alpha': alpha}, mae, alpha]
                for _, lags, alpha, mae in expected_trials
            ],
            dtype=object,
        ),
        columns=['levels', 'lags', 'params', 'mean_absolute_error', 'alpha'],
        index=pd.Index([label for label, *_ in expected_trials], dtype='int64'),
    ).astype({'mean_absolute_error': float, 'alpha': float})

    pd.testing.assert_frame_equal(results, expected_results)

test_results_output_bayesian_search_forecaster_multivariate_optuna_engine_ForecasterAutoregMultiVariate()



  0%|          | 0/10 [00:00<?, ?it/s]



In [149]:
# Bayesian search on a ForecasterAutoregMultiVariate, run at cell level so the
# resulting `results` frame can be inspected in the following cells.
# ==============================================================================
steps = 3
n_validation = 12

forecaster = ForecasterAutoregMultiVariate(
    regressor=Ridge(random_state=123),
    level='l1',
    lags=2,
    steps=steps,
)

def search_space(trial):
    """
    Return the candidate hyperparameters for one trial: `alpha` (float between
    0.01 and 1.0) and `lags` (an int, or a dict with one entry per series).
    """
    return {
        'alpha': trial.suggest_float('alpha', 1e-2, 1.0),
        'lags': trial.suggest_categorical('lags', [2, {'l1': 4, 'l2': [2, 3]}]),
    }

# The call returns a pair; only its first element (the results frame) is kept.
results = bayesian_search_forecaster_multivariate(
    forecaster=forecaster,
    series=series,
    steps=steps,
    search_space=search_space,
    metric='mean_absolute_error',
    refit=False,
    initial_train_size=len(series) - n_validation,
    n_trials=10,
    random_state=123,
    return_best=False,
    verbose=False,
    engine='optuna',
)[0]

results



  0%|          | 0/10 [00:00<?, ?it/s]



Unnamed: 0,levels,lags,params,mean_absolute_error,alpha
9,[l1],"{'l1': [1, 2, 3, 4], 'l2': [2, 3]}",{'alpha': 0.23598059857016607},0.193081,0.235981
3,[l1],"{'l1': [1, 2, 3, 4], 'l2': [2, 3]}",{'alpha': 0.398196343012209},0.193174,0.398196
4,[l1],"{'l1': [1, 2, 3, 4], 'l2': [2, 3]}",{'alpha': 0.4441865222328282},0.1932,0.444187
6,[l1],"{'l1': [1, 2, 3, 4], 'l2': [2, 3]}",{'alpha': 0.53623586010342},0.193252,0.536236
8,[l1],"{'l1': [1, 2, 3, 4], 'l2': [2, 3]}",{'alpha': 0.7252189487445193},0.193356,0.725219
1,[l1],"[1, 2]",{'alpha': 0.5558016213920624},0.201311,0.555802
0,[l1],"[1, 2]",{'alpha': 0.6995044937418831},0.201371,0.699504
5,[l1],"[1, 2]",{'alpha': 0.7406154516747153},0.201388,0.740615
7,[l1],"[1, 2]",{'alpha': 0.8509374761370117},0.201434,0.850937
2,[l1],"[1, 2]",{'alpha': 0.9809565564007693},0.201487,0.980957


In [150]:
# Show the results frame as a raw array to see the exact, unrounded cell values.
results.to_numpy()

array([[list(['l1']), {'l1': array([1, 2, 3, 4]), 'l2': array([2, 3])},
        {'alpha': 0.23598059857016607}, 0.19308110319514993,
        0.23598059857016607],
       [list(['l1']), {'l1': array([1, 2, 3, 4]), 'l2': array([2, 3])},
        {'alpha': 0.398196343012209}, 0.1931744420708601,
        0.398196343012209],
       [list(['l1']), {'l1': array([1, 2, 3, 4]), 'l2': array([2, 3])},
        {'alpha': 0.4441865222328282}, 0.1932004954044704,
        0.4441865222328282],
       [list(['l1']), {'l1': array([1, 2, 3, 4]), 'l2': array([2, 3])},
        {'alpha': 0.53623586010342}, 0.19325210858832276,
        0.53623586010342],
       [list(['l1']), {'l1': array([1, 2, 3, 4]), 'l2': array([2, 3])},
        {'alpha': 0.7252189487445193}, 0.19335589494249983,
        0.7252189487445193],
       [list(['l1']), array([1, 2]), {'alpha': 0.5558016213920624},
        0.20131081099888368, 0.5558016213920624],
       [list(['l1']), array([1, 2]), {'alpha': 0.6995044937418831},
        0.20137

In [151]:
# Column labels of the results frame.
results.columns

Index(['levels', 'lags', 'params', 'mean_absolute_error', 'alpha'], dtype='object')

In [152]:
# Row labels of the results frame, in the order the rows are displayed.
results.index

Index([9, 3, 4, 6, 8, 1, 0, 5, 7, 2], dtype='int64')

In [155]:
# Expected search results, written out by hand and compared with `results`.
# ==============================================================================
# Each tuple is (lags, alpha, mean absolute error); rows are ordered from the
# lowest to the highest error and every row has levels ['l1'].
expected_results = pd.DataFrame(
    np.array(
        [
            [['l1'], lags, {'alpha': alpha}, mae, alpha]
            for lags, alpha, mae in [
                ({'l1': np.array([1, 2, 3, 4]), 'l2': np.array([2, 3])},
                 0.23598059857016607, 0.19308110319514993),
                ({'l1': np.array([1, 2, 3, 4]), 'l2': np.array([2, 3])},
                 0.398196343012209, 0.1931744420708601),
                ({'l1': np.array([1, 2, 3, 4]), 'l2': np.array([2, 3])},
                 0.4441865222328282, 0.1932004954044704),
                ({'l1': np.array([1, 2, 3, 4]), 'l2': np.array([2, 3])},
                 0.53623586010342, 0.19325210858832276),
                ({'l1': np.array([1, 2, 3, 4]), 'l2': np.array([2, 3])},
                 0.7252189487445193, 0.19335589494249983),
                (np.array([1, 2]), 0.5558016213920624, 0.20131081099888368),
                (np.array([1, 2]), 0.6995044937418831, 0.2013710017368262),
                (np.array([1, 2]), 0.7406154516747153, 0.2013880862681147),
                (np.array([1, 2]), 0.8509374761370117, 0.20143363961627603),
                (np.array([1, 2]), 0.9809565564007693, 0.20148678375852938),
            ]
        ],
        dtype=object,
    ),
    columns=['levels', 'lags', 'params', 'mean_absolute_error', 'alpha'],
    index=pd.Index([9, 3, 4, 6, 8, 1, 0, 5, 7, 2], dtype='int64'),
).astype({'mean_absolute_error': float, 'alpha': float})

# The numeric columns are cast to float above so that the dtype check passes.
pd.testing.assert_frame_equal(results, expected_results, check_dtype=True)