In [2]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
%config Completer.use_jedi = False
print(str(Path.cwd().parent))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/ubuntu/varios/skforecast


## Libraries

In [3]:
# Bayesian search forecaster
# ==============================================================================
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from skforecast.model_selection.model_selection import bayesian_search_forecaster
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom

In [4]:
# Download data
# ==============================================================================
url = (
    'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/data/h2o.csv'
)
data = pd.read_csv(url, sep=',', header=0, names=['y', 'datetime'])

# Data preprocessing
# ==============================================================================
# Parse the dates, index by them, set a month-start ('MS') frequency, keep only
# the target column and make sure the index is sorted.
data = (
    data
    .assign(datetime=lambda df: pd.to_datetime(df['datetime'], format='%Y/%m/%d'))
    .set_index('datetime')
    .asfreq('MS')
    .loc[:, ['y']]
    .sort_index()
)

# Train-val-test dates
# ==============================================================================
# The cut-offs fall late on the first day of the month, so the month-start
# observation of that date belongs to the earlier partition and the next
# partition starts with the following month.
end_train = '2001-01-01 23:59:00'
end_val = '2006-01-01 23:59:00'

print(
    f"Train dates      : {data.index.min()} --- {data.loc[:end_train].index.max()}"
    f"  (n={len(data.loc[:end_train])})"
)
print(
    f"Validation dates : {data.loc[end_train:].index.min()} --- {data.loc[:end_val].index.max()}"
    f"  (n={len(data.loc[end_train:end_val])})"
)
print(
    f"Test dates       : {data.loc[end_val:].index.min()} --- {data.index.max()}"
    f"  (n={len(data.loc[end_val:])})"
)


Train dates      : 1991-07-01 00:00:00 --- 2001-01-01 00:00:00  (n=115)
Validation dates : 2001-02-01 00:00:00 --- 2006-01-01 00:00:00  (n=60)
Test dates       : 2006-02-01 00:00:00 --- 2008-06-01 00:00:00  (n=29)


In [5]:
from typing import Union, Tuple, Optional, Any
import numpy as np
import pandas as pd
import warnings
import logging
from copy import deepcopy
from tqdm import tqdm
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import ParameterSampler
import optuna
from optuna.samplers import TPESampler, RandomSampler
optuna.logging.set_verbosity(optuna.logging.WARNING) # disable optuna logs
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.space import Categorical, Real, Integer

from skforecast.model_selection import backtesting_forecaster
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.ForecasterAutoregMultiOutput import ForecasterAutoregMultiOutput
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries




In [8]:
# Bayesian search hyperparameter and lags with skopt
# ==============================================================================
forecaster = ForecasterAutoreg(
    regressor=RandomForestRegressor(random_state=123),
    lags=10,  # Placeholder, the value will be overwritten
)

# Lags used as predictors
lags_grid = [5, 3]

# Regressor hyperparameters search space
search_space = {
    'n_estimators': Integer(10, 20, "uniform", name='n_estimators'),
    'min_samples_leaf': Real(1., 3.7, "log-uniform", name='min_samples_leaf'),
    'max_features': Categorical(['log2', 'sqrt'], name='max_features'),
}

# Keyword arguments shared by both searches; only `metric` differs between them.
search_kwargs = dict(
    forecaster=forecaster,
    y=data.loc[:end_val, 'y'],
    lags_grid=lags_grid,
    search_space=search_space,
    steps=12,
    refit=True,
    initial_train_size=len(data.loc[:end_train]),
    fixed_train_size=True,
    n_trials=10,
    random_state=123,
    return_best=False,
    verbose=False,
    engine='skopt',
    kwargs_create_study={},
    kwargs_study_optimize={},
)

# Search 1: metric passed as a single string.
results_1, frozen_trial = bayesian_search_forecaster(
    metric='mean_squared_error',
    **search_kwargs,
)

# Search 2: same configuration, metric passed as a one-element list.
results_2, frozen_trial = bayesian_search_forecaster(
    metric=['mean_squared_error'],
    **search_kwargs,
)

results_1

Number of models compared: 20,
         10 bayesian search in each lag configuration.


loop lags_grid: 100%|███████████████████████████████████████| 2/2 [00:05<00:00,  2.88s/it]


Number of models compared: 20,
         10 bayesian search in each lag configuration.


loop lags_grid: 100%|███████████████████████████████████████| 2/2 [00:05<00:00,  2.76s/it]


Unnamed: 0,lags,params,mean_squared_error,n_estimators,min_samples_leaf,max_features
19,"[1, 2, 3]","{'n_estimators': 12, 'min_samples_leaf': 3.642...",0.068957,12,3.642341,sqrt
14,"[1, 2, 3]","{'n_estimators': 12, 'min_samples_leaf': 2.481...",0.068957,12,2.481767,sqrt
13,"[1, 2, 3]","{'n_estimators': 14, 'min_samples_leaf': 2.272...",0.069073,14,2.272179,log2
12,"[1, 2, 3]","{'n_estimators': 14, 'min_samples_leaf': 2.134...",0.069073,14,2.134928,log2
18,"[1, 2, 3]","{'n_estimators': 14, 'min_samples_leaf': 2.355...",0.069073,14,2.355124,log2
16,"[1, 2, 3]","{'n_estimators': 17, 'min_samples_leaf': 1.750...",0.069218,17,1.750301,log2
11,"[1, 2, 3]","{'n_estimators': 17, 'min_samples_leaf': 1.901...",0.069218,17,1.901317,sqrt
10,"[1, 2, 3]","{'n_estimators': 17, 'min_samples_leaf': 1.751...",0.069218,17,1.751693,sqrt
17,"[1, 2, 3]","{'n_estimators': 15, 'min_samples_leaf': 2.634...",0.06943,15,2.634133,log2
15,"[1, 2, 3]","{'n_estimators': 16, 'min_samples_leaf': 1.778...",0.069501,16,1.778914,log2


In [13]:
# Compare the two result tables: pair up rows with identical lags and
# hyperparameters, then display every pair whose metric value differs.
cols_to_sort = ['lags', 'n_estimators', 'min_samples_leaf', 'max_features']
cols = cols_to_sort + ['metric']

# Normalise both tables the same way: common metric column name, lags as
# strings (so they can be sorted and compared), fixed column order, rows
# sorted by configuration and a fresh 0..n-1 index.
results_1 = (
    results_1
    .rename(columns={'mean_squared_error': 'metric'})
    .assign(lags=lambda df: df['lags'].astype(str))
    .loc[:, cols]
    .sort_values(by=cols_to_sort)
    .reset_index(drop=True)
)
results_2 = (
    results_2
    .rename(columns={'mean_squared_error': 'metric'})
    .assign(lags=lambda df: df['lags'].astype(str))
    .loc[:, cols]
    .sort_values(by=cols_to_sort)
    .reset_index(drop=True)
)

# Keep only the positions where both tables hold the same configuration.
equal_hiperparameters = (results_1[cols_to_sort] == results_2[cols_to_sort]).all(axis=1)
results_1 = results_1.loc[equal_hiperparameters]
results_2 = results_2.loc[equal_hiperparameters]

# Rows with the same configuration but a different metric value.
no_match = results_1['metric'] != results_2['metric']
display(results_1.loc[no_match])
display(results_2.loc[no_match])

Unnamed: 0,lags,n_estimators,min_samples_leaf,max_features,metric


Unnamed: 0,lags,n_estimators,min_samples_leaf,max_features,metric


# Old implementation of `_bayesian_search_skopt`

In [14]:
def _bayesian_search_skopt(
    forecaster,
    y: pd.Series,
    search_space: dict,
    steps: int,
    metric: Union[str, callable],
    initial_train_size: int,
    fixed_train_size: bool=True,
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None,
    lags_grid: Optional[list]=None,
    refit: bool=False,
    n_trials: int=10,
    random_state: int=123,
    return_best: bool=True,
    verbose: bool=True,
    kwargs_gp_minimize: dict={}
) -> Tuple[pd.DataFrame, object]:
    """
    Bayesian optimization for a Forecaster object using time series backtesting
    and the skopt library.

    For every lag configuration in `lags_grid`, `skopt.gp_minimize` is run for
    `n_trials` evaluations. Each evaluation sets the sampled parameters on the
    forecaster and scores it with `backtesting_forecaster`.

    Parameters
    ----------
    forecaster : ForecasterAutoreg, ForecasterAutoregCustom, ForecasterAutoregDirect,
    ForecasterAutoregMultiOutput
        Forecaster model. It is modified in place during the search (lags and
        parameters are set on it for every evaluation).

    y : pandas Series
        Training time series values.

    search_space : dict
        Dictionary with parameters names (`str`) as keys and Space object from skopt
        (Real, Integer, Categorical) as values. Each key must be equal to the
        `name` of its Space object.

    steps : int
        Number of steps to predict.

    metric : str, callable
        Metric used to quantify the goodness of fit of the model.

        If string:
            {'mean_squared_error', 'mean_absolute_error',
             'mean_absolute_percentage_error', 'mean_squared_log_error'}

        If callable:
            Function with arguments y_true, y_pred that returns a float.

    initial_train_size : int
        Number of samples in the initial train split.

    fixed_train_size : bool, default `True`
        If True, train size doesn't increase but moves by `steps` in each iteration.

    exog : pandas Series, pandas DataFrame, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and should be aligned so that y[i] is
        regressed on exog[i].

    lags_grid : list of int, lists, np.ndarray or range, default `None`
        Lists of `lags` to try. Only used if forecaster is an instance of
        `ForecasterAutoreg`, `ForecasterAutoregDirect` or `ForecasterAutoregMultiOutput`.
        If `None`, the current `forecaster.lags` is the only configuration tried.

    refit : bool, default `False`
        Whether to re-fit the forecaster in each iteration of backtesting.

    n_trials : int, default `10`
        Number of parameter settings that are sampled in each lag configuration.

    random_state : int, default `123`
        Sets a seed to the sampling for reproducible output.

    return_best : bool, default `True`
        Refit the `forecaster` using the best found parameters on the whole data.

    verbose : bool, default `True`
        Print number of folds used for cv or backtesting.

    kwargs_gp_minimize : dict, default `{}`
        Other keyword arguments (key, value mappings) to pass to skopt.gp_minimize().
        The dictionary is only unpacked, never modified.

    Returns
    -------
    results : pandas DataFrame
        Results for each combination of parameters, sorted by ascending metric.
            column lags = lags used in the evaluation ('custom predictors' for
            `ForecasterAutoregCustom`).
            column params = dictionary with the parameters evaluated.
            column metric = metric value estimated for the combination of parameters.
            additional n columns with param = value.

    results_opt_best : scipy object
        The best optimization result returned as a OptimizeResult object
        (the `gp_minimize` result with the lowest `fun` among all lag
        configurations, the first one in case of ties). `None` if `lags_grid`
        is empty.

    Raises
    ------
    Exception
        If a key of `search_space` differs from the `name` of its Space object.

    """

    if isinstance(forecaster, ForecasterAutoregCustom):
        if lags_grid is not None:
            warnings.warn(
                '`lags_grid` ignored if forecaster is an instance of `ForecasterAutoregCustom`.'
            )
        lags_grid = ['custom predictors']

    elif lags_grid is None:
        lags_grid = [forecaster.lags]

    # `use_named_args` passes values to the objective by dimension name, so
    # the dictionary keys must agree with those names.
    for key, dimension in search_space.items():
        if key != dimension.name:
            raise Exception(
                f"""Some of the key values do not match the Space object name from skopt.
                    {key} != {dimension.name}."""
            )

    dimensions = list(search_space.values())

    # Objective function using backtesting_forecaster.
    # It only accepts `**params` and reads the backtesting configuration from
    # the enclosing scope. `use_named_args` calls it with one keyword per
    # dimension name; if the objective also declared `metric`, `verbose`,
    # `steps`, ... as keyword parameters, a searched hyperparameter with one of
    # those names would silently override the backtesting setting instead of
    # being set on the forecaster.
    @use_named_args(dimensions)
    def _objective(**params) -> float:

        forecaster.set_params(**params)

        metric_value, _ = backtesting_forecaster(
                              forecaster         = forecaster,
                              y                  = y,
                              exog               = exog,
                              steps              = steps,
                              metric             = metric,
                              initial_train_size = initial_train_size,
                              fixed_train_size   = fixed_train_size,
                              refit              = refit,
                              verbose            = verbose
                          )

        return abs(metric_value)

    print(
        f"""Number of models compared: {n_trials*len(lags_grid)}, {n_trials} bayesian search in each lag configuration."""
    )

    forecasters_with_lags = (
        ForecasterAutoreg,
        ForecasterAutoregDirect,
        ForecasterAutoregMultiOutput
    )

    lags_list = []
    params_list = []
    metric_list = []
    results_opt_best = None

    for lags in tqdm(lags_grid, desc='loop lags_grid', position=0, ncols=90):

        if isinstance(forecaster, forecasters_with_lags):
            forecaster.set_lags(lags)
            # Store the lags as kept by the forecaster after `set_lags`;
            # copied so later `set_lags` calls cannot alter the stored value.
            lags = forecaster.lags.copy()

        results_opt = gp_minimize(
                        func         = _objective,
                        dimensions   = dimensions,
                        n_calls      = n_trials,
                        random_state = random_state,
                        **kwargs_gp_minimize
                      )

        # One record per evaluated point: pair every sampled value with the
        # name of its dimension.
        for x_iter, func_val in zip(results_opt.x_iters, results_opt.func_vals):
            params_list.append(
                {dimension.name: value for dimension, value in zip(dimensions, x_iter)}
            )
            lags_list.append(lags)
            metric_list.append(func_val)

        # Strict `<` keeps the earliest lag configuration in case of ties.
        if results_opt_best is None or results_opt.fun < results_opt_best.fun:
            results_opt_best = results_opt

    results = pd.DataFrame({
                'lags'  : lags_list,
                'params': params_list,
                'metric': metric_list
              })

    results = results.sort_values(by='metric', ascending=True)
    # Expand the params dictionaries into one column per parameter.
    results = pd.concat([results, results['params'].apply(pd.Series)], axis=1)

    if return_best:

        best_lags = results['lags'].iloc[0]
        best_params = results['params'].iloc[0]
        best_metric = results['metric'].iloc[0]

        if isinstance(forecaster, forecasters_with_lags):
            forecaster.set_lags(best_lags)
        forecaster.set_params(**best_params)
        forecaster.fit(y=y, exog=exog)

        print(
            f"`Forecaster` refitted using the best-found lags and parameters, and the whole data set: \n"
            f"  Lags: {best_lags} \n"
            f"  Parameters: {best_params}\n"
            f"  Backtesting metric: {best_metric}\n"
        )

    return results, results_opt_best

In [15]:
# Bayesian search hyperparameter and lags with skopt
# ==============================================================================
forecaster = ForecasterAutoreg(
    regressor=RandomForestRegressor(random_state=123),
    lags=10,  # Placeholder, the value will be overwritten
)

# Lags used as predictors
lags_grid = [5, 3]

# Regressor hyperparameters search space
search_space = {
    'n_estimators': Integer(10, 20, "uniform", name='n_estimators'),
    'min_samples_leaf': Real(1., 3.7, "log-uniform", name='min_samples_leaf'),
    'max_features': Categorical(['log2', 'sqrt'], name='max_features'),
}

# Search configuration: 12-step backtesting with refit and a fixed-size
# training window, 10 trials per lag configuration, skopt engine.
search_kwargs = dict(
    forecaster=forecaster,
    y=data.loc[:end_val, 'y'],
    lags_grid=lags_grid,
    search_space=search_space,
    steps=12,
    metric='mean_squared_error',
    refit=True,
    initial_train_size=len(data.loc[:end_train]),
    fixed_train_size=True,
    n_trials=10,
    random_state=123,
    return_best=False,
    verbose=False,
    engine='skopt',
    kwargs_create_study={},
    kwargs_study_optimize={},
)

results_1, frozen_trial = bayesian_search_forecaster(**search_kwargs)

Number of models compared: 20,
         10 bayesian search in each lag configuration.


loop lags_grid: 100%|███████████████████████████████████████| 2/2 [00:05<00:00,  2.93s/it]


In [16]:
# Display the results table returned by the skopt search run in the previous cell.
results_1

Unnamed: 0,lags,params,mean_squared_error,n_estimators,min_samples_leaf,max_features
19,"[1, 2, 3]","{'n_estimators': 12, 'min_samples_leaf': 3.642...",0.068957,12,3.642341,sqrt
14,"[1, 2, 3]","{'n_estimators': 12, 'min_samples_leaf': 2.481...",0.068957,12,2.481767,sqrt
13,"[1, 2, 3]","{'n_estimators': 14, 'min_samples_leaf': 2.272...",0.069073,14,2.272179,log2
12,"[1, 2, 3]","{'n_estimators': 14, 'min_samples_leaf': 2.134...",0.069073,14,2.134928,log2
18,"[1, 2, 3]","{'n_estimators': 14, 'min_samples_leaf': 2.355...",0.069073,14,2.355124,log2
16,"[1, 2, 3]","{'n_estimators': 17, 'min_samples_leaf': 1.750...",0.069218,17,1.750301,log2
11,"[1, 2, 3]","{'n_estimators': 17, 'min_samples_leaf': 1.901...",0.069218,17,1.901317,sqrt
10,"[1, 2, 3]","{'n_estimators': 17, 'min_samples_leaf': 1.751...",0.069218,17,1.751693,sqrt
17,"[1, 2, 3]","{'n_estimators': 15, 'min_samples_leaf': 2.634...",0.06943,15,2.634133,log2
15,"[1, 2, 3]","{'n_estimators': 16, 'min_samples_leaf': 1.778...",0.069501,16,1.778914,log2
