In [15]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
%config Completer.use_jedi = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
(Path.cwd().parent)

PosixPath('/home/ximo/Documents/GitHub/skforecast')

In [17]:
from typing import Union, Tuple, Optional, Any
import numpy as np
import pandas as pd
import warnings
import logging
from copy import deepcopy
from tqdm import tqdm
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import ParameterSampler
from sklearn.exceptions import NotFittedError
import optuna
from optuna.samplers import TPESampler, RandomSampler
optuna.logging.set_verbosity(optuna.logging.WARNING) # disable optuna logs
from skopt.utils import use_named_args
from skopt import gp_minimize

logging.basicConfig(
    format = '%(name)-10s %(levelname)-5s %(message)s', 
    level  = logging.INFO,
)

In [18]:
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax
from pmdarima.arima import ARIMA

In [19]:
# Data download
# ==============================================================================
url = 'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/data/h2o_exog.csv'
data = pd.read_csv(url, sep=',')

# Data preparation
# ==============================================================================
data = data.rename(columns={'fecha': 'date'})
data['date'] = pd.to_datetime(data['date'], format='%Y/%m/%d')
data = data.set_index('date')
data = data.rename(columns={'x': 'y'})
data = data.asfreq('MS')
data = data.sort_index()
display(data.head())

# Split data into train-test
# ==============================================================================
steps = 36
data_train = data[:-steps]
data_test  = data[-steps:]

print(f"Train dates : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
print(f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")

Unnamed: 0_level_0,y,exog_1,exog_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1992-04-01,0.379808,0.958792,1.166029
1992-05-01,0.361801,0.951993,1.117859
1992-06-01,0.410534,0.952955,1.067942
1992-07-01,0.483389,0.958078,1.097376
1992-08-01,0.475463,0.95637,1.122199


Train dates : 1992-04-01 00:00:00 --- 2005-06-01 00:00:00  (n=159)
Test dates  : 2005-07-01 00:00:00 --- 2008-06-01 00:00:00  (n=36)


In [20]:
# pmdarima SARIMAX
# =====================================================================================
forecaster = ForecasterSarimax(
                regressor = ARIMA(order=(1,1,5), seasonal_order=(0, 0, 0, 0), trend=None, with_intercept=False, maxiter=1000)
            )
forecaster


ForecasterSarimax 
Regressor:  ARIMA(1,1,5)(0,0,0)[0]           
Window size: 5 
Transformer for y: None 
Transformer for exog: None 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: None 
Training index type: None 
Training index frequency: None 
Creation date: 2022-11-24 22:38:56 
Last fit date: None 
Skforecast version: 0.6.0 
Python version: 3.9.13 

In [21]:
metric, predictions = backtesting_sarimax(
    forecaster = forecaster,
    y = data['y'],
    steps=5,
    metric = 'mean_squared_error',
    initial_train_size=len(data_train),
    refit=False,
    verbose = True
)

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Information of backtesting process
----------------------------------
Number of observations used for initial training: 159
Number of observations used for backtesting: 36
    Number of folds: 8
    Number of steps per fold: 5
    Last fold only includes 1 observations.

Data partition in fold: 0
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-07-01 00:00:00 -- 2005-11-01 00:00:00  (n=5)
Data partition in fold: 1
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-12-01 00:00:00 -- 2006-04-01 00:00:00  (n=5)
Data partition in fold: 2
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-05-01 00:00:00 -- 2006-09-01 00:00:00  (n=5)
Data partition in fold: 3
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-10-01 00:00:00 -- 2007-02-01 00:00:00  (n=5)
Data partition in fold: 4
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n

In [23]:

param_grid = {'order': [(12, 0, 0), (12, 2, 0)],
             'seasonal_order': [(0, 0, 0, 0)],
             'trend': [None, 'n', 'c']}

grid_search_sarimax(
    forecaster = forecaster,
    y = data['y'],
    steps=5,
    param_grid = param_grid,
    metric = 'mean_squared_error',
    initial_train_size=len(data_train),
    refit=False,
    verbose = True
)

Number of models compared: 6.


  warn('Non-stationary starting autoregressive parameters'


Information of backtesting process
----------------------------------
Number of observations used for initial training: 159
Number of observations used for backtesting: 36
    Number of folds: 8
    Number of steps per fold: 5
    Last fold only includes 1 observations.

Data partition in fold: 0
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-07-01 00:00:00 -- 2005-11-01 00:00:00  (n=5)
Data partition in fold: 1
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-12-01 00:00:00 -- 2006-04-01 00:00:00  (n=5)
Data partition in fold: 2
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-05-01 00:00:00 -- 2006-09-01 00:00:00  (n=5)
Data partition in fold: 3
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-10-01 00:00:00 -- 2007-02-01 00:00:00  (n=5)
Data partition in fold: 4
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n

  warn('Non-stationary starting autoregressive parameters'
loop param_grid:  33%|████████████▋                         | 2/6 [00:14<00:30,  7.52s/it]

Information of backtesting process
----------------------------------
Number of observations used for initial training: 159
Number of observations used for backtesting: 36
    Number of folds: 8
    Number of steps per fold: 5
    Last fold only includes 1 observations.

Data partition in fold: 0
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-07-01 00:00:00 -- 2005-11-01 00:00:00  (n=5)
Data partition in fold: 1
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-12-01 00:00:00 -- 2006-04-01 00:00:00  (n=5)
Data partition in fold: 2
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-05-01 00:00:00 -- 2006-09-01 00:00:00  (n=5)
Data partition in fold: 3
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-10-01 00:00:00 -- 2007-02-01 00:00:00  (n=5)
Data partition in fold: 4
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n

  warn('Non-stationary starting autoregressive parameters'
loop param_grid:  50%|███████████████████                   | 3/6 [00:22<00:22,  7.58s/it]

Information of backtesting process
----------------------------------
Number of observations used for initial training: 159
Number of observations used for backtesting: 36
    Number of folds: 8
    Number of steps per fold: 5
    Last fold only includes 1 observations.

Data partition in fold: 0
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-07-01 00:00:00 -- 2005-11-01 00:00:00  (n=5)
Data partition in fold: 1
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-12-01 00:00:00 -- 2006-04-01 00:00:00  (n=5)
Data partition in fold: 2
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-05-01 00:00:00 -- 2006-09-01 00:00:00  (n=5)
Data partition in fold: 3
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-10-01 00:00:00 -- 2007-02-01 00:00:00  (n=5)
Data partition in fold: 4
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n

  warn('Non-stationary starting autoregressive parameters'


Information of backtesting process
----------------------------------
Number of observations used for initial training: 159
Number of observations used for backtesting: 36
    Number of folds: 8
    Number of steps per fold: 5
    Last fold only includes 1 observations.

Data partition in fold: 0
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-07-01 00:00:00 -- 2005-11-01 00:00:00  (n=5)
Data partition in fold: 1
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-12-01 00:00:00 -- 2006-04-01 00:00:00  (n=5)
Data partition in fold: 2
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-05-01 00:00:00 -- 2006-09-01 00:00:00  (n=5)
Data partition in fold: 3
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-10-01 00:00:00 -- 2007-02-01 00:00:00  (n=5)
Data partition in fold: 4
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n

  warn('Non-stationary starting autoregressive parameters'
loop param_grid:  83%|███████████████████████████████▋      | 5/6 [00:35<00:06,  6.88s/it]

Information of backtesting process
----------------------------------
Number of observations used for initial training: 159
Number of observations used for backtesting: 36
    Number of folds: 8
    Number of steps per fold: 5
    Last fold only includes 1 observations.

Data partition in fold: 0
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-07-01 00:00:00 -- 2005-11-01 00:00:00  (n=5)
Data partition in fold: 1
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-12-01 00:00:00 -- 2006-04-01 00:00:00  (n=5)
Data partition in fold: 2
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-05-01 00:00:00 -- 2006-09-01 00:00:00  (n=5)
Data partition in fold: 3
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-10-01 00:00:00 -- 2007-02-01 00:00:00  (n=5)
Data partition in fold: 4
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n

  warn('Non-stationary starting autoregressive parameters'
loop param_grid: 100%|██████████████████████████████████████| 6/6 [00:43<00:00,  7.26s/it]

Information of backtesting process
----------------------------------
Number of observations used for initial training: 159
Number of observations used for backtesting: 36
    Number of folds: 8
    Number of steps per fold: 5
    Last fold only includes 1 observations.

Data partition in fold: 0
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-07-01 00:00:00 -- 2005-11-01 00:00:00  (n=5)
Data partition in fold: 1
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2005-12-01 00:00:00 -- 2006-04-01 00:00:00  (n=5)
Data partition in fold: 2
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-05-01 00:00:00 -- 2006-09-01 00:00:00  (n=5)
Data partition in fold: 3
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n=159)
    Validation: 2006-10-01 00:00:00 -- 2007-02-01 00:00:00  (n=5)
Data partition in fold: 4
    Training:   1992-04-01 00:00:00 -- 2005-06-01 00:00:00  (n


  warn('Non-stationary starting autoregressive parameters'


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Parameters: {'order': (12, 0, 0), 'seasonal_order': (0, 0, 0, 0), 'trend': 'c'}
  Backtesting metric: 0.05178852265752502



Unnamed: 0,params,mean_squared_error,order,seasonal_order,trend
2,"{'order': (12, 0, 0), 'seasonal_order': (0, 0,...",0.051789,"(12, 0, 0)","(0, 0, 0, 0)",c
0,"{'order': (12, 0, 0), 'seasonal_order': (0, 0,...",0.059239,"(12, 0, 0)","(0, 0, 0, 0)",
1,"{'order': (12, 0, 0), 'seasonal_order': (0, 0,...",0.059239,"(12, 0, 0)","(0, 0, 0, 0)",n
5,"{'order': (12, 2, 0), 'seasonal_order': (0, 0,...",0.247927,"(12, 2, 0)","(0, 0, 0, 0)",c
3,"{'order': (12, 2, 0), 'seasonal_order': (0, 0,...",0.247932,"(12, 2, 0)","(0, 0, 0, 0)",
4,"{'order': (12, 2, 0), 'seasonal_order': (0, 0,...",0.247932,"(12, 2, 0)","(0, 0, 0, 0)",n
