In [10]:
# Notebook setup
# ==============================================================================
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Put the parent of the working directory on the import path (position 1).
# NOTE(review): presumably so that the local skforecast checkout is the one
# imported by the cells below — confirm.
sys.path.insert(1, str(Path.cwd().parent))
# Echo the directory that was just added to the import path.
str(Path.cwd().parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'/home/ubuntu/varios/skforecast'

In [11]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries
from skforecast.model_selection_multiseries import random_search_forecaster_multiseries
from skforecast.model_selection_multiseries import bayesian_search_forecaster_multiseries

In [12]:
# Data download
# ==============================================================================
url = (
    'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/'
    'data/guangyuan_air_pollution.csv'
)
data = pd.read_csv(url, sep=',')

# Data preparation
# ==============================================================================
# Parse the date column, use it as a daily-frequency index and keep only the
# three series used in the rest of the notebook.
data = (
    data
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d'))
    .set_index('date')
    .asfreq('D')
    .sort_index()
    .loc[:, ['CO', 'SO2', 'PM2.5']]
)
data.head()

Unnamed: 0_level_0,CO,SO2,PM2.5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-03-01,9600.0,204.0,181.0
2013-03-02,20198.0,674.0,633.0
2013-03-03,47195.0,1661.0,1956.0
2013-03-04,15000.0,485.0,438.0
2013-03-05,59594.0,2001.0,3388.0


In [13]:
# Split data into train-test
# ==============================================================================
# The cut-off timestamp falls between two daily observations, so the two
# inclusive label slices below do not share any row.
end_train = '2016-05-31 23:59:00'
data_train = data.loc[:end_train].copy()
data_test = data.loc[end_train:].copy()

# Report the date span and the number of rows of each partition.
print(
    f"Train dates : {data_train.index.min()} --- "
    f"{data_train.index.max()}(n={len(data_train)})"
)
print(
    f"Test dates  : {data_test.index.min()} --- "
    f"{data_test.index.max()}(n={len(data_test)})"
)

Train dates : 2013-03-01 00:00:00 --- 2016-05-31 00:00:00(n=1188)
Test dates  : 2016-06-01 00:00:00 --- 2017-02-28 00:00:00(n=273)


In [14]:

# Create and fit forecaster MultiVariate
# ==============================================================================
# Ridge regressor targeting the 'CO' series, with 7 lags, 7 steps and the
# series passed through a StandardScaler.
forecaster = ForecasterAutoregMultiVariate(
    regressor=Ridge(random_state=123),
    level='CO',
    lags=7,
    steps=7,
    transformer_series=StandardScaler(),
    transformer_exog=None,
    weight_func=None,
    n_jobs='auto',
)

# Fit on the training partition only, then display the forecaster summary.
forecaster.fit(series=data_train)
forecaster

ForecasterAutoregMultiVariate 
Regressor: Ridge(random_state=123) 
Lags: [1 2 3 4 5 6 7] 
Transformer for series: StandardScaler() 
Transformer for exog: None 
Weight function included: False 
Window size: 7 
Target series, level: CO 
Multivariate series (names): ['CO', 'SO2', 'PM2.5'] 
Maximum steps predicted: 7 
Exogenous included: False 
Exogenous variables names: None 
Training range: [Timestamp('2013-03-01 00:00:00'), Timestamp('2016-05-31 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-19 15:43:55 
Last fit date: 2024-07-19 15:43:55 
Skforecast version: 0.13.0 
Python version: 3.11.9 
Forecaster id: None 

In [17]:
# Backtesting MultiVariate
# ==============================================================================
# The initial training size equals the length of the training partition, each
# fold predicts 7 steps, and the forecaster is not refitted between folds.
metrics_levels, backtest_predictions = backtesting_forecaster_multiseries(
    forecaster=forecaster,
    series=data,
    steps=7,
    metric='mean_absolute_error',
    initial_train_size=len(data_train),
    fixed_train_size=False,
    gap=0,
    allow_incomplete_fold=True,
    refit=False,
    n_jobs='auto',
    verbose=False,
    show_progress=True,
)

# Show the metric table first, then the first rows of the predictions.
print("Backtest metrics")
display(metrics_levels)
print()
print("Backtest predictions")
backtest_predictions.head(4)

  0%|          | 0/39 [00:00<?, ?it/s]

Backtest metrics


Unnamed: 0,levels,mean_absolute_error
0,CO,14933.429818



Backtest predictions


Unnamed: 0,CO
2016-06-01,20240.56993
2016-06-02,23299.549916
2016-06-03,22486.173088
2016-06-04,23116.36606


In [9]:

# Create forecaster MultiVariate
# ==============================================================================
# Random forest regressor targeting the 'CO' series; the forecaster is only
# created here, not fitted.
forecaster = ForecasterAutoregMultiVariate(
    regressor=RandomForestRegressor(random_state=123),
    level='CO',
    lags=7,
    steps=7,
    transformer_series=StandardScaler(),
    transformer_exog=None,
    weight_func=None,
)
# Random search MultiVariate
# ==============================================================================
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: 3 columns passed, passed data had 0 columns". The cause is not
# visible from this notebook — TODO investigate before relying on `results`.

# Candidate lag configurations and the value ranges sampled for each
# regressor hyperparameter.
lags_grid = [7, 14]
param_distributions = {
    'n_estimators': np.arange(10, 20, dtype=int),
    'max_depth': np.arange(3, 6, dtype=int),
}

# 5 sampled parameter sets per lag configuration, scored with the same
# backtesting settings as the backtest above.
results = random_search_forecaster_multiseries(
    forecaster=forecaster,
    series=data,
    exog=None,
    lags_grid=lags_grid,
    param_distributions=param_distributions,
    steps=7,
    metric='mean_absolute_error',
    initial_train_size=len(data_train),
    fixed_train_size=False,
    gap=0,
    allow_incomplete_fold=True,
    refit=False,
    n_iter=5,
    return_best=False,
    n_jobs='auto',
    verbose=False,
    show_progress=True,
)

10 models compared for 1 level(s). Number of iterations: 10.


lags grid:   0%|          | 0/2 [00:00<?, ?it/s]

params grid:   0%|          | 0/5 [00:00<?, ?it/s]

ValueError: 3 columns passed, passed data had 0 columns