In [1]:
# Development setup: load IPython's autoreload extension in mode 2 and put the
# parent of the current working directory on the import path.
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Inserted at position 1 (not 0), so the first sys.path entry keeps priority.
# NOTE(review): presumably the parent directory is the repository root that
# holds the local `skforecast` package - confirm before relying on it.
sys.path.insert(1, str(Path.cwd().parent))
# Echo the inserted path as the cell output.
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [2]:
from typing import Union, Dict, List, Tuple, Any, Optional
import warnings
import logging
import sys
import numpy as np
import pandas as pd
from pytest import approx
from sklearn.base import clone
import pmdarima
from pmdarima.arima import ARIMA

import skforecast
from skforecast.utils import initialize_lags
from skforecast.utils import check_y
from skforecast.utils import check_exog
from skforecast.utils import preprocess_y
from skforecast.utils import preprocess_last_window
from skforecast.utils import preprocess_exog
from skforecast.utils import expand_index
from skforecast.utils import check_predict_input
from skforecast.utils import transform_series
from skforecast.utils import transform_dataframe
from skforecast.model_selection_sarimax import backtesting_sarimax
# Root logger configuration: INFO level; each record shows the logger name and
# the level name left-aligned in fixed-width columns, followed by the message.
logging.basicConfig(level=logging.INFO, format='%(name)-10s %(levelname)-5s %(message)s')

In [3]:
from skforecast.ForecasterSarimax import ForecasterSarimax
from sklearn.linear_model import Ridge
import re
import pytest
from pmdarima.arima import ARIMA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Target series fixture: 50 hard-coded float values named 'y'. No index is
# passed, so pandas assigns its default integer index.
y = pd.Series(
        data = np.array([0.69646919, 0.28613933, 0.22685145, 0.55131477, 0.71946897,
                         0.42310646, 0.9807642 , 0.68482974, 0.4809319 , 0.39211752,
                         0.34317802, 0.72904971, 0.43857224, 0.0596779 , 0.39804426,
                         0.73799541, 0.18249173, 0.17545176, 0.53155137, 0.53182759,
                         0.63440096, 0.84943179, 0.72445532, 0.61102351, 0.72244338,
                         0.32295891, 0.36178866, 0.22826323, 0.29371405, 0.63097612,
                         0.09210494, 0.43370117, 0.43086276, 0.4936851 , 0.42583029,
                         0.31226122, 0.42635131, 0.89338916, 0.94416002, 0.50183668,
                         0.62395295, 0.1156184 , 0.31728548, 0.41482621, 0.86630916,
                         0.25045537, 0.48303426, 0.98555979, 0.51948512, 0.61289453]
            ),
        name = 'y'
    )

# Exogenous series fixture: 50 hard-coded float values named 'exog', also on
# the default integer index.
exog = pd.Series(
           data = np.array([0.12062867, 0.8263408 , 0.60306013, 0.54506801, 0.34276383,
                            0.30412079, 0.41702221, 0.68130077, 0.87545684, 0.51042234,
                            0.66931378, 0.58593655, 0.6249035 , 0.67468905, 0.84234244,
                            0.08319499, 0.76368284, 0.24366637, 0.19422296, 0.57245696,
                            0.09571252, 0.88532683, 0.62724897, 0.72341636, 0.01612921,
                            0.59443188, 0.55678519, 0.15895964, 0.15307052, 0.69552953,
                            0.31876643, 0.6919703 , 0.55438325, 0.38895057, 0.92513249,
                            0.84167   , 0.35739757, 0.04359146, 0.30476807, 0.39818568,
                            0.70495883, 0.99535848, 0.35591487, 0.76254781, 0.59317692,
                            0.6917018 , 0.15112745, 0.39887629, 0.2408559 , 0.34345601]
               ),
           name = 'exog'
       )

# Exogenous values for the prediction horizon: 10 values (the same numbers as
# the first 10 of `exog`) indexed 50..59, i.e. right after the 50 positions
# used by `y` and `exog`.
exog_predict = pd.Series(
                  data = np.array([0.12062867, 0.8263408 , 0.60306013, 0.54506801, 0.34276383,
                                   0.30412079, 0.41702221, 0.68130077, 0.87545684, 0.51042234]
                      ),
                  name = 'exog',
                  index = pd.RangeIndex(start=50, stop=60)
               )

# Two-column exogenous frame: 'exog_1' reuses the numeric values of `exog`,
# 'exog_2' is a string column with 25 'a' followed by 25 'b'.
df_exog = pd.DataFrame({
              'exog_1': exog.values,
              'exog_2': ['a']*25+['b']*25}
          )
# Same 50 rows re-labelled 50..99 to act as exogenous data for prediction.
df_exog_predict = df_exog.copy()
df_exog_predict.index = pd.RangeIndex(start=50, stop=100)

# Datetime-indexed variants of the integer-indexed fixtures. Three consecutive
# windows are used, all with freq='A':
#   start='2000' -> data used for fitting
#   start='2050' -> "last window" data (prefix lw_)
#   start='2100' -> exogenous data for the prediction horizon
# NOTE(review): recent pandas releases deprecate the 'A' alias in favour of
# 'YE' - check the pinned pandas version before changing it.
y_datetime = pd.Series(
    data  = y.values,
    index = pd.date_range(start='2000', periods=50, freq='A'),
    name  = 'y'
)

lw_datetime = pd.Series(
    data  = y.values,
    index = pd.date_range(start='2050', periods=50, freq='A'),
    name  = 'y'
)

exog_datetime = pd.Series(
    data  = exog.values,
    index = pd.date_range(start='2000', periods=50, freq='A'),
    name  = 'exog'
)

lw_exog_datetime = pd.Series(
    data  = exog.values,
    index = pd.date_range(start='2050', periods=50, freq='A'),
    name  = 'exog'
)

exog_predict_datetime = pd.Series(
    data  = exog_predict.values,
    index = pd.date_range(start='2100', periods=10, freq='A'),
    name  = 'exog'
)

# DataFrame variants: independent copies of `df_exog` carrying a new index.
df_exog_datetime = df_exog.set_axis(
    pd.date_range(start='2000', periods=50, freq='A'), axis=0
)

df_lw_exog_datetime = df_exog.set_axis(
    pd.date_range(start='2050', periods=50, freq='A'), axis=0
)

df_exog_predict_datetime = df_exog.set_axis(
    pd.date_range(start='2100', periods=50, freq='A'), axis=0
)

In [6]:
# ARIMA(1,1,1) without exogenous variables: fit on the series starting in 2000,
# then request 5 steps with the [2.5, 97.5] interval, supplying a separate
# last window (the series starting in 2050). `alpha` is explicitly disabled.
forecaster = ForecasterSarimax(
    regressor = ARIMA(order=(1,1,1), maxiter=1000)
)
forecaster.fit(y=y_datetime)
predictions = forecaster.predict_interval(
    steps       = 5,
    last_window = lw_datetime,
    alpha       = None,
    interval    = [2.5, 97.5],
)
# Show the raw array so the numbers can be copied into expected values.
predictions.values

array([[0.5493021 , 0.0918756 , 1.0067286 ],
       [0.53939807, 0.07508138, 1.00371476],
       [0.53833294, 0.07371045, 1.00295544],
       [0.53872299, 0.07407235, 1.00337363],
       [0.53935261, 0.07469791, 1.00400731]])

In [7]:
# ARIMA(1,1,1) with a single exogenous series: fit on the 2000-based data, then
# predict 5 steps from the 2050-based last window (target and exog) using the
# 2100-based exogenous values.
# NOTE(review): both `alpha` and `interval` are passed here; presumably `alpha`
# takes precedence - confirm against the skforecast documentation.
forecaster = ForecasterSarimax(
    regressor = ARIMA(order=(1,1,1), maxiter=1000)
)
forecaster.fit(y=y_datetime, exog=exog_datetime)
predictions = forecaster.predict_interval(
    steps            = 5,
    exog             = exog_predict_datetime,
    last_window      = lw_datetime,
    last_window_exog = lw_exog_datetime,
    alpha            = 0.05,
    interval         = [1, 99]
)
# Show the raw array so the numbers can be copied into expected values.
predictions.values

array([[0.61420452, 0.17100769, 1.05740135],
       [0.45514546, 0.0069039 , 0.90338703],
       [0.50296609, 0.05453606, 0.95139611],
       [0.51581947, 0.06737283, 0.96426612],
       [0.56003087, 0.11158168, 1.00848007]])

In [8]:
# Exogenous preprocessing: standardise 'exog_1', one-hot encode 'exog_2', and
# pass any remaining column through unchanged.
transformer_exog = ColumnTransformer(
    [
        ('scale', StandardScaler(), ['exog_1']),
        ('onehot', OneHotEncoder(), ['exog_2'])
    ],
    remainder = 'passthrough',
    verbose_feature_names_out = False
)

# ARIMA(1,1,1) with a scaled target and the exogenous transformer above.
forecaster = ForecasterSarimax(
    regressor        = ARIMA(order=(1,1,1), maxiter=1000),
    transformer_y    = StandardScaler(),
    transformer_exog = transformer_exog
)
forecaster.fit(y=y_datetime, exog=df_exog_datetime)

# NOTE(review): both `alpha` and `interval` are passed here; presumably `alpha`
# takes precedence - confirm against the skforecast documentation.
predictions = forecaster.predict_interval(
    steps            = 5,
    exog             = df_exog_predict_datetime,
    last_window      = lw_datetime,
    last_window_exog = df_lw_exog_datetime,
    alpha            = 0.05,
    interval         = [1, 99]
)
# Show the raw array so the numbers can be copied into expected values.
predictions.values



array([[1.08795272, 0.65953954, 1.51636591],
       [0.95925274, 0.52900782, 1.38949766],
       [1.01602516, 0.58573837, 1.44631196],
       [1.03637173, 0.60608219, 1.46666126],
       [1.08747734, 0.65718757, 1.51776712]])

In [78]:
# NOTE(review): `alpha` and `interval` are not referenced by the rest of this
# cell or by the cells visible after it - presumably leftovers from the
# predict_interval experiments; confirm before removing.
alpha = 0.05
interval= [1, 99]

# NOTE(review): `mean_absolute_error` is imported, but the search in this cell
# receives its metric as the string 'mean_absolute_error'.
from skforecast.model_selection_sarimax.model_selection_sarimax import random_search_sarimax
from sklearn.metrics import mean_absolute_error

def test_output_random_search_sarimax_sarimax_with_mocked():
    """
    Compare the results table returned by random_search_sarimax for a
    ForecasterSarimax against previously recorded (mocked) values
    (mocked done in Skforecast v0.7.0).
    """
    forecaster = ForecasterSarimax(
        regressor = ARIMA(order=(1,1,1), maxiter=1000)
    )

    # Build three candidate (p, d, q) orders. With the global seed fixed, three
    # successive draws of three integers in [0, 4) give the p, d and q values.
    np.random.seed(123)
    p_draw, d_draw, q_draw = (
        np.random.randint(0, high=4, size=3, dtype=int) for _ in range(3)
    )
    values = list(zip(p_draw, d_draw, q_draw))

    param_distributions = {'order': values}

    # n_iter is larger than the number of candidates built above; only the
    # `order` values listed in `param_distributions` are available to sample.
    results = random_search_sarimax(
        forecaster          = forecaster,
        y                   = y_datetime,
        param_distributions = param_distributions,
        n_iter              = 10,
        random_state        = 123,
        metric              = 'mean_absolute_error',
        steps               = 3,
        initial_train_size  = len(y_datetime)-12,
        fixed_train_size    = False,
        refit               = False,
        return_best         = False,
        verbose             = False
    )

    # Recorded outcome; the index labels are kept in the recorded row order.
    expected_orders = [(1, 0, 1), (2, 2, 2), (2, 2, 3)]
    expected_results = pd.DataFrame(
        data  = {
            'params': np.array(
                [{'order': order} for order in expected_orders],
                dtype = object
            ),
            'mean_absolute_error': np.array([0.203796, 0.250172, 0.265417]),
            'order': expected_orders
        },
        index = np.array([1, 0, 2])
    )

    pd.testing.assert_frame_equal(results, expected_results, atol=0.001)

In [79]:
# Run the test; its final frame comparison raises if the search results differ
# from the expected frame, so a silent run means the check passed.
test_output_random_search_sarimax_sarimax_with_mocked()



Number of models compared: 3.


loop param_grid: 100%|██████████████████████████████████████| 3/3 [00:00<00:00,  3.66it/s]


In [74]:
# Scratch run of the random search, used to inspect the results table.
forecaster = ForecasterSarimax(
    regressor = ARIMA(order=(1,1,1), maxiter=1000)
)

# Build three candidate (p, d, q) orders. With the global seed fixed, three
# successive draws of three integers in [0, 4) give the p, d and q values.
np.random.seed(123)
values = list(zip(*[
    np.random.randint(0, high=4, size=3, dtype=int) for _ in range(3)
]))

param_distributions = {'order': values}

# n_iter is larger than the number of candidates built above; only the
# `order` values listed in `param_distributions` are available to sample.
results = random_search_sarimax(
    forecaster          = forecaster,
    y                   = y_datetime,
    param_distributions = param_distributions,
    n_iter              = 10,
    random_state        = 123,
    metric              = 'mean_absolute_error',
    steps               = 3,
    initial_train_size  = len(y_datetime)-12,
    fixed_train_size    = False,
    refit               = False,
    return_best         = False,
    verbose             = False
)
results



Number of models compared: 3.


loop param_grid: 100%|██████████████████████████████████████| 3/3 [00:00<00:00,  3.68it/s]


Unnamed: 0,params,mean_absolute_error,order
1,"{'order': (1, 0, 1)}",0.203796,"(1, 0, 1)"
0,"{'order': (2, 2, 2)}",0.250172,"(2, 2, 2)"
2,"{'order': (2, 2, 3)}",0.265417,"(2, 2, 3)"


In [75]:
# Expected results table: one row per candidate order, with the parameter dict,
# the recorded mean absolute error and the order tuple. Index labels are given
# in the recorded row order (1, 0, 2).
expected_results = pd.DataFrame(
    data  = {
        'params': np.array(
            [{'order': order} for order in [(1, 0, 1), (2, 2, 2), (2, 2, 3)]],
            dtype = object
        ),
        'mean_absolute_error': np.array([0.203796, 0.250172, 0.265417]),
        'order': [(1, 0, 1), (2, 2, 2), (2, 2, 3)]
    },
    index = np.array([1, 0, 2])
)

expected_results

Unnamed: 0,params,mean_absolute_error,order
1,"{'order': (1, 0, 1)}",0.203796,"(1, 0, 1)"
0,"{'order': (2, 2, 2)}",0.250172,"(2, 2, 2)"
2,"{'order': (2, 2, 3)}",0.265417,"(2, 2, 3)"
