In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Add the parent of the current working directory to the import path
# (presumably so the local skforecast checkout is the one imported — confirm).
sys.path.insert(1, str(Path.cwd().parent))
# Display the directory that was just added, as a quick sanity check.
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

# Understanding joblib multiprocess

In [2]:
from joblib import Parallel, delayed
import multiprocessing
import pandas as pd
import numpy as np
import time
from tqdm.auto import tqdm

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from skforecast.model_selection import _create_backtesting_folds

# Create a dataframe of random integers in [0, 100) with `n` rows and 30 columns
n=10_000
df = pd.DataFrame(np.random.randint(0,100,size=(n, 30)))

# Build the backtesting folds: the first half of the rows is the initial
# training set, each test window has 24 rows, and the training window is
# not fixed in size (`fixed_train_size=False`).
backtesting_folds = _create_backtesting_folds(
    data = df,
    initial_train_size=int(n/2),
    test_size=24,
    refit=True,
    return_all_indexes=True,
    fixed_train_size=False,
    verbose=False
)
# Keep only the first two entries of every fold: the train and test index ranges.
backtesting_folds = [fold[:2] for fold in backtesting_folds]
print(len(backtesting_folds))
backtesting_folds

209


[[range(0, 5000), range(5000, 5024)],
 [range(0, 5024), range(5024, 5048)],
 [range(0, 5048), range(5048, 5072)],
 [range(0, 5072), range(5072, 5096)],
 [range(0, 5096), range(5096, 5120)],
 [range(0, 5120), range(5120, 5144)],
 [range(0, 5144), range(5144, 5168)],
 [range(0, 5168), range(5168, 5192)],
 [range(0, 5192), range(5192, 5216)],
 [range(0, 5216), range(5216, 5240)],
 [range(0, 5240), range(5240, 5264)],
 [range(0, 5264), range(5264, 5288)],
 [range(0, 5288), range(5288, 5312)],
 [range(0, 5312), range(5312, 5336)],
 [range(0, 5336), range(5336, 5360)],
 [range(0, 5360), range(5360, 5384)],
 [range(0, 5384), range(5384, 5408)],
 [range(0, 5408), range(5408, 5432)],
 [range(0, 5432), range(5432, 5456)],
 [range(0, 5456), range(5456, 5480)],
 [range(0, 5480), range(5480, 5504)],
 [range(0, 5504), range(5504, 5528)],
 [range(0, 5528), range(5528, 5552)],
 [range(0, 5552), range(5552, 5576)],
 [range(0, 5576), range(5576, 5600)],
 [range(0, 5600), range(5600, 5624)],
 [range(0, 5

In [3]:
def _fit_predict_1(df, folds, regressor=LinearRegression()):
    """
    Fit forecaster and predict `steps` ahead. This function is used to test
    the parallelization of the backtesting_forecaster function. The whole
    dataframe is passed to the function, but only the train and test sets
    corresponding to the fold are used.
    """
    regressor = LinearRegression()
    regressor.fit(X=df.iloc[folds[0], 1:], y=df.iloc[folds[0], 0])
    pred = regressor.predict(df.iloc[folds[1], 1:])

    return pred


def _fit_predict_2(data_train, data_test, regressor=LinearRegression()):
    """
    Fit forecaster and predict `steps` ahead. This function is used to test
    the parallelization of the backtesting_forecaster function. Only the data
    corresponding to the fold are passed to the function.
    """
    regressor = LinearRegression()
    regressor.fit(X=data_train.iloc[:, 1:], y=data_train.iloc[:, 0])
    pred = regressor.predict(X=data_test.iloc[:, 1:])

    return pred

In [45]:
# Settings shared by the timing cells: the estimator handed to the helper
# functions and the number of joblib workers (one per CPU reported by
# `multiprocessing.cpu_count()`).
regressor = LinearRegression()
n_jobs  = multiprocessing.cpu_count()

In [None]:
%%timeit
# Sequential execution _fit_predict_1: folds are processed one after another in
# a plain list comprehension; every call receives the full dataframe plus the
# index ranges of its fold.
results = [_fit_predict_1(df, folds, regressor) for folds in backtesting_folds]

In [27]:
%%timeit
# Parallel execution _fit_predict_1: one joblib task per fold, run with
# `n_jobs` workers; the full dataframe is an argument of every task.
results = (
    Parallel(n_jobs=n_jobs)
    (delayed(_fit_predict_1)(df, folds, regressor) for folds in backtesting_folds)
)

920 ms ± 5.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
%%timeit
# Sequential execution _fit_predict_2: the train and test rows are sliced out
# of `df` inside the timed loop, so the slicing cost is part of the measurement.
results = [
    _fit_predict_2(df.iloc[folds[0],:], df.iloc[folds[1],:], regressor)
    for folds in backtesting_folds
]

3.36 s ± 4.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
%%timeit
# Parallel execution _fit_predict_2: the slices are built while the tasks are
# generated, so each joblib task receives only the rows of its own fold.
results = (
    Parallel(n_jobs=n_jobs)
    (delayed(_fit_predict_2)(df.iloc[folds[0],:], df.iloc[folds[1],:], regressor)
    for folds in backtesting_folds)
)

1.35 s ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
# Slice the [train, test] dataframes of every fold once, outside the timed
# cells, so the timings that use `partitions_backtesting` exclude slicing.
partitions_backtesting = [[df.iloc[folds[0],:], df.iloc[folds[1],:]] for folds in backtesting_folds]

In [34]:
%%timeit
# Sequential execution _fit_predict_2 on the pre-sliced partitions: only the
# fit/predict calls are inside the timed loop.
results = [
    _fit_predict_2(partition[0], partition[1], regressor)
    for partition in partitions_backtesting
]

2.35 s ± 6.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
%%timeit
# Parallel execution _fit_predict_2 on the pre-sliced partitions: one joblib
# task per partition, run with `n_jobs` workers.
results = (
    Parallel(n_jobs=n_jobs)
    (delayed(_fit_predict_2)(partition[0], partition[1], regressor)
    for partition in partitions_backtesting)
)

925 ms ± 7.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [61]:
# Benchmark configuration
# ==============================================================================
n = 5_000
df = pd.DataFrame(np.random.randint(0, 100, size=(n, 30)))
print(f"Shape of dataframe: {df.shape}")
backtesting_folds = _create_backtesting_folds(
    data               = df,
    initial_train_size = n // 2,
    test_size          = 24,
    refit              = True,
    return_all_indexes = True,
    fixed_train_size   = False,
    verbose            = False
)
# Keep only the train and test index ranges of every fold.
backtesting_folds = [fold[:2] for fold in backtesting_folds]
print(f"Number of folds: {len(backtesting_folds)}")

n_jobs = multiprocessing.cpu_count()
print(f"Number of CPUs: {n_jobs}")
# Pre-sliced [train, test] dataframes, built once outside the timed sections.
partitions_backtesting = [
    [df.iloc[folds[0], :], df.iloc[folds[1], :]] for folds in backtesting_folds
]
regressor = HistGradientBoostingRegressor()


def _timed(label, task):
    """
    Run `task()` once, print the elapsed time under `label` and return the
    value produced by `task`.

    `time.perf_counter` is used instead of `time.time` because it is a
    monotonic, high-resolution clock that is not affected by system clock
    adjustments, which is what a benchmark needs.
    """
    start = time.perf_counter()
    output = task()
    elapsed = time.perf_counter() - start
    print(f"Elapsed time {label}: {elapsed:.2f} seconds")

    return output


# Benchmarking
# ==============================================================================
print("")
print("_fit_predict_1")
print("==============")
# Full dataframe + fold indexes passed to every call.
results = _timed(
    "_fit_predict_1 sequential",
    lambda: [
        _fit_predict_1(df, folds, regressor)
        for folds in tqdm(backtesting_folds)
    ]
)
results = _timed(
    "_fit_predict_1 parallel",
    lambda: Parallel(n_jobs=n_jobs)(
        delayed(_fit_predict_1)(df, folds, regressor)
        for folds in tqdm(backtesting_folds)
    )
)
print("")

print("_fit_predict_2")
print("==============")
# Train/test rows sliced on the fly, so slicing is part of the measured time.
results = _timed(
    "_fit_predict_2 sequential",
    lambda: [
        _fit_predict_2(df.iloc[folds[0], :], df.iloc[folds[1], :], regressor)
        for folds in tqdm(backtesting_folds)
    ]
)
results = _timed(
    "_fit_predict_2 parallel",
    lambda: Parallel(n_jobs=n_jobs)(
        delayed(_fit_predict_2)(df.iloc[folds[0], :], df.iloc[folds[1], :], regressor)
        for folds in tqdm(backtesting_folds)
    )
)
print("")


print("_fit_predict_2: list of partitions")
print("==============")
# Pre-sliced partitions, so only fit/predict is inside the measured time.
results = _timed(
    "_fit_predict_2 sequential",
    lambda: [
        _fit_predict_2(partition[0], partition[1], regressor)
        for partition in tqdm(partitions_backtesting)
    ]
)
results = _timed(
    "_fit_predict_2 parallel",
    lambda: Parallel(n_jobs=n_jobs)(
        delayed(_fit_predict_2)(partition[0], partition[1], regressor)
        for partition in tqdm(partitions_backtesting)
    )
)
print("")


Shape of dataframe: (50000, 30)
Number of folds: 1042
Number of CPUs: 8

_fit_predict_1


  0%|          | 0/1042 [00:00<?, ?it/s]

Elapsed time _fit_predict_1 sequential: 64.37 seconds


  0%|          | 0/1042 [00:00<?, ?it/s]

Elapsed time _fit_predict_1 parallel: 34.65 seconds

_fit_predict_2


  0%|          | 0/1042 [00:00<?, ?it/s]

Elapsed time _fit_predict_2 sequential: 51.10 seconds


  0%|          | 0/1042 [00:00<?, ?it/s]

Elapsed time _fit_predict_2 parallel: 33.09 seconds

_fit_predict_2: list of partitions


  0%|          | 0/1042 [00:00<?, ?it/s]

Elapsed time _fit_predict_2 sequential: 40.99 seconds


  0%|          | 0/1042 [00:00<?, ?it/s]

Elapsed time _fit_predict_2 parallel: 32.74 seconds



## Parallel backtesting

In [7]:
from typing import Union, Tuple, Optional, Any, Callable
import numpy as np
import pandas as pd
import warnings
import logging
from copy import deepcopy
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import ParameterSampler
import optuna
from optuna.samplers import TPESampler, RandomSampler

from skforecast.exceptions import LongTrainingWarning
from skforecast.exceptions import IgnoredArgumentWarning
from skforecast.utils import check_backtesting_input

from joblib import Parallel, delayed, cpu_count

from skforecast.model_selection import _backtesting_forecaster_refit
from skforecast.model_selection import _backtesting_forecaster_verbose
from skforecast.model_selection import _get_metric
from skforecast.ForecasterAutoreg import ForecasterAutoreg


In [8]:
def _backtesting_forecaster_refit_parallel(
    forecaster,
    y: pd.Series,
    steps: int,
    metric: Union[str, Callable, list],
    initial_train_size: int,
    fixed_train_size: bool=True,
    gap: int=0,
    allow_incomplete_fold: bool=True,
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None,
    interval: Optional[list]=None,
    n_boot: int=500,
    random_state: int=123,
    in_sample_residuals: bool=True,
    verbose: bool=False,
    n_jobs: int=1,
    show_progress: bool=True
) -> Tuple[Union[float, list], pd.DataFrame]:
    """
    Backtesting of forecaster model with a re-fitting strategy, with the folds
    dispatched through joblib. A copy of the original forecaster is created so
    it is not modified during the process.

    For each fold:
        - The forecaster is fitted with the training set of the fold.
        - The test window of the fold (including the `gap`) is predicted.
        - The first `gap` predictions are discarded.

    The predictions of all folds are concatenated and the metric(s) are
    computed on the result.

    In order to apply backtesting with refit, an initial training set must be
    available. `initial_train_size` must be provided.

    Parameters
    ----------
    forecaster : ForecasterAutoreg, ForecasterAutoregCustom, ForecasterAutoregDirect
        Forecaster model.
    y : pandas Series
        Training time series.
    steps : int
        Number of steps to predict.
    metric : str, Callable, list
        Metric used to quantify the goodness of fit of the model.

            - If string: {'mean_squared_error', 'mean_absolute_error',
             'mean_absolute_percentage_error', 'mean_squared_log_error'}
            - If Callable: Function with arguments y_true, y_pred that returns a float.
            - If list: List containing multiple strings and/or Callables.
    initial_train_size : int
        Number of samples in the initial train split. The backtest forecaster is
        trained using the first `initial_train_size` observations.
    fixed_train_size : bool, default `True`
        If True, train size doesn't increase but moves by `steps` in each iteration.
    gap : int, default `0`
        Number of samples to be excluded after the end of each training set and
        before the test set.
    allow_incomplete_fold : bool, default `True`
        Last fold is allowed to have a smaller number of samples than the
        `test_size`. If `False`, the last fold is excluded.
    exog : pandas Series, pandas DataFrame, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and should be aligned so that y[i] is
        regressed on exog[i].
    interval : list, default `None`
        Confidence of the prediction interval estimated. Sequence of percentiles
        to compute, which must be between 0 and 100 inclusive. For example,
        interval of 95% should be as `interval = [2.5, 97.5]`. If `None`, no
        intervals are estimated.
    n_boot : int, default `500`
        Number of bootstrapping iterations used to estimate prediction
        intervals.
    random_state : int, default `123`
        Sets a seed to the random generator, so that boot intervals are always
        deterministic.
    in_sample_residuals : bool, default `True`
        If `True`, residuals from the training data are used as proxy of prediction
        error to create prediction intervals. If `False`, out_sample_residuals
        are used if they are already stored inside the forecaster.
    verbose : bool, default `False`
        Print number of folds and index of training and validation sets used
        for backtesting.
    n_jobs : int, default `1`
        Number of jobs to run in parallel. Any value lower than 1 (for example
        -1) sets the number of jobs to the number of cores.
    show_progress : bool, default `True`
        Whether to show a progress bar while the folds are dispatched.

    Returns
    -------
    metrics_values : float, list
        Value(s) of the metric(s).
    backtest_predictions : pandas Dataframe
        Value of predictions and their estimated interval if `interval` is not `None`.

            - column pred: predictions.
            - column lower_bound: lower bound of the interval.
            - column upper_bound: upper bound of the interval.

    """

    n_jobs = n_jobs if n_jobs > 0 else cpu_count()
    forecaster = deepcopy(forecaster)

    # Normalise `metric` to a list of callables; strings are resolved by name.
    if not isinstance(metric, list):
        metrics = [_get_metric(metric=metric) if isinstance(metric, str) else metric]
    else:
        metrics = [_get_metric(metric=m) if isinstance(m, str) else m for m in metric]

    # With `return_all_indexes=False` every fold entry is a [start, end] pair
    # that is used below for positional slicing.
    folds = _create_backtesting_folds(
                data                  = y,
                test_size             = steps,
                initial_train_size    = initial_train_size,
                gap                   = gap,
                refit                 = True,
                fixed_train_size      = fixed_train_size,
                allow_incomplete_fold = allow_incomplete_fold,
                return_all_indexes    = False,
                verbose               = verbose
            )

    if type(forecaster).__name__ != 'ForecasterAutoregDirect' and len(folds) > 50:
        warnings.warn(
            (f"The forecaster will be fit {len(folds)} times. This can take substantial"
             f" amounts of time. If not feasible, try with `refit = False`.\n"),
            LongTrainingWarning
        )
    elif type(forecaster).__name__ == 'ForecasterAutoregDirect' and len(folds)*forecaster.steps > 50:
        warnings.warn(
            (f"The forecaster will be fit {len(folds)*forecaster.steps} times "
             f"({len(folds)} folds * {forecaster.steps} regressors). This can take "
             f"substantial amounts of time. If not feasible, try with `refit = False`.\n"),
             LongTrainingWarning
        )

    # In-sample residuals are only needed when prediction intervals are requested.
    store_in_sample_residuals = interval is not None

    def _fit_predict_forecaster(y, exog, forecaster, interval, fold):
        """
        Fit the forecaster on the training window of `fold` and predict its
        test window, dropping the first `gap` predictions.
        """
        train_idx_start = fold[0][0]
        train_idx_end   = fold[0][1]
        test_idx_start  = fold[1][0]
        test_idx_end    = fold[1][1]

        y_train = y.iloc[train_idx_start:train_idx_end, ]
        exog_train = exog.iloc[train_idx_start:train_idx_end, ] if exog is not None else None
        next_window_exog = exog.iloc[test_idx_start:test_idx_end, ] if exog is not None else None

        forecaster.fit(
            y                         = y_train,
            exog                      = exog_train,
            store_in_sample_residuals = store_in_sample_residuals
        )
        # Length of the test window of this fold; it is derived from the fold
        # itself rather than from the outer `steps` argument.
        steps = len(range(test_idx_start, test_idx_end))
        if interval is None:
            pred = forecaster.predict(steps=steps, exog=next_window_exog)
        else:
            pred = forecaster.predict_interval(
                       steps               = steps,
                       exog                = next_window_exog,
                       interval            = interval,
                       n_boot              = n_boot,
                       random_state        = random_state,
                       in_sample_residuals = in_sample_residuals
                   )

        pred = pred.iloc[gap:, ]

        return pred

    # Honour `show_progress`: the progress bar is only created when requested.
    folds_iterator = tqdm(folds) if show_progress else folds

    backtest_predictions = (
        Parallel(n_jobs=n_jobs)
        (delayed(_fit_predict_forecaster)
        (y=y, exog=exog, forecaster=forecaster, interval=interval, fold=fold)
        for fold in folds_iterator)
    )
    backtest_predictions = pd.concat(backtest_predictions)
    if isinstance(backtest_predictions, pd.Series):
        backtest_predictions = pd.DataFrame(backtest_predictions)

    metrics_values = [m(
                        y_true = y.loc[backtest_predictions.index],
                        y_pred = backtest_predictions['pred']
                      ) for m in metrics
                     ]

    # A single metric is returned as a scalar, a list of metrics as a list.
    if not isinstance(metric, list):
        metrics_values = metrics_values[0]

    return metrics_values, backtest_predictions

In [39]:
# Forecaster used by both the sequential and the parallel backtest. A
# HistGradientBoostingRegressor(random_state=666) forecaster used to be created
# here as well, but it was overwritten on the very next line and never used;
# swap the regressor below to benchmark a heavier model.
forecaster = ForecasterAutoreg(regressor=LinearRegression(), lags=50)

# Random integer series; everything except the last n/2 observations is the
# initial training set (only its length is used by the backtests).
n = 5_000
y = pd.Series(np.random.randint(0, 100, size=n))
y_train = y.iloc[:-int(n/2)]

In [40]:
# Reference run with skforecast's own `_backtesting_forecaster_refit`; its
# metric and predictions are the baseline the parallel variant is compared to.
metric, backtest_predictions = _backtesting_forecaster_refit(
                                    forecaster          = forecaster,
                                    y                   = y,
                                    exog                = None,
                                    initial_train_size  = len(y_train),
                                    fixed_train_size    = False,
                                    steps               = 24,
                                    metric              = 'mean_squared_error',
                                    interval            = None,
                                    n_boot              = 500,
                                    random_state        = 123,
                                    in_sample_residuals = True,
                                    verbose             = False
                            )

 


  0%|          | 0/105 [00:00<?, ?it/s]

In [41]:
# Same forecaster, series and settings as the reference run, executed with the
# joblib-based implementation; `n_jobs = -1` requests one job per core.
metric_parallel, backtest_predictions_parallel = _backtesting_forecaster_refit_parallel(
                                    forecaster          = forecaster,
                                    y                   = y,
                                    exog                = None,
                                    initial_train_size  = len(y_train),
                                    fixed_train_size    = False,
                                    steps               = 24,
                                    metric              = 'mean_squared_error',
                                    interval            = None,
                                    n_boot              = 500,
                                    random_state        = 123,
                                    in_sample_residuals = True,
                                    n_jobs              = -1,
                                    verbose             = False
                            )

 


  0%|          | 0/105 [00:00<?, ?it/s]

In [38]:
# The parallel implementation must reproduce the reference results exactly:
# same metric value and an identical predictions dataframe.
assert metric_parallel == metric
pd.testing.assert_frame_equal(backtest_predictions_parallel, backtest_predictions)