In [1]:
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from sklearn.linear_model import LinearRegression
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import Ridge
import pandas as pd
import numpy as np
import skforecast
import lightgbm
from xgboost import XGBRegressor
import xgboost
from numpy.random import Generator, PCG64

# Register the %%pyinstrument cell magic used by the profiling cells in this notebook
%load_ext pyinstrument

In [2]:
# Report the versions of the libraries being profiled, one per line
print(skforecast.__version__, lightgbm.__version__, xgboost.__version__, sep="\n")

0.7.0
3.3.5
1.7.4


In [10]:
# Load data
# ==============================================================================
# Read the CSV, convert `date_time` to timestamps, make it the index and
# conform the index to an hourly frequency.
data = (
    pd.read_csv("data.csv")
    .assign(date_time=lambda frame: pd.to_datetime(frame["date_time"]))
    .set_index("date_time")
    .asfreq("H")
)

# Split train-validation-test
# ==============================================================================
# The cut-offs sit at 23:59, between two hourly timestamps, so each label-based
# slice ends on the 23:00 observation and the next one starts at 00:00.
end_train = '2012-03-31 23:59:00'
end_validation = '2012-08-31 23:59:00'

data_train = data.loc[:end_train]
data_val = data.loc[end_train:end_validation]
data_test = data.loc[end_validation:]

# Date range and number of rows of each partition
print(f"Dates train      : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
print(f"Dates validacion : {data_val.index.min()} --- {data_val.index.max()}  (n={len(data_val)})")
print(f"Dates test       : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")

# Create forecaster
# ==============================================================================
# LightGBM autoregressive forecaster with `lags = 24`. This is the single
# forecaster the timing / profiling cells that follow operate on; the XGBoost
# and Ridge variants are built inside their own grid-search cells.
forecaster = ForecasterAutoreg(
    regressor=LGBMRegressor(random_state=123, n_estimators=500),
    lags=24,
)

Dates train      : 2011-01-01 00:00:00 --- 2012-03-31 23:00:00  (n=10944)
Dates validacion : 2012-04-01 00:00:00 --- 2012-08-31 23:00:00  (n=3672)
Dates test       : 2012-09-01 00:00:00 --- 2012-12-31 23:00:00  (n=2928)


In [11]:
%%timeit
forecaster.fit(y= data.loc[:end_validation, 'users'],)

641 ms ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%%pyinstrument
forecaster.predict(1000)

2012-09-01 00:00:00    122.791461
2012-09-01 01:00:00     79.588384
2012-09-01 02:00:00     44.759529
2012-09-01 03:00:00     21.803527
2012-09-01 04:00:00      5.661290
                          ...    
2012-10-12 11:00:00    151.444638
2012-10-12 12:00:00    105.380524
2012-10-12 13:00:00     82.609657
2012-10-12 14:00:00     62.766092
2012-10-12 15:00:00     45.457549
Freq: H, Name: pred, Length: 1000, dtype: float64

In [14]:
%%pyinstrument
_ = backtesting_forecaster(
                   forecaster         = forecaster,
                   y                  = data.loc[:end_validation, 'users'], # Train and validation data
                   steps              = 36,
                   refit              = True,
                   metric             = 'mean_squared_error',
                   initial_train_size = len(data_train)
)




In [25]:
%%pyinstrument
# Grid search of hyperparameters and lags
# ==============================================================================
# Regressor hyperparameters
param_grid = {
    'n_estimators': [100],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
}

# Lags used as predictors
lags_grid = [48, 72]

results_grid = grid_search_forecaster(
                   forecaster         = forecaster,
                   y                  = data.loc[:end_validation, 'users'], # Train and validation data
                   param_grid         = param_grid,
                   lags_grid          = lags_grid,
                   steps              = 36,
                   refit              = False,
                   metric             = 'mean_squared_error',
                   initial_train_size = len(data_train), # Model is trained with training data
                   fixed_train_size   = False,
                   return_best        = True,
                   verbose            = False
               )

Number of models compared: 12.


loop lags_grid: 100%|███████████████████████████████████████| 2/2 [01:33<00:00, 46.67s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72] 
  Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
  Backtesting metric: 14407.853484255475



In [26]:
%%pyinstrument
# Grid search of hyperparameters and lags
# ==============================================================================
# Regressor hyperparameters
param_grid = {
    'n_estimators': [1000],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
}

# Lags used as predictors
lags_grid = [48, 72]

results_grid = grid_search_forecaster(
                   forecaster         = forecaster,
                   y                  = data.loc[:end_validation, 'users'], # Train and validation data
                   param_grid         = param_grid,
                   lags_grid          = lags_grid,
                   steps              = 36,
                   refit              = False,
                   metric             = 'mean_squared_error',
                   initial_train_size = len(data_train), # Model is trained with training data
                   fixed_train_size   = False,
                   return_best        = True,
                   verbose            = False
               )

Number of models compared: 12.


loop lags_grid: 100%|██████████████████████████████████████| 2/2 [05:00<00:00, 150.08s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72] 
  Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}
  Backtesting metric: 13605.107261105946



In [6]:
%%pyinstrument


forecaster = ForecasterAutoreg(
                 regressor = XGBRegressor(random_state=123, n_estimators=500),
                 lags = 24
             )

             
# Grid search of hyperparameters and lags
# ==============================================================================
# Regressor hyperparameters
param_grid = {
    'n_estimators': [100, 500],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
}

# Lags used as predictors
lags_grid = [48, 72]

results_grid = grid_search_forecaster(
                   forecaster         = forecaster,
                   y                  = data.loc[:end_validation, 'users'], # Train and validation data
                   param_grid         = param_grid,
                   lags_grid          = lags_grid,
                   steps              = 36,
                   refit              = False,
                   metric             = 'mean_squared_error',
                   initial_train_size = len(data_train), # Model is trained with training data
                   fixed_train_size   = False,
                   return_best        = True,
                   verbose            = False
               )

Number of models compared: 24.


loop lags_grid: 100%|██████████████████████████████████████| 2/2 [03:21<00:00, 100.54s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72] 
  Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}
  Backtesting metric: 13132.573900322006



In [12]:
%%pyinstrument


forecaster = ForecasterAutoreg(
                 regressor = LGBMRegressor(random_state=123, n_estimators=500),
                 lags = 24
             )

             
# Grid search of hyperparameters and lags
# ==============================================================================
# Regressor hyperparameters
param_grid = {
    'n_estimators': [100, 500],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
}

# Lags used as predictors
lags_grid = [48, 72]

results_grid = grid_search_forecaster(
                   forecaster         = forecaster,
                   y                  = data.loc[:end_validation, 'users'], # Train and validation data
                   param_grid         = param_grid,
                   lags_grid          = lags_grid,
                   steps              = 36,
                   refit              = False,
                   metric             = 'mean_squared_error',
                   initial_train_size = len(data_train), # Model is trained with training data
                   fixed_train_size   = False,
                   return_best        = False,
                   verbose            = False
               )

Number of models compared: 24.


loop lags_grid: 100%|███████████████████████████████████████| 2/2 [00:35<00:00, 17.80s/it]


In [11]:
%%pyinstrument


forecaster = ForecasterAutoreg(
                 regressor = Ridge(random_state=123,),
                 lags = 24
             )

             
# Grid search of hyperparameters and lags
# ==============================================================================
# Regressor hyperparameters
param_grid = {
 'alpha': [1,0.1,0.01,0.001,0.0001,0],
 "fit_intercept": [True, False],
 "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
 }

# Lags used as predictors
lags_grid = [48, 72]

results_grid = grid_search_forecaster(
                   forecaster         = forecaster,
                   y                  = data.loc[:end_validation, 'users'], # Train and validation data
                   param_grid         = param_grid,
                   lags_grid          = lags_grid,
                   steps              = 36,
                   refit              = False,
                   metric             = 'mean_squared_error',
                   initial_train_size = len(data_train), # Model is trained with training data
                   fixed_train_size   = False,
                   return_best        = False,
                   verbose            = False
               )

Number of models compared: 144.


loop lags_grid: 100%|███████████████████████████████████████| 2/2 [01:01<00:00, 30.51s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72] 
  Parameters: {'alpha': 0, 'fit_intercept': False, 'solver': 'sag'}
  Backtesting metric: 15698.22355206917



In [27]:
# Print the package versions and platform details of the session as plain text
# (html=False).
# NOTE(review): this import sits outside the import cell at the top of the
# notebook; consider moving it there so all dependencies are visible in one place.
import session_info
session_info.show(html=False)

-----
lightgbm            3.3.5
numpy               1.23.5
pandas              1.5.3
pyinstrument        4.4.0
session_info        1.0.0
skforecast          0.7.0
sklearn             1.2.1
xgboost             1.7.4
-----
IPython             8.10.0
jupyter_client      8.0.3
jupyter_core        5.2.0
notebook            6.5.4
-----
Python 3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 13:20:59) [MSC v.1916 64 bit (AMD64)]
Windows-10-10.0.19045-SP0
-----
Session information updated at 2023-05-22 21:06
