In [2]:
import pickle
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import scipy.stats as stats

In [88]:
MODEL = 0

In [89]:
index = [0,3] if MODEL==0 else [1,2]
model_name = 'th_v_air' if MODEL==0 else 'el_v_sky'

base_data_train, base_data_test = np.load('../../data/training_data/training_data_1month.npy', allow_pickle=True)

base_data_train, base_data_test = base_data_train[:,:,index], base_data_test[:,:,index]
print(base_data_train.shape, base_data_test.shape)

(108, 730, 2) (12, 730, 2)


Scale base data

In [90]:
from sklearn.preprocessing import MinMaxScaler

scalers = {var_name: MinMaxScaler(feature_range=(-1,1)) for var_name in ['G.air.T', 'G.E_th_I']}

temp_var, energy_var = base_data_train[:,:,0], base_data_train[:,:,1]
temp_var_test, energy_var_test = base_data_test[:,:,0], base_data_test[:,:,1]

temp_var, temp_var_test = scalers['G.air.T'].fit_transform(temp_var), scalers['G.air.T'].fit_transform(temp_var_test)
energy_var, energy_var_test = scalers['G.E_th_I'].fit_transform(energy_var), scalers['G.E_th_I'].fit_transform(energy_var_test)

base_data_train_scaled, base_data_test_scaled = np.stack((temp_var, energy_var), axis=-1), np.stack((temp_var_test, energy_var_test), axis=-1)
print(base_data_train_scaled.shape, base_data_test_scaled.shape)

(108, 730, 2) (12, 730, 2)


PCA?

In [96]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(base_data_train_scaled.reshape(-1,2))
X_test_pca = pca.fit_transform(base_data_test_scaled.reshape(-1,2))

In [100]:
X_train_pca.shape

(78840, 2)

<h3> Use SKforecast to tune our Regression Model</h3>

In [101]:
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster 

df = pd.DataFrame(base_data_train_scaled.reshape(-1,2), columns=['G.air.T', 'G.E_th_I'])

In [102]:
print(df['G.air.T'].index.max())

78839


In [103]:
y_test = pd.Series(base_data_test_scaled[:,:,1].reshape(-1))
exog_var = pd.Series(base_data_test_scaled[:,:,0].reshape(-1))
exog_var.index = exog_var.index + df['G.air.T'].index.max() + 1

In [58]:
forecaster = ForecasterAutoreg(
                 regressor = RandomForestRegressor(random_state=123),
                 lags      = 1 # This value will be replaced in the grid search
             )

# Candidate values for lags
lags_grid = [2,6,12,24]

# Candidate values for regressor's hyperparameters
param_grid = {
    'n_estimators': [2,5,10,50,100],
    'max_depth': [3, 5, 10]
}

results_grid = grid_search_forecaster(
                   forecaster         = forecaster,
                   y                  = df['G.E_th_I'],
                   exog               = df['G.air.T'],
                   param_grid         = param_grid,
                   lags_grid          = lags_grid,
                   steps              = len(y_test),
                   refit              = False,
                   metric             = 'mean_squared_error',
                   initial_train_size = int(len(base_data_train_scaled)*0.5),
                   fixed_train_size   = False,
                   return_best        = True,
                   n_jobs             = 'auto',
                   verbose            = False
               )

results_grid.to_csv(f'../../data/models/model_history/{model_name}_rf_regr_tuning.csv')

Number of models compared: 36.


lags grid:   0%|          | 0/4 [00:00<?, ?it/s]

params grid:   0%|          | 0/9 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2] 
  Parameters: {'max_depth': 3, 'n_estimators': 10}
  Backtesting metric: 0.5515257187624828



In [59]:
results_grid

Unnamed: 0,lags,params,mean_squared_error,max_depth,n_estimators
0,"[1, 2]","{'max_depth': 3, 'n_estimators': 10}",0.551526,3,10
14,"[1, 2, 3, 4, 5, 6]","{'max_depth': 5, 'n_estimators': 100}",0.558745,5,100
17,"[1, 2, 3, 4, 5, 6]","{'max_depth': 10, 'n_estimators': 100}",0.560708,10,100
26,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'max_depth': 10, 'n_estimators': 100}",0.565015,10,100
23,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'max_depth': 5, 'n_estimators': 100}",0.56782,5,100
13,"[1, 2, 3, 4, 5, 6]","{'max_depth': 5, 'n_estimators': 50}",0.568645,5,50
16,"[1, 2, 3, 4, 5, 6]","{'max_depth': 10, 'n_estimators': 50}",0.570912,10,50
10,"[1, 2, 3, 4, 5, 6]","{'max_depth': 3, 'n_estimators': 50}",0.571092,3,50
11,"[1, 2, 3, 4, 5, 6]","{'max_depth': 3, 'n_estimators': 100}",0.571838,3,100
20,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'max_depth': 3, 'n_estimators': 100}",0.578946,3,100


In [104]:
y_test = pd.Series(base_data_test_scaled[:,:,1].reshape(-1))
exog_var = pd.Series(base_data_test_scaled[:,:,0].reshape(-1))
exog_var.index = exog_var.index + df['G.air.T'].index.max() + 1

forecaster = ForecasterAutoreg(
    regressor=RandomForestRegressor(n_estimators=10, max_depth=3),
    lags = 2
)

forecaster.fit(y=df['G.E_th_I'], exog=df['G.air.T'])

predictions = forecaster.predict(exog=exog_var, steps=len(y_test))

mse = mean_squared_error(y_test, predictions)
r2  = r2_score(y_test, predictions)

print(mse, r2)

0.4644114507259547 -0.2559899888106716


Linear Regression with Ridge regularization

In [94]:
from sklearn.linear_model import Ridge, Lasso

y_test = pd.Series(base_data_test_scaled[:,:,1].reshape(-1))
exog_var = pd.Series(base_data_test_scaled[:,:,0].reshape(-1))
exog_var.index = exog_var.index + df['G.air.T'].index.max() + 1

forecaster = ForecasterAutoreg(
    regressor=Ridge(),
    lags = 2
)

forecaster.fit(y=df['G.E_th_I'], exog=df['G.air.T'])

predictions = forecaster.predict(exog=exog_var, steps=len(y_test))

mse = mean_squared_error(y_test, predictions)
r2  = r2_score(y_test, predictions)

print(mse, r2)

0.4410275850474365 -0.07044065224413765


In [45]:
forecaster = ForecasterAutoreg(
                 regressor     = Ridge(random_state=123),
                 lags          = 1
             )

param_grid = {'alpha': np.logspace(-5, 5, 10)}
lags_grid = [2, 6, 12, 24]

results_grid = grid_search_forecaster(
                   forecaster         = forecaster,
                   y                  = df['G.E_th_I'],
                   exog               = df['G.air.T'],
                   param_grid         = param_grid,
                   lags_grid          = lags_grid,
                   steps              = len(y_test),
                   refit              = False,
                   metric             = 'mean_squared_error',
                   initial_train_size = int(len(df)*0.5),
                   fixed_train_size   = False,
                   return_best        = True,
                   n_jobs             = 'auto',
                   verbose            = False
               )

Number of models compared: 40.


lags grid:   0%|          | 0/4 [00:00<?, ?it/s]

params grid:   0%|          | 0/10 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2] 
  Parameters: {'alpha': 1e-05}
  Backtesting metric: 0.1808343266427901



In [46]:
results_grid.to_csv(f'../../data/models/model_history/{model_name}_ridge_regr_tuning.csv')
results_grid

Unnamed: 0,lags,params,mean_squared_error,alpha
0,"[1, 2]",{'alpha': 1e-05},0.180834,1e-05
1,"[1, 2]",{'alpha': 0.0001291549665014884},0.180834,0.000129
2,"[1, 2]",{'alpha': 0.0016681005372000592},0.180834,0.001668
3,"[1, 2]",{'alpha': 0.021544346900318846},0.180834,0.021544
4,"[1, 2]",{'alpha': 0.2782559402207126},0.180835,0.278256
5,"[1, 2]",{'alpha': 3.593813663804626},0.180837,3.593814
6,"[1, 2]",{'alpha': 46.41588833612782},0.180865,46.415888
20,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",{'alpha': 1e-05},0.18103,1e-05
21,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",{'alpha': 0.0001291549665014884},0.18103,0.000129
22,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",{'alpha': 0.0016681005372000592},0.18103,0.001668


In [80]:
forecaster = ForecasterAutoreg(
    regressor=Ridge(alpha=0.00010),
    lags = 12
)

forecaster.fit(y=df['G.E_th_I'], exog=df['G.air.T'])

predictions = forecaster.predict(exog=exog_var, steps=len(y_test))

mse = mean_squared_error(y_test, predictions)
r2  = r2_score(y_test, predictions)

print(mse, r2)

0.44220335496979374 -0.07329442367533212
