### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from trainer_lib import load_country_wide_dataset, TSMWrapper
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
import os

# Seed NumPy's legacy global generator so plain np.random.* calls are repeatable.
np.random.seed(seed=2909231846)
# Separate generator object that this notebook hands to every RandomForestRegressor.
randomstate = np.random.RandomState(seed=131002)

### Load data

In [None]:
# Read the country-wide dataset through the project's loader (trainer_lib).
df: pd.DataFrame = load_country_wide_dataset('../data/country_data.csv')

# Every column is an input feature; 'el_load' alone is the prediction target.
X = df.to_numpy(dtype=np.float32)
y = df['el_load'].to_numpy(dtype=np.float32)

# Sliding windows: 24 consecutive rows (flattened) -> the next 3 'el_load' values.
# NOTE(review): rows look hourly (later cells treat 24*30 rows as one month) — confirm.
# NOTE(review): the last possible window (start = len(X)-27) is not generated;
# kept as-is so results stay comparable with earlier runs.
n_windows = len(X) - 24 - 3
rf_X = np.zeros((n_windows, 24 * X.shape[1]))
rf_y = np.zeros((n_windows, 3))
for start in range(n_windows):
    target_start = start + 24
    rf_X[start] = X[start:target_start].ravel()
    rf_y[start] = y[target_start:target_start + 3]

### Grid search

I'll first use grid search to get an idea of how to tune the parameters, then I'll go ahead and define some myself.

In [None]:
# os.cpu_count() may return None, and on machines with <= 2 cores `n_cpu-2`
# would be 0 (rejected by joblib as n_jobs) or negative (which joblib reads
# as "all cores but k", not as a worker count).
detected_cpus = os.cpu_count() or 1
# Later cells pass `n_jobs=n_cpu-2`, so clamp n_cpu to keep that expression >= 1.
# On machines with 3+ cores this is the plain CPU count, exactly as before.
n_cpu = max(detected_cpus, 3)
print(f"Number of CPUs in the system: {detected_cpus}, n_jobs will be set to {n_cpu-2}")

# 6-fold time-series cross-validation (each test fold lies after its training data).
tscv = TimeSeriesSplit(n_splits=6)
# Materialise the folds: .split() returns a one-shot generator, which would be
# exhausted after the first fit and break a re-run of the grid-search cell.
split = list(tscv.split(rf_X, rf_y))

In [None]:
# Search space: 2 * 2 * 2 * 3 = 24 hyper-parameter combinations.
# Fitting 6 folds for each of 24 candidates, totalling 144 fits
param_grid = dict(
    n_estimators=[150, 300],
    max_depth=[5, 15],
    min_samples_split=[2, 5],
    max_features=['log2', 'sqrt', 0.5],
)

# Parallelism lives in the forest itself (n_jobs), not in the search.
base_forest = RandomForestRegressor(n_jobs=n_cpu-2, random_state=randomstate)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    cv=split,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=4,
)

# fit() returns the fitted search object, so `results` refers to `grid_search`.
results = grid_search.fit(rf_X, rf_y)

In [None]:
# Summarise the search: the winning combination and its mean cross-validated score.
# The scorer is 'neg_root_mean_squared_error', so the score is a negated RMSE
# (closer to zero is better).
print(
    f"Best parameters: {results.best_params_}",
    f"Best score: {results.best_score_}",
    sep="\n",
)

# Output recorded from an earlier run:
# Best parameters: {'max_depth': 15, 'max_features': 0.5, 'min_samples_split': 2, 'n_estimators': 300}
# Best score: -106.06095301049247

### Estimator number tuning and train size limit

300 estimators score only marginally better, so it might be worth using fewer.
The models seem to fit worse and worse as the training data size increases past the 2nd fold, so let's try to limit it. Maybe we are looking too far into the past.

In [None]:
# Same 6-fold time-series CV, but each training window is capped at a third of the samples.
max_train_rows = rf_X.shape[0] // 3
tscv = TimeSeriesSplit(n_splits=6, max_train_size=max_train_rows)
split = tscv.split(rf_X, rf_y)

# Best grid-search settings, except with 150 trees instead of 300.
forest = RandomForestRegressor(
    n_estimators=150,
    max_depth=15,
    max_features=0.5,
    min_samples_split=2,
    n_jobs=n_cpu-2,
    random_state=randomstate,
)
# One negated-RMSE value per fold.
score = cross_val_score(
    forest,
    rf_X, rf_y,
    cv=split,
    scoring='neg_root_mean_squared_error',
    verbose=4,
)

There is a mild improvement, but not much. I still want to test fewer estimators and maybe bring back the covid column.

### Covid column

In [None]:
# Build the covid indicator on a copy so the shared `df` is not mutated in place:
# re-running this cell (or an earlier windowing cell) then gives the same result.
# Later cells drop 'covid' with errors='ignore', so they are unaffected.
df_covid = df.copy()
# Indicator column: 1.0 inside the date window below, 0.0 elsewhere.
# NOTE(review): the label slice assumes `df` has a sorted DatetimeIndex — confirm against the loader.
df_covid['covid'] = 0.0
df_covid.loc['2020-03-11 00:00:00':'2022-03-7 23:00:00', 'covid'] = 1.0

# Rebuild the sliding windows with the extra feature column:
# 24 consecutive rows (flattened) -> the next 3 'el_load' values.
X = df_covid.to_numpy(dtype=np.float32)
y = df_covid['el_load'].to_numpy(dtype=np.float32)
n_windows = len(X) - 24 - 3
rf_X = np.zeros((n_windows, 24 * X.shape[1]))
rf_y = np.zeros((n_windows, 3))
for i in range(n_windows):
    rf_X[i] = X[i:i+24].flatten()
    rf_y[i] = y[i+24:i+24+3]

tscv = TimeSeriesSplit(n_splits=6, max_train_size=rf_X.shape[0]//2) # I'll give it more train data than before
split = tscv.split(rf_X, rf_y)

# 3 * 1 * 1 * 2 = 6 candidates; only the tree count and feature fraction vary.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [15],
    'min_samples_split': [2],
    'max_features': [0.5, 1.0],
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=randomstate, n_jobs=n_cpu-2),
    scoring='neg_root_mean_squared_error',
    param_grid=param_grid,
    cv=split,
    verbose=4,
    return_train_score=True
)

results = grid_search.fit(rf_X, rf_y)

In [None]:
# Winning combination and its mean cross-validated score (negated RMSE, closer to 0 is better).
print(
    f"Best parameters: {results.best_params_}",
    f"Best score: {results.best_score_}",
    sep="\n",
)

### Test size tuning

It might also be worth checking smaller test sizes, since we can retrain the model every month, for example. I suspect that predictions get worse as we move away from the training data.

In [None]:
# 6 folds, training window capped at a third of the samples, fixed-size test folds.
test_rows = 24*30*2  # 2 months of test data
tscv = TimeSeriesSplit(
    n_splits=6,
    max_train_size=rf_X.shape[0] // 3,
    test_size=test_rows,
)
split = tscv.split(rf_X, rf_y)

forest = RandomForestRegressor(
    n_estimators=150,
    max_depth=15,
    max_features=0.5,
    min_samples_split=2,
    n_jobs=n_cpu-2,
    random_state=randomstate,
)
# One negated-RMSE value per fold.
score = cross_val_score(
    forest,
    rf_X, rf_y,
    cv=split,
    scoring='neg_root_mean_squared_error',
    verbose=4,
)

print(f"Mean score: {score.mean()}")

We get very inconsistent test results, so I'll increase the test size to 6 months.
I also want to try a higher depth.

### Depth tuning

In [None]:
# 6 folds, training window capped at a third of the samples, larger fixed-size test folds.
test_rows = 24*30*6  # 6 months of test data
tscv = TimeSeriesSplit(
    n_splits=6,
    max_train_size=rf_X.shape[0] // 3,
    test_size=test_rows,
)
split = tscv.split(rf_X, rf_y)

# Same forest as the previous experiment, but with deeper trees (25 instead of 15).
forest = RandomForestRegressor(
    n_estimators=150,
    max_depth=25,
    max_features=0.5,
    min_samples_split=2,
    n_jobs=n_cpu-2,
    random_state=randomstate,
)
# One negated-RMSE value per fold.
score = cross_val_score(
    forest,
    rf_X, rf_y,
    cv=split,
    scoring='neg_root_mean_squared_error',
    verbose=4,
)

print(f"Mean score: {score.mean()}")

I think depth made a difference, but test size might be working against us.
I'll also remove the limit on training size.

In [None]:
# Remove the covid indicator if it is present; errors='ignore' makes this a no-op otherwise.
df: pd.DataFrame = df.drop(columns=['covid'], errors='ignore')

# Rebuild the sliding windows without the covid feature:
# 24 consecutive rows (flattened) -> the next 3 'el_load' values.
X = df.to_numpy(dtype=np.float32)
y = df['el_load'].to_numpy(dtype=np.float32)
n_windows = len(X) - 24 - 3
rf_X = np.zeros((n_windows, 24 * X.shape[1]))
rf_y = np.zeros((n_windows, 3))
for start in range(n_windows):
    target_start = start + 24
    rf_X[start] = X[start:target_start].ravel()
    rf_y[start] = y[target_start:target_start + 3]

# Plain 6-fold time-series CV: no cap on training size, default test size.
tscv = TimeSeriesSplit(n_splits=6)
split = tscv.split(rf_X, rf_y)

# Deeper trees (50) and a larger feature fraction (0.75); min_samples_split is left at its default.
forest = RandomForestRegressor(
    n_estimators=150,
    max_depth=50,
    max_features=0.75,
    n_jobs=n_cpu-2,
    random_state=randomstate,
)
# One negated-RMSE value per fold.
score = cross_val_score(
    forest,
    rf_X, rf_y,
    cv=split,
    scoring='neg_root_mean_squared_error',
    verbose=4,
)

print(f"Mean score: {score.mean()}")

In [None]:
# Remove the covid indicator if it is present; errors='ignore' makes this a no-op otherwise.
df: pd.DataFrame = df.drop(columns=['covid'], errors='ignore')

# Rebuild the sliding windows so this cell does not rely on an earlier cell's arrays:
# 24 consecutive rows (flattened) -> the next 3 'el_load' values.
X = df.to_numpy(dtype=np.float32)
y = df['el_load'].to_numpy(dtype=np.float32)
n_windows = len(X) - 24 - 3
rf_X = np.zeros((n_windows, 24 * X.shape[1]))
rf_y = np.zeros((n_windows, 3))
for start in range(n_windows):
    target_start = start + 24
    rf_X[start] = X[start:target_start].ravel()
    rf_y[start] = y[target_start:target_start + 3]

# Final model, using the settings from the last cross-validation experiment.
# NOTE(review): `randomstate` is a shared RandomState instance and is fitted on directly
# here, so re-running this cell may produce a different forest — confirm whether an
# integer seed would be preferable.
rf = RandomForestRegressor(
    n_estimators=150,
    max_depth=50,
    max_features=0.75,
    n_jobs=n_cpu-2,
    random_state=randomstate,
)

# Chronological hold-out: the last seventh of the windows is the test set.
splitpoint = rf_X.shape[0] // 7  # to stay consistent with the other models
train_X, test_X = rf_X[:-splitpoint], rf_X[-splitpoint:]
train_y, test_y = rf_y[:-splitpoint], rf_y[-splitpoint:]
rf.fit(train_X, train_y)

# Project helper; called with predictions first, ground truth second.
TSMWrapper.print_evaluation_info(rf.predict(test_X), test_y)