### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from trainer_lib import load_country_wide_dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
import os

np.random.seed(2909231846)
randomstate = np.random.RandomState(131002)

### Load data

In [11]:
df: pd.DataFrame = load_country_wide_dataset('../data/country_data.csv')

X = df.to_numpy(dtype=np.float32)
y = df['el_load'].to_numpy(dtype=np.float32)
rf_X = np.zeros((len(X)-24-3, 24*X.shape[1]))
rf_y = np.zeros((len(X)-24-3, 3))
for i in range(len(X)-24-3):
    rf_X[i] = X[i:i+24].flatten()
    rf_y[i] = y[i+24:i+24+3]

### Grid search

I'll first use grid search to get an idea of how to tune the parameters, then I'll go ahead and define some myself.

In [12]:
n_cpu = os.cpu_count()
print(f"Number of CPUs in the system: {n_cpu}, n_jobs will be set to {n_cpu-2}")

tscv = TimeSeriesSplit(n_splits=6)
split = tscv.split(rf_X, rf_y)

param_grid = {
    'n_estimators': [150, 300],
    'max_depth': [5, 15],
    'min_samples_split': [2, 5],
    'max_features': ['log2', 'sqrt', 0.5],
}
# Fitting 6 folds for each of 24 candidates, totalling 144 fits

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=randomstate, n_jobs=n_cpu-2),
    scoring='neg_root_mean_squared_error',
    param_grid=param_grid,
    cv=split,
    verbose=4,
    return_train_score=True
)

results = grid_search.fit(rf_X, rf_y)

Number of CPUs in the system: 12, n_jobs will be set to 10
Fitting 6 folds for each of 24 candidates, totalling 144 fits
[CV 1/6] END max_depth=5, max_features=log2, min_samples_split=2, n_estimators=150;, score=(train=-205.135, test=-254.296) total time=   0.5s
[CV 2/6] END max_depth=5, max_features=log2, min_samples_split=2, n_estimators=150;, score=(train=-220.561, test=-236.658) total time=   0.9s
[CV 3/6] END max_depth=5, max_features=log2, min_samples_split=2, n_estimators=150;, score=(train=-222.252, test=-275.117) total time=   1.3s
[CV 4/6] END max_depth=5, max_features=log2, min_samples_split=2, n_estimators=150;, score=(train=-230.289, test=-320.868) total time=   2.0s
[CV 5/6] END max_depth=5, max_features=log2, min_samples_split=2, n_estimators=150;, score=(train=-241.919, test=-327.881) total time=   2.5s
[CV 6/6] END max_depth=5, max_features=log2, min_samples_split=2, n_estimators=150;, score=(train=-254.395, test=-338.429) total time=   2.9s
[CV 1/6] END max_depth=5, m

In [15]:
print(f"Best parameters: {results.best_params_}")
print(f"Best score: {results.best_score_}")

# Best parameters: {'max_depth': 15, 'max_features': 0.5, 'min_samples_split': 2, 'n_estimators': 300}
# Best score: -106.06095301049247

Best parameters: {'max_depth': 15, 'max_features': 0.5, 'min_samples_split': 2, 'n_estimators': 300}
Best score: -106.06095301049247
