### Imports

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
import trainer_lib as tl
from torch_model_definitions import GaussianNoise

# Reproducibility: pin each global random number generator, one fixed seed per library.
np.random.seed(2909231846)    # NumPy's legacy global generator
random.seed(3009231410)       # Python's stdlib generator
torch.manual_seed(310231551)  # PyTorch's default generator
# Stand-alone NumPy generator with its own seed, separate from the global one seeded above.
np_random_state = np.random.RandomState(131002)

### Load data

In [2]:
# Load the dataset through the project loader (called here with its default options).
df: pd.DataFrame = tl.load_country_wide_dataset('../data/country_data.csv')

# X holds every column of the frame as float32 -- this includes the 'el_load' column itself.
X = df.to_numpy(dtype=np.float32)
# y is the 'el_load' column alone (presumably the electricity load to forecast -- confirm in trainer_lib).
y = df['el_load'].to_numpy(dtype=np.float32)

### Define models

In [3]:
class LSTMModel(nn.Module):
    """Stacked (optionally bidirectional) LSTM with a single linear output head.

    The final hidden state of every layer and direction is flattened, passed
    through Gaussian noise and dropout, and mapped to 3 output values per
    sample (this matches the ``pred_len=3`` the wrappers in this notebook use).

    Args:
        features: Size of the last input dimension (``input_size`` of the LSTM).
        hidden_size: Hidden state size of each LSTM layer and direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before the linear head and, when
            ``num_layers > 1``, between the LSTM layers.
        in_noise: Argument forwarded to the ``GaussianNoise`` layer applied to
            the input (presumably the noise magnitude -- confirm in
            ``torch_model_definitions``).
        hid_noise: Argument forwarded to the ``GaussianNoise`` layer applied to
            the flattened hidden state.
        bidirectional: Whether the LSTM runs in both directions.
        **kwargs: Accepted and ignored, so callers may pass unrelated keys.
    """

    def __init__(self, features=11, hidden_size=15, num_layers=2, dropout=0.0, in_noise=0.0, hid_noise=0.0, bidirectional=True, **kwargs):
        super().__init__()
        self.hidden_size = hidden_size
        self.h_n_dim = 2 if bidirectional else 1  # number of directions
        self.num_layers = num_layers
        self.in_noise = GaussianNoise(in_noise)
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer, so it is disabled there.
        rec_drop = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size=features, hidden_size=self.hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional, dropout=rec_drop)
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        self.fc = nn.Sequential(
            nn.Flatten(),
            GaussianNoise(hid_noise),
            nn.Dropout(dropout),
            # One hidden vector per (layer, direction) pair is fed to the head.
            nn.Linear(self.hidden_size * self.h_n_dim * self.num_layers, 3)
        )

    def forward(self, x):
        """Map a batch-first input ``(batch, seq_len, features)`` to ``(batch, 3)``."""
        x = self.in_noise(x)

        # No explicit (h_0, c_0): nn.LSTM initialises both to zeros on the
        # input's own device and dtype. This yields the same values as the
        # hand-built zero states, without depending on a global device constant
        # and without a device mismatch when the model lives elsewhere.
        _, (h_n, _) = self.lstm(x)

        # h_n is never batch-first, even with batch_first=True:
        # [h_n_dim * num_layers, batch, hidden_size] -> [batch, h_n_dim * num_layers, hidden_size]
        # Flatten and the linear layer expect batch to be the first dimension.
        h_n = torch.permute(h_n, (1, 0, 2))
        return self.fc(h_n)
    
    
class LSTMParams(LSTMModel):
    """LSTMModel preset chosen by a ``'<num_layers>-<hidden_size>'`` key.

    ``'3-20'`` and ``'3-15'`` select three layers with hidden size 20 or 15.
    Any other value, including the default ``'2-20'``, selects two layers with
    hidden size 20. Every preset is bidirectional and takes 11 input features.
    Extra keyword arguments are accepted and ignored.
    """

    def __init__(self, param_group='2-20', dropout=0.0, in_noise=0.0, hid_noise=0.0, **kwargs):
        # Resolve the preset to (layers, width) first, then build the parent once.
        if param_group == '3-20':
            layers, width = 3, 20
        elif param_group == '3-15':
            layers, width = 3, 15
        else:
            layers, width = 2, 20

        super().__init__(
            11,
            hidden_size=width,
            num_layers=layers,
            bidirectional=True,
            dropout=dropout,
            in_noise=in_noise,
            hid_noise=hid_noise,
        )

### Grid search

I'll first look at different model constructions, then I'll look into hyperparameters, dropouts, noise and maybe higher sequence lengths.

In [None]:
# Architecture sweep over hidden size, layer count and directionality
# (2 x 2 x 2 value combinations; presumably tl.Grid enumerates their product -- confirm in trainer_lib).
grid = tl.Grid({
    'epochs': [1000],  # we use early stopping, so this is just a high number
    'lr': [0.001],
    'model': [LSTMModel],
    'hidden_size': [15, 30],
    'num_layers': [1, 2],
    'bidirectional': [False, True],
    'n_splits': [6],
})

# The wrapper is created around a default LSTMModel.
# NOTE(review): presumably grid_search builds a fresh model from the 'model' entry for each combination -- confirm.
wrapper = tl.MIMOTSWrapper(LSTMModel(), seq_len=24, pred_len=3)
# b_p / b_s hold the best parameter set and its score; b_p is read again by the validation-size cell.
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

I need to try out different validation set sizes.

### Validation set size

In [None]:
# Freeze every parameter of the previous best result as a one-element list ...
n_p = {k: [v] for k, v in b_p.items()}
# ... and vary only 'val_mod' (the validation-set size setting) over 2..7.
n_p['val_mod'] = [2, 3, 4, 5, 6, 7]
grid = tl.Grid(n_p)

wrapper = tl.MIMOTSWrapper(LSTMModel(), seq_len=24, pred_len=3)
# NOTE: b_p is overwritten here, so re-running this cell starts from its own previous output.
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

In [None]:
# Start again from the latest best parameters, each frozen as a one-element list.
n_p = {k: [v] for k, v in b_p.items()}
# Replace the 'val_mod' candidates with the higher range 7..12 (7 is included in both ranges).
n_p['val_mod'] = [7, 8, 9, 10, 11, 12]
grid = tl.Grid(n_p)

wrapper = tl.MIMOTSWrapper(LSTMModel(), seq_len=24, pred_len=3)
# NOTE: b_p is overwritten again by this search.
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

A `val_mod` of 11 performed best, but it seems quite unstable, so I'll go with 8, which seems to be the most stable. (This will be the default in trainer_lib.py.)

### Fine tune model parameters

Bidirectional models seem to perform better. Let's test different hidden and layer sizes.

In [None]:
# Bidirectional-only sweep over 3 hidden sizes x 3 layer counts, now with dropout 0.3.
grid = tl.Grid({
    'epochs': [1000],  # we use early stopping, so this is just a high number
    'lr': [0.001],
    'model': [LSTMModel],
    'hidden_size': [10, 15, 20],
    'num_layers': [1, 2, 3],
    'bidirectional': [True],
    'n_splits': [6],
    'dropout': [0.3],
}) # val_mod is default at 8

# Fresh wrapper for this search; verbose is lowered from 4 to 3 compared with the earlier cells.
wrapper = tl.MIMOTSWrapper(LSTMModel(), seq_len=24, pred_len=3)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=3)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

(num_layers) - (hidden_sizes): (notes)
- 2 - 20: I see a lot of consistency
- 3 - 15: Not too consistent, but has good overall score
- 3 - 20: Seems somewhat consistent, has the best overall score

I'll do some quick learning rate testing.

In [None]:
# Learning-rate sweep for the '3-20' preset (3 layers, hidden size 20) with dropout 0.3.
grid = tl.Grid({
    'epochs': [1000],  # we use early stopping, so this is just a high number
    'lr': [0.001, 0.0005, 0.0001],
    'model': [LSTMParams],
    'param_group': ['3-20'],
    'n_splits': [6],
    'dropout': [0.3],
}) # val_mod is default at 8

# Build the wrapper in this cell, like the other LSTMParams searches do, so the
# cell does not depend on a wrapper object left behind by an earlier cell.
wrapper = tl.MIMOTSWrapper(LSTMParams(), seq_len=24, pred_len=3)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=3)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

A learning rate of 0.0001 is very promising, and it's also pretty consistent.

### Noise

In [None]:
# Noise sweep: two presets x input noise {0.0, 0.05} x hidden noise {0.0, 0.05},
# at the learning rate 0.0001 and dropout 0.3.
grid = tl.Grid({
    'epochs': [1000],  # we use early stopping, so this is just a high number
    'lr': [0.0001],
    'model': [LSTMParams],
    'param_group': ['2-20', '3-20'],
    'n_splits': [6],
    'dropout': [0.3],  # I'll test dropout next
    'in_noise': [0.0, 0.05],
    'hid_noise': [0.0, 0.05],
}) # val_mod is default at 8

# Wrapper built around a default LSTMParams (the '2-20' preset).
wrapper = tl.MIMOTSWrapper(LSTMParams(), seq_len=24, pred_len=3)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=3)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

The 3-20 configuration seems more consistent; input noise throws off the model too much at this point.
Time to test different dropouts.

### Dropouts

In [None]:
# Dropout sweep for the '3-20' preset: dropout {0.3, 0.5} x hidden noise {0.05, 0.1}.
# 'in_noise' is not listed, so the model's default of 0.0 applies.
grid = tl.Grid({
    'epochs': [1000],  # we use early stopping, so this is just a high number
    'lr': [0.0001],
    'model': [LSTMParams],
    'param_group': ['3-20'],
    'n_splits': [6],
    'dropout': [0.3, 0.5],
    'hid_noise': [0.05, 0.1],
}) # val_mod is default at 8

wrapper = tl.MIMOTSWrapper(LSTMParams(), seq_len=24, pred_len=3)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=3)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

It seems that 0.3 dropout and 0.05 noise were the best.

### Test of all features

It might be worth testing whether adding back the features that were filtered out during feature selection helps us in any way. This way we let the model decide which features are important.
If scores don't increase, I'll keep the feature selection.

In [None]:
# Load the dataset with the loader's second positional flag set to True
# (presumably this keeps the features removed by feature selection -- confirm in trainer_lib).
# Separate *_all names are used so the shared df / X / y are never overwritten:
# an interrupted run of this cell cannot leave later cells on the wrong data,
# and no reload is needed afterwards.
df_all: pd.DataFrame = tl.load_country_wide_dataset('../data/country_data.csv', True)

X_all = df_all.to_numpy(dtype=np.float32)
y_all = df_all['el_load'].to_numpy(dtype=np.float32)

# Best configuration found so far, with 18 input features instead of the default 11
# (assumes the all-features frame has 18 columns -- TODO confirm).
wrapper = tl.MIMOTSWrapper(LSTMModel(18, hidden_size=20, num_layers=3, bidirectional=True, dropout=0.3, hid_noise=0.05), seq_len=24, pred_len=3)
result = wrapper.validate_ts_strategy(X_all, y_all, 1000, lr=0.0001, n_splits=6)

# Mean of result[3] over all entries, then the mean with the first entry left out.
print(sum(result[3]) / len(result[3]), "-", sum(result[3][1:]) / (len(result[3]) - 1))
# Evaluate on the last seventh of the rows.
st = X_all.shape[0] // 7
tl.MIMOTSWrapper.print_evaluation_info(*wrapper.predict(X_all[-st:], y_all[-st:]))

This seems to be worse, so I'll keep the feature selection.

### Trying to speed up training.

In [None]:
# Speed-up experiment: vary the batch size for the '3-20' preset with dropout 0.3 and hidden noise 0.05.
# The learning rate is 0.001 here, whereas the noise and dropout sweeps used 0.0001.
# 'es_p' is presumably the early-stopping patience -- confirm in trainer_lib.
grid = tl.Grid({
    'epochs': [1000],  # we use early stopping, so this is just a high number
    'lr': [0.001],
    'model': [LSTMParams],
    'param_group': ['3-20'],
    'dropout': [0.3],
    'hid_noise': [0.05],
    'batch_size': [1024, 2048, 4096],
    'es_p': [20],
}) # n_splits defaulted to 6, val_mod to 8

wrapper = tl.MIMOTSWrapper(LSTMParams(), seq_len=24, pred_len=3)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

### Final

In [None]:
# Final configuration: '3-20' preset, dropout 0.3, hidden noise 0.05.
wrapper = tl.MIMOTSWrapper(LSTMParams('3-20', dropout=0.3, hid_noise=0.05), seq_len=24, pred_len=3)
# The third positional argument (1000) is presumably the epoch limit, as in the grids above -- confirm.
# `result` is read by the evaluation cell that follows.
result = wrapper.validate_ts_strategy(X, y, 1000, batch_size=2048, lr=0.001, n_splits=6, es_p=20)

In [None]:
# Mean of result[3] over all entries, then the mean with the first entry left out
# (result[3] is presumably the per-split score list -- confirm in trainer_lib).
print(sum(result[3]) / len(result[3]), "-", sum(result[3][1:]) / (len(result[3]) - 1))
# Evaluate predictions on the last seventh of the rows.
st = X.shape[0] // 7
tl.MIMOTSWrapper.print_evaluation_info(*wrapper.predict(X[-st:], y[-st:]))