### Imports

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
import trainer_lib as tl
import torch_model_definitions as tmd

### Load data

In [2]:
# Seed every RNG imported by this notebook before any model is built, so that
# Restart & Run All is as repeatable as possible (weight init, shuffling, noise).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Only data up to the end of 2019 is loaded (`until=`): this limits the amount of
# data so the searches run faster, and models generally performed best on this
# part of the dataset in my TDK.
df: pd.DataFrame = tl.load_country_wide_dataset('../data/country_data.csv', until='2019-12-31 23:00:00')

# Features: every column of the frame (this includes 'el_load' itself), as float32.
X = df.to_numpy(dtype=np.float32)
# Target: the 'el_load' column, as float32.
y = df['el_load'].to_numpy(dtype=np.float32)

# Grid search

### Seq2seq

In [14]:
# Best-performing Seq2seq setup from my TDK (found there with seq_len 24 and pred_len 3).
# It is reused here with pred_len 12 and not grid searched any further, as it seems to perform well.
# NOTE(review): the saved output of this cell reports 'epochs': 1000, so it appears to come from
# an earlier version of the cell - re-run to refresh it.
param_space = {
    'epochs': [2000],  # just a high cap, early stopping is in use
    'lr': [0.001],
    'model': [tmd.Seq2seq],
    'embedding_size': [10],
    'num_layers': [1],
    'bidirectional': [True],
    'dropout': [0.5],
    'out_noise': [0.05],
    'batch_size': [2048],
    'pred_len': [12],
    'es_p': [20],
}
grid = tl.Grid(param_space)  # n_splits defaulted to 2, val_mod to 8

wrapper = tl.S2STSWrapper(
    tmd.Seq2seq(),
    seq_len=24,
    pred_len=12,
    teacher_forcing_decay=0.01,
)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

[Grid search 001] BEGIN - params: {'epochs': 1000, 'lr': 0.001, 'model': <class 'torch_model_definitions.Seq2seq'>, 'embedding_size': 10, 'num_layers': 1, 'bidirectional': True, 'dropout': 0.5, 'out_noise': 0.05, 'batch_size': 2048, 'pred_len': 12, 'es_p': 20}
[Fold 2] BEGIN
Early stopping... Epoch 449: train loss: 0.035275, val loss: 0.059079, test loss: 0.074390
[Fold 2] END - RMSE loss: 167.899 - Time: 6.1 min.
[Grid search 001] END - Score: 167.89887791 * 

Best params: {'epochs': 1000, 'lr': 0.001, 'model': <class 'torch_model_definitions.Seq2seq'>, 'embedding_size': 10, 'num_layers': 1, 'bidirectional': True, 'dropout': 0.5, 'out_noise': 0.05, 'batch_size': 2048, 'pred_len': 12, 'es_p': 20}
Best score: 167.89887790907062


### AttentionSeq2seq

In [4]:
# Search over embedding size and directionality for the attention model (3 x 2 combinations).
param_space = {
    'epochs': [2000],  # just a high cap, early stopping is in use
    'lr': [0.002],
    'model': [tmd.AttentionSeq2seq],
    'embedding_size': [8, 10, 12],
    'bidirectional': [False, True],
    # Both regularisers are switched off: the model wasn't showing major signs of overfitting
    'dropout': [0.0],
    'out_noise': [0.00],
    'batch_size': [2048],
    'pred_len': [12],
    'es_p': [20],
}
grid = tl.Grid(param_space)  # n_splits defaulted to 2, val_mod to 8

wrapper = tl.S2STSWrapper(
    tmd.AttentionSeq2seq(),
    seq_len=24,
    pred_len=12,
    teacher_forcing_decay=0.01,
)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

[Grid search 001] BEGIN - params: {'epochs': 2000, 'lr': 0.002, 'model': <class 'torch_model_definitions.AttentionSeq2seq'>, 'embedding_size': 8, 'bidirectional': False, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 20}
[Fold 2] BEGIN
Early stopping... Epoch 1148: train loss: 0.029809, val loss: 0.052003, test loss: 0.072072
[Fold 2] END - RMSE loss: 157.203 - Time: 13.2 min.
[Grid search 001] END - Score: 157.20342862 * 
[Grid search 002] BEGIN - params: {'epochs': 2000, 'lr': 0.002, 'model': <class 'torch_model_definitions.AttentionSeq2seq'>, 'embedding_size': 10, 'bidirectional': False, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 20}
[Fold 2] BEGIN
Early stopping... Epoch 302: train loss: 0.036287, val loss: 0.064952, test loss: 0.081563
[Fold 2] END - RMSE loss: 176.641 - Time: 3.5 min.
[Grid search 002] END - Score: 176.64097813 
[Grid search 003] BEGIN - params: {'epochs': 2000, 'lr': 0.002, 'model': <class 'torch_

### Attention and positional encoding seq2seq

In [5]:
# Search over embedding size and directionality for the attention model
# with positional encoding (3 x 2 combinations).
param_space = {
    'epochs': [2000],  # just a high cap, early stopping is in use
    'lr': [0.001],
    'model': [tmd.PosAttSeq2seq],
    'embedding_size': [10, 12, 14],
    'bidirectional': [False, True],
    'dropout': [0.0],
    'out_noise': [0.0],
    'batch_size': [2048],
    'pred_len': [12],
    'es_p': [15],
}
grid = tl.Grid(param_space)  # n_splits defaulted to 2, val_mod to 8

wrapper = tl.S2STSWrapper(
    tmd.PosAttSeq2seq(),
    seq_len=24,
    pred_len=12,
    teacher_forcing_decay=0.01,
)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

[Grid search 001] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torch_model_definitions.PosAttSeq2seq'>, 'embedding_size': 10, 'bidirectional': False, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 15}
[Fold 2] BEGIN
Early stopping... Epoch 1081: train loss: 0.028514, val loss: 0.046966, test loss: 0.060109
[Fold 2] END - RMSE loss: 154.056 - Time: 12.0 min.
[Grid search 001] END - Score: 154.05562231 * 
[Grid search 002] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torch_model_definitions.PosAttSeq2seq'>, 'embedding_size': 12, 'bidirectional': False, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 15}
[Fold 2] BEGIN
Early stopping... Epoch 1143: train loss: 0.022216, val loss: 0.043605, test loss: 0.058386
[Fold 2] END - RMSE loss: 156.615 - Time: 12.9 min.
[Grid search 002] END - Score: 156.61451855 
[Grid search 003] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torch_mod

### Direct comparison between PE and non-PE

In [6]:
# Head-to-head run: attention with and without positional encoding, everything else
# held fixed except directionality (2 models x 2 directions).
param_space = {
    'epochs': [2000],  # just a high cap, early stopping is in use
    'lr': [0.001],
    'model': [tmd.AttentionSeq2seq, tmd.PosAttSeq2seq],
    'embedding_size': [10],
    'bidirectional': [False, True],
    'dropout': [0.0],
    'out_noise': [0.0],
    'batch_size': [2048],
    'pred_len': [12],
    'es_p': [15],
}
grid = tl.Grid(param_space)  # n_splits defaulted to 2, val_mod to 8

wrapper = tl.S2STSWrapper(
    tmd.AttentionSeq2seq(),
    seq_len=24,
    pred_len=12,
    teacher_forcing_decay=0.01,
)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

[Grid search 001] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torch_model_definitions.AttentionSeq2seq'>, 'embedding_size': 10, 'bidirectional': False, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 15}
[Fold 2] BEGIN
Early stopping... Epoch 1123: train loss: 0.025884, val loss: 0.052400, test loss: 0.058205
[Fold 2] END - RMSE loss: 157.440 - Time: 12.0 min.
[Grid search 001] END - Score: 157.43972061 * 
[Grid search 002] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torch_model_definitions.PosAttSeq2seq'>, 'embedding_size': 10, 'bidirectional': False, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 15}
[Fold 2] BEGIN
Early stopping... Epoch 1956: train loss: 0.020990, val loss: 0.043597, test loss: 0.049572
[Fold 2] END - RMSE loss: 151.122 - Time: 21.8 min.
[Grid search 002] END - Score: 151.12160098 * 
[Grid search 003] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torc

Bidirectional models perform better in this case too, but the difference for the regular attention model is small.
Positional encoding might not be required because the input already has time components, but further testing is required.

### Longer sequence length

In [4]:
# Same kind of search with the input window doubled to seq_len=48:
# plain Seq2seq vs. PosAttSeq2seq, two embedding sizes, bidirectional only.
param_space = {
    'epochs': [2000],  # just a high cap, early stopping is in use
    'lr': [0.001],
    'model': [tmd.Seq2seq, tmd.PosAttSeq2seq],
    'embedding_size': [10, 12],
    'bidirectional': [True],
    'dropout': [0.0],
    'out_noise': [0.0],
    'batch_size': [2048],
    'pred_len': [12],
    'es_p': [15],
}
grid = tl.Grid(param_space)  # n_splits defaulted to 2, val_mod to 8

wrapper = tl.S2STSWrapper(
    tmd.Seq2seq(),
    seq_len=48,
    pred_len=12,
    teacher_forcing_decay=0.01,
)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

[Grid search 001] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torch_model_definitions.Seq2seq'>, 'embedding_size': 10, 'bidirectional': True, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 15}
[Fold 2] BEGIN
Early stopping... Epoch 617: train loss: 0.028289, val loss: 0.052054, test loss: 0.088466
[Fold 2] END - RMSE loss: 176.173 - Time: 7.8 min.
[Grid search 001] END - Score: 176.17316588 * 
[Grid search 002] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torch_model_definitions.PosAttSeq2seq'>, 'embedding_size': 10, 'bidirectional': True, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 15}
[Fold 2] BEGIN
Early stopping... Epoch 429: train loss: 0.032189, val loss: 0.060340, test loss: 0.075414
[Fold 2] END - RMSE loss: 175.832 - Time: 5.9 min.
[Grid search 002] END - Score: 175.83212319 * 
[Grid search 003] BEGIN - params: {'epochs': 2000, 'lr': 0.001, 'model': <class 'torch_model_definit

This particular run produced interesting results, but a higher embedding size seems to help with longer sequence lengths.

In [12]:
# Longer-sequence run (seq_len=48) for the plain attention model, comparing two embedding sizes.
grid = tl.Grid({
    'epochs': [2000],  # we use early stopping, so this is just a high number
    'lr': [0.002],
    'model': [tmd.AttentionSeq2seq],
    'embedding_size': [10, 12],
    'bidirectional': [True],
    'dropout': [0.0],
    'out_noise': [0.0],
    'batch_size': [2048],
    'pred_len': [12],
    'es_p': [15]
}) # n_splits defaulted to 2, val_mod to 8

# The wrapper is built with the same architecture the grid searches (this was tmd.Seq2seq()),
# matching how the seq_len=24 AttentionSeq2seq search constructs its wrapper.
# NOTE(review): the logged params suggest grid_search uses the grid's 'model' entry for every
# run, so the instance passed here looks like a placeholder - confirm in trainer_lib.
wrapper = tl.S2STSWrapper(tmd.AttentionSeq2seq(), seq_len=48, pred_len=12, teacher_forcing_decay=0.01)
b_p, b_s = wrapper.grid_search(X, y, grid, verbose=4)
print(f"\nBest params: {b_p}\nBest score: {b_s}")

[Grid search 001] BEGIN - params: {'epochs': 2000, 'lr': 0.002, 'model': <class 'torch_model_definitions.AttentionSeq2seq'>, 'embedding_size': 10, 'bidirectional': True, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 15}
[Fold 2] BEGIN
Early stopping... Epoch 166: train loss: 0.038162, val loss: 0.069109, test loss: 0.076788
[Fold 2] END - RMSE loss: 184.082 - Time: 2.5 min.
[Grid search 001] END - Score: 184.08244799 * 
[Grid search 002] BEGIN - params: {'epochs': 2000, 'lr': 0.002, 'model': <class 'torch_model_definitions.AttentionSeq2seq'>, 'embedding_size': 12, 'bidirectional': True, 'dropout': 0.0, 'out_noise': 0.0, 'batch_size': 2048, 'pred_len': 12, 'es_p': 15}
[Fold 2] BEGIN
Early stopping... Epoch 285: train loss: 0.027887, val loss: 0.054195, test loss: 0.079248
[Fold 2] END - RMSE loss: 165.895 - Time: 4.7 min.
[Grid search 002] END - Score: 165.89517469 * 

Best params: {'epochs': 2000, 'lr': 0.002, 'model': <class 'torch_model_definitions.Att