In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import pretreat_data
import seaborn as sns

from utils import rolling_train_valid_split
from dataset import TimeSeriesDataset
from torch import nn, optim
from torch.utils.data import DataLoader
import torch

# Reload the local modules so that changes made to them take effect without restarting the Jupyter kernel
from importlib import reload
import models
import trainer
import transformer_model
reload(models)
reload(trainer)
reload(transformer_model)

from trainer import train, validate, test
from transformer_model import TransformerForecaster, DecoderForecaster
from models import RNN_model, LSTM_model

# Run on the first CUDA GPU when one is available, otherwise on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Plot styling shared by the figures in this notebook
sns.set_style("darkgrid")
colors = sns.color_palette("colorblind6")

# Path to the data
train_path = "../data/ClimateTrain.csv"
test_path = "../data/ClimateTest.csv"

# Read as pandas dataframe and transform dates to datetime
train_df_raw = pd.read_csv(train_path)
train_df_raw['date'] = pd.to_datetime(train_df_raw['date'])

test_df_raw = pd.read_csv(test_path)
test_df_raw['date'] = pd.to_datetime(test_df_raw['date'])

# Show summary statistics for both splits. A notebook cell only renders its last
# bare expression, so both tables are handed to display() explicitly; otherwise
# the train description would be computed and silently discarded.
display(train_df_raw.describe(), test_df_raw.describe())

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
count,114,114.0,114.0,114.0,114.0
mean,2017-02-26 12:00:00,21.713079,56.258362,8.143924,1004.03509
min,2017-01-01 00:00:00,11.0,17.75,1.3875,59.0
25%,2017-01-29 06:00:00,16.437198,39.625,5.563542,1007.4375
50%,2017-02-26 12:00:00,19.875,57.75,8.069444,1012.739316
75%,2017-03-26 18:00:00,27.705357,71.902778,10.06875,1016.739583
max,2017-04-24 00:00:00,34.5,95.833333,19.314286,1022.809524
std,,6.360072,19.068083,3.588049,89.474692


## Pretreat the data

In [2]:
# Pre-process the raw train/test frames with the project helper.
# `norm_stats` maps each variable name to its {'mean', 'std'}; it is used later
# to map normalised humidity values back to their original scale.
# NOTE(review): the statistics are presumably computed on the training split only —
# confirm in utils.pretreat_data.
train_df, test_df, norm_stats = pretreat_data(train_df_raw, test_df_raw)

Pre-treating data...
{'meantemp': {'mean': 25.495520655761762, 'std': 7.348102725432476}, 'humidity': {'mean': 60.749546508715525, 'std': 16.79073354358986}, 'wind_speed': {'mean': 6.634903591220332, 'std': 4.24227584263327}, 'meanpressure': {'mean': 1008.2364359217448, 'std': 7.4472597886225085}}


# Metrics

In [4]:
def RMSE(gt, pred):
    """Return the root-mean-square error between `gt` and `pred`.

    Both arguments must support element-wise subtraction and `.mean()`
    (e.g. NumPy arrays of the same shape).
    """
    squared_error = (pred - gt) ** 2
    mean_squared_error = squared_error.mean()
    return np.sqrt(mean_squared_error)

def SMAPE(gt, pred):
    """Return the symmetric mean absolute percentage error as a fraction.

    Each element contributes |pred - gt| / ((|gt| + |pred|) / 2), and the
    contributions are averaged. The result is a fraction, not a percentage.

    Positions where `gt` and `pred` are both zero contribute 0 (a perfect
    prediction) instead of the NaN that a literal 0/0 would produce and
    propagate into the mean.
    """
    abs_error = np.abs(pred - gt)
    scale = (np.abs(gt) + np.abs(pred)) / 2
    # scale == 0 implies gt == pred == 0, hence abs_error == 0 as well;
    # dividing by 1 there yields the desired contribution of 0.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return (abs_error / safe_scale).mean()

def form_testdata(window_size, horizon, train_frame=None, test_frame=None):
    """Prepend the tail of the training data to the test data.

    The returned frame starts with the training rows dated strictly after
    `test start - (window_size + horizon) days` and strictly before the test
    start, followed by every test row, so that windows can be formed for the
    earliest test dates.

    Args:
        window_size: Number of days in one input window.
        horizon: Forecast horizon in days.
        train_frame: Frame with a datetime `date` column; defaults to the
            notebook-level `train_df`.
        test_frame: Frame with a datetime `date` column whose first row is the
            test start; defaults to the notebook-level `test_df`.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # Fall back to the notebook-level frames so existing two-argument calls keep working.
    if train_frame is None:
        train_frame = train_df
    if test_frame is None:
        test_frame = test_df

    # Positional lookup: `.loc[0]` would fail whenever the index does not contain label 0.
    test_start_date = test_frame["date"].iloc[0]
    context_start = test_start_date - pd.Timedelta(days=window_size + horizon)

    train_dates = train_frame["date"]
    in_context = (train_dates > context_start) & (train_dates < test_start_date)
    return pd.concat([train_frame[in_context], test_frame], ignore_index=True)


# 1.1 RNN and LSTM models

(The Transformer models are covered in section 1.2.)

## Define the parameter space

In [None]:
from itertools import product
import logging
import os

# RNN & LSTM
# Candidate values for every hyperparameter that is varied in the study
hyperparameter_space = {
    "window_size": [5, 10, 20, 30, 180],
    "hidden_size": [8, 32, 64, 128, 256],
    "embed_dim": [8, 32, 64, 128, 256],
    "n_layers": [1, 2, 4, 8],
    "dropout_p": [0.2, 0.4, 0.6, 0.8],
    "date_as_var": [True, False]
}

# Set a baseline configuration
baseline = {
    "window_size": 20,
    "hidden_size": 128,
    "embed_dim": 128,
    "n_layers": 2,
    "dropout_p": 0.2,
    "output_size": 1,
    "date_as_var": True
}

# Generate all combinations of hyperparameters (full grid).
# The dict preserves insertion order, so each tuple follows the key order above.
configs = list(product(*hyperparameter_space.values()))

# Generate configurations for ablation study: vary one hyperparameter at a time
# while every other setting stays at its baseline value.
ablation_configs = [
    {**baseline, param: value}
    for param, values in hyperparameter_space.items()
    for value in values
]

print("Generated configurations for ablation:")
for config in ablation_configs[:5]:
    print(config)

# Define model
model_type = "LSTM"

# Set Horizon
horizon = 1
batch_size = 128
n_epochs = 100
lr = 0.0001

# Configure logging
os.makedirs("./logs", exist_ok=True)
logging.basicConfig(
    filename=f"./logs/training_{model_type}.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    # basicConfig() does nothing when the root logger already has handlers
    # (e.g. it was configured earlier in the same kernel); force=True replaces
    # them so the messages really end up in this model's log file.
    force=True,
)

Generated configurations for ablation:
{'window_size': 5, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
{'window_size': 10, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
{'window_size': 20, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
{'window_size': 30, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
{'window_size': 180, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}


## Run the ablation study

In [None]:

for idx, config in enumerate(ablation_configs):
    torch.manual_seed(42)
    print(config)
    window_size = config["window_size"]
    hidden_size = config["hidden_size"]
    embed_dim = config["embed_dim"]
    n_layers = config["n_layers"]
    dropout_p = config["dropout_p"] 
    output_size = config["output_size"]
    date_as_var = config["date_as_var"]
    
    # Log the start of the experiment with hyperparameter details
    logging.info(
        f"Starting Experiment {idx + 1}/{len(ablation_configs)}\n"
        f"Hyperparameters:\n"
        f"  window_size={window_size}, hidden_size={hidden_size}, embed_dim={embed_dim},"
        f"  n_layers={n_layers}, dropout_p={dropout_p}, output_size={output_size},"
        f"  date_as_var={date_as_var}"
    )

    if date_as_var:
        input_size = 6
    else:
        input_size = 4

    match model_type:
        case "RNN":
            model = RNN_model(hidden_size=hidden_size, embed_dim = embed_dim, n_layers=n_layers, input_size=input_size, out_features=output_size, dropout_p=dropout_p)
        case "LSTM":
            model = LSTM_model(hidden_size=hidden_size, embed_dim = embed_dim, n_layers=n_layers, input_size=input_size, out_features=output_size, dropout_p=dropout_p)
        case "Transformer":
            model = model = TransformerForecaster(hidden_size=hidden_size, embed_dim = embed_dim, 
                                encoder_heads=4, encoder_depth=4,
                                decoder_heads=4, decoder_depth=4, 
                                input_size=input_size, out_features=output_size, dropout_p=dropout_p)
        case "Decoder":
            model = DecoderForecaster(hidden_size=hidden_size, embed_dim = embed_dim, 
                            decoder_heads=2, decoder_depth=1, 
                            input_size=input_size, out_features=output_size, dropout_p=dropout_p)
        case _:
                "Something's wrong with the model_type"

    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr, weight_decay=0.03)

        # Generate rolling train-validation splits
    loss_history = {0: {"train": [], "valid": []}, 
                    1: {"train": [], "valid": []}, 
                    2: {"train": [], "valid": []},
                    3: {"train": []}}
    for i, (train_data, valid_data) in enumerate(rolling_train_valid_split(train_df, months=6, window_size=window_size, horizon=horizon)):
        # print("Training fold ", i)
        train_dataset = TimeSeriesDataset(df=train_data, window_size=window_size, horizon=horizon, date_as_var=date_as_var)
        if valid_data is not None:
            valid_dataset = TimeSeriesDataset(df=valid_data, window_size=window_size, horizon=horizon, date_as_var=date_as_var)
        
        # Initialize the dataloaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

        avg_val_loss = 0
        for epoch in range(n_epochs):
            avg_train_loss = train(model, train_loader, criterion, optimizer, device, model_type)
            
            loss_history[i]["train"].append(avg_train_loss)
            
            if valid_data is not None:
                avg_val_loss = validate(model, valid_loader, criterion, device, model_type)
                loss_history[i]["valid"].append(avg_val_loss)
            # print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")


    # Create the test dataset
    test_df_altered = form_testdata(window_size, horizon)
    test_dataset = TimeSeriesDataset(df=test_df_altered, window_size=window_size, horizon=horizon, date_as_var=date_as_var)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Test the model
    gt, pred = test(model, test_dataset, device, model_type)

    gt = gt*norm_stats["humidity"]["std"] + norm_stats["humidity"]["mean"]
    pred = pred*norm_stats["humidity"]["std"] + norm_stats["humidity"]["mean"]

    mean_rmse = RMSE(gt, pred)
    mean_smape = SMAPE(gt, pred)
    logging.info(f"RMSE, SMAPE: {mean_rmse:.2f} & {mean_smape:.3f}")
    
    print(f"RMSE: {mean_rmse:.2f}")
    print(f"SMAPE: {mean_smape:.3f}")

{'window_size': 5, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 13.77
SMAPE: 0.228
{'window_size': 10, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 12.06
SMAPE: 0.201
{'window_size': 20, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 12.16
SMAPE: 0.203
{'window_size': 30, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 12.48
SMAPE: 0.208
{'window_size': 180, 'hidden_size': 128, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 20.00
SMAPE: 0.306
{'window_size': 20, 'hidden_size': 8, 'embed_dim': 128, 'n_layers': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}


KeyboardInterrupt: 

# 1.2 Transformer models

In [5]:
from itertools import product
import logging
import os

# Transformer & Decoder
# Candidate values for every hyperparameter that is varied in the study
hyperparameter_space = {
    "window_size": [5, 10, 20, 30],
    "hidden_size": [8, 32, 64, 128, 256],
    "embed_dim":   [8, 32, 64, 128, 256],
    "encoder_heads": [1, 2, 4, 8],
    "encoder_depth": [1, 2, 4, 8],
    "decoder_heads": [1, 2, 4, 8],
    "decoder_depth": [1, 2, 4, 8],
    "dropout_p": [0.2, 0.4, 0.6, 0.8],
    "date_as_var": [True, False]
}

# Set a baseline configuration
baseline = {
    "window_size": 20,
    "hidden_size": 128,
    "embed_dim": 128,
    "encoder_heads": 4,
    "encoder_depth": 2,
    "decoder_heads": 4,
    "decoder_depth": 2,
    "dropout_p": 0.2,
    "output_size": 1,
    "date_as_var": True
}

# Generate all combinations of hyperparameters (full grid).
# The dict preserves insertion order, so each tuple follows the key order above.
configs = list(product(*hyperparameter_space.values()))

# Generate configurations for ablation study: vary one hyperparameter at a time
# while every other setting stays at its baseline value.
ablation_configs = [
    {**baseline, param: value}
    for param, values in hyperparameter_space.items()
    for value in values
]

print("Generated configurations for ablation:")
for config in ablation_configs[:5]:
    print(config)

# Define model
model_type = "Decoder"

# Set Horizon
horizon = 1
batch_size = 128
n_epochs = 100
lr = 0.0001

# Configure logging
os.makedirs("./logs", exist_ok=True)
logging.basicConfig(
    filename=f"./logs/training_{model_type}.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    # basicConfig() does nothing when the root logger already has handlers
    # (e.g. logging was configured for another model earlier in the same
    # kernel); force=True replaces them so the messages really end up in this
    # model's log file.
    force=True,
)

Generated configurations for ablation:
{'window_size': 5, 'hidden_size': 128, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
{'window_size': 10, 'hidden_size': 128, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
{'window_size': 20, 'hidden_size': 128, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
{'window_size': 30, 'hidden_size': 128, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
{'window_size': 20, 'hidden_size': 8, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_v

In [7]:

for idx, config in enumerate(ablation_configs):
    torch.manual_seed(42)
    print(config)
    window_size = config["window_size"]
    hidden_size = config["hidden_size"]
    embed_dim = config["embed_dim"]
    encoder_heads = config["encoder_heads"]
    encoder_depth = config["encoder_depth"]
    decoder_heads = config["decoder_heads"]
    decoder_depth = config["decoder_depth"]
    dropout_p = config["dropout_p"] 
    output_size = config["output_size"]
    date_as_var = config["date_as_var"]
    
    # Log the start of the experiment with hyperparameter details
    logging.info(
        f"Starting Experiment {idx + 1}/{len(ablation_configs)}\n"
        f"Hyperparameters:\n"
        f"  window_size={window_size}, hidden_size={hidden_size}, embed_dim={embed_dim},"
        f"  encoder_heads={encoder_heads}, encoder_depth={encoder_depth}, "
        f"  decoder_heads={decoder_heads}, decoder_depth={decoder_depth}, "
        f"  dropout_p={dropout_p}, output_size={output_size},"
        f"  date_as_var={date_as_var}"
    )

    if date_as_var:
        input_size = 6
    else:
        input_size = 4

    match model_type:
        case "RNN":
            model = RNN_model(hidden_size=hidden_size, embed_dim = embed_dim, n_layers=n_layers, input_size=input_size, out_features=output_size, dropout_p=dropout_p)
        case "LSTM":
            model = LSTM_model(hidden_size=hidden_size, embed_dim = embed_dim, n_layers=n_layers, input_size=input_size, out_features=output_size, dropout_p=dropout_p)
        case "Transformer":
            model = model = TransformerForecaster(hidden_size=hidden_size, embed_dim = embed_dim, 
                                encoder_heads=encoder_heads, encoder_depth=encoder_depth,
                                decoder_heads=decoder_heads, decoder_depth=decoder_depth, 
                                input_size=input_size, out_features=output_size, dropout_p=dropout_p)
        case "Decoder":
            model = DecoderForecaster(hidden_size=hidden_size, embed_dim = embed_dim, 
                            decoder_heads=decoder_heads, decoder_depth=decoder_depth, 
                            input_size=input_size, out_features=output_size, dropout_p=dropout_p)
        case _:
                "Something's wrong with the model_type"

    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr, weight_decay=0.03)

        # Generate rolling train-validation splits
    loss_history = {0: {"train": [], "valid": []}, 
                    1: {"train": [], "valid": []}, 
                    2: {"train": [], "valid": []},
                    3: {"train": []}}
    for i, (train_data, valid_data) in enumerate(rolling_train_valid_split(train_df, months=6, window_size=window_size, horizon=horizon)):
        # print("Training fold ", i)
        train_dataset = TimeSeriesDataset(df=train_data, window_size=window_size, horizon=horizon, date_as_var=date_as_var)
        if valid_data is not None:
            valid_dataset = TimeSeriesDataset(df=valid_data, window_size=window_size, horizon=horizon, date_as_var=date_as_var)
        
        # Initialize the dataloaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

        avg_val_loss = 0
        for epoch in range(n_epochs):
            avg_train_loss = train(model, train_loader, criterion, optimizer, device, model_type)
            
            loss_history[i]["train"].append(avg_train_loss)
            
            if valid_data is not None:
                avg_val_loss = validate(model, valid_loader, criterion, device, model_type)
                loss_history[i]["valid"].append(avg_val_loss)
            # print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")


    # Create the test dataset
    test_df_altered = form_testdata(window_size, horizon)
    test_dataset = TimeSeriesDataset(df=test_df_altered, window_size=window_size, horizon=horizon, date_as_var=date_as_var)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Test the model
    gt, pred = test(model, test_dataset, device, model_type)

    gt = gt*norm_stats["humidity"]["std"] + norm_stats["humidity"]["mean"]
    pred = pred*norm_stats["humidity"]["std"] + norm_stats["humidity"]["mean"]

    mean_rmse = RMSE(gt, pred)
    mean_smape = SMAPE(gt, pred)
    logging.info(f"RMSE, SMAPE: {mean_rmse:.2f} & {mean_smape:.3f}")
    
    print(f"RMSE: {mean_rmse:.2f}")
    print(f"SMAPE: {mean_smape:.3f}")

{'window_size': 5, 'hidden_size': 128, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 7.29
SMAPE: 0.119
{'window_size': 10, 'hidden_size': 128, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 7.24
SMAPE: 0.120
{'window_size': 20, 'hidden_size': 128, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 7.04
SMAPE: 0.114
{'window_size': 30, 'hidden_size': 128, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_depth': 2, 'dropout_p': 0.2, 'output_size': 1, 'date_as_var': True}
RMSE: 7.07
SMAPE: 0.113
{'window_size': 20, 'hidden_size': 8, 'embed_dim': 128, 'encoder_heads': 4, 'encoder_depth': 2, 'decoder_heads': 4, 'decoder_