In [None]:
import torch
import pandas as pd
import numpy as np
from transformer import VanillaTimeSeriesTransformer
from utils import Trainer, preprocess_data, seed_everything
from torch.optim import Adam
from torch.nn import MSELoss, L1Loss
from torch.optim.lr_scheduler import ReduceLROnPlateau
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll import scope
import pickle
from collections import OrderedDict
import torch.nn as nn
import time

# Project helper imported from utils, called with its defaults before any model is
# created (presumably seeds the random number generators -- confirm in utils).
seed_everything()

# Device string used by every cell when CUDA is available. GPU index 3 is preferred;
# on hosts exposing fewer than four GPUs fall back to the first one so that
# torch.device(cuda_) never names a device that does not exist. Cells still fall back
# to 'cpu' themselves when torch.cuda.is_available() is False.
cuda_ = 'cuda:3' if torch.cuda.device_count() > 3 else 'cuda:0'

# Vanilla Transformer (Next Step Prediction | Encoder & Decoder)

### Hyperparameter Tuning (Structural Tuning Too) for 2 Features

In [None]:
# Window sizes: 36 input steps are used to predict a single output step.
input_seq_len_, output_seq_len_ = 36, 1

# Only the "close" column is kept (as a one-column DataFrame) for this tuning run.
df = pd.read_csv("../data/Final_data_hourly.csv")[["close"]]

(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
    output_as_seq=False,
)

def objective(space):
    """Train one sampled transformer configuration and report its validation loss.

    Intended as the ``fn`` callback of ``hyperopt.fmin``. Relies on notebook-level
    globals defined before it is called: ``scaled_df``, ``scaler_close``, ``cuda_``
    and the train/val/test dataloaders.

    Args:
        space: dict sampled from the search space, providing the keys
            ``num_heads``, ``d_model_by_num_heads``, ``num_layers``, ``dff``,
            ``mlp_size``, ``dropout_rate``, ``mlp_dropout_rate`` and ``lr``.

    Returns:
        dict with ``loss`` (lowest validation loss recorded during training) and
        ``status`` (``STATUS_OK``), the result format hyperopt expects.
    """
    # Whole-second timestamp that is embedded in this trial's checkpoint file name.
    start_time = int(time.time())

    device = torch.device(cuda_ if torch.cuda.is_available() else 'cpu')
    num_heads = space['num_heads']
    # d_model is sampled per head and scaled up, so it is always a multiple of num_heads.
    d_model = space['d_model_by_num_heads'] * num_heads

    print(f"""
    device: {device},
    num_heads:  {num_heads},
    d_model: {d_model},
    num_layers: {space['num_layers']},
    dff: {space['dff']},
    mlp_size: {space['mlp_size']},
    dropout_rate: {space['dropout_rate']},
    mlp_dropout_rate: {space['mlp_dropout_rate']}""")

    params = {
        'num_features': int(len(scaled_df.columns)),
        'd_model': d_model,
        'num_layers': space['num_layers'],
        'dff': space['dff'],
        'dropout_rate': space['dropout_rate'],
        'mlp_size': space['mlp_size'],
        'mlp_dropout_rate': space['mlp_dropout_rate'],
        "num_heads": num_heads}
    model = VanillaTimeSeriesTransformer(**params)
    model = model.to(device)

    optimiser = Adam(model.parameters(), lr=space['lr'])
    # Multiply the learning rate by 0.9 after 5 epochs without improvement of the monitored value.
    scheduler = ReduceLROnPlateau(optimiser, 'min', factor=0.9, patience=5)
    criterion = MSELoss()

    model_trainer = Trainer(model=model,
                train_dataloader=train_dataloader,
                val_dataloader=val_dataloader,
                test_dataloader=test_dataloader,
                criterion=criterion,
                optimiser=optimiser,
                scheduler=scheduler,
                device=device,
                num_epochs=50,
                early_stopping_patience_limit=10,
                is_save_model=True,
                scaler=scaler_close,
                file_path = f"models/best_model_{start_time}.pt")
    train_losses, val_losses = model_trainer.train_loop()

    # Score the trial by its best epoch instead of the last one: with early stopping
    # the final entry can come several epochs after the best validation loss, which
    # would under-rate the configuration.
    # Assumes val_losses holds one comparable value per epoch -- TODO confirm against
    # Trainer.train_loop.
    return {'loss': min(val_losses), 'status': STATUS_OK}


# Search space handed to hyperopt. Integer-valued dimensions are sampled with
# hp.quniform and cast to int through scope.int.
space = {
    'num_layers': scope.int(hp.quniform('num_layers', 1, 8, 1)),
    'num_heads': scope.int(hp.quniform('num_heads', 1, 8, 1)),
    'd_model_by_num_heads': scope.int(hp.quniform('d_model_by_num_heads', 32, 64, 2)),
    # hp.quniform yields round(uniform(low, high) / q) * q, so a lower bound below
    # q / 2 can be rounded to 0. Starting the range at q keeps dff at 50 or above.
    'dff': scope.int(hp.quniform('dff', 50, 2048, 50)),
    'mlp_size': scope.int(hp.quniform('mlp_size', 32, 64, 2)),
    'dropout_rate': hp.uniform('dropout_rate', 0.1, 0.3),
    'mlp_dropout_rate': hp.uniform('mlp_dropout_rate', 0.1, 0.3),
    # Learning rate is sampled log-uniformly between 1e-4 and 1e-1.
    'lr': hp.loguniform('lr', np.log(0.0001), np.log(0.1))
}

# Run the optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print(best)

# Persist the best setting found so that later cells can reload it.
with open('../data/stats_on_hyperparam_for_two_cols_vanilla_transformer_hourly_encoder_decoder.pkl', 'wb') as file:
    pickle.dump(best, file)



### Finding Top Features

In [None]:
# Score every non-"close" column by training a model on the pair (close, column)
# with the previously tuned hyperparameters and recording its test metrics.
df = pd.read_csv("../data/Final_data_hourly.csv")
# NOTE(review): pickle.load can execute code embedded in the file; only load pickles
# produced by this notebook.
with open('../data/stats_on_hyperparam_for_two_cols_vanilla_transformer_hourly_encoder_decoder.pkl', 'rb') as file:
    best = pickle.load(file)

input_seq_len_, output_seq_len_ = 36, 1
device = torch.device(cuda_ if torch.cuda.is_available() else 'cpu')
stats = {}

for cols in df.columns:
    # "close" is present in every pair, so it is never a candidate itself.
    if cols == "close":
        continue
    print(cols)

    temp_df = df[["close", cols]]
    (
        scaled_df,
        scaler_general,
        scaler_close,
        train_dataloader,
        val_dataloader,
        test_dataloader,
    ) = preprocess_data(
        temp_df,
        batch_size=256,
        input_seq_len=input_seq_len_,
        output_seq_len=output_seq_len_,
        output_as_seq=False,
    )

    # Values loaded from the pickle are cast back to int where the model needs integers.
    params = dict(
        num_features=int(len(scaled_df.columns)),
        d_model=int(best['d_model_by_num_heads']) * int(best["num_heads"]),
        num_layers=int(best["num_layers"]),
        dff=int(best['dff']),
        dropout_rate=round(best['dropout_rate'], 3),
        mlp_size=int(best['mlp_size']),
        mlp_dropout_rate=round(best['mlp_dropout_rate'], 3),
        num_heads=int(best["num_heads"]),
    )

    model = VanillaTimeSeriesTransformer(**params).to(device)

    optimiser = Adam(model.parameters(), lr=best['lr'])
    scheduler = ReduceLROnPlateau(optimiser, 'min', factor=0.9, patience=5)
    criterion = MSELoss()

    # Every candidate writes to the same checkpoint path.
    model_trainer = Trainer(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        test_dataloader=test_dataloader,
        criterion=criterion,
        optimiser=optimiser,
        scheduler=scheduler,
        device=device,
        num_epochs=50,
        early_stopping_patience_limit=10,
        is_save_model=True,
        scaler=scaler_close,
        file_path="models/best_model_.pt",
    )

    train_losses, val_losses = model_trainer.train_loop()
    # testing
    mse, mae = model_trainer.test_model()

    stats[cols] = dict(mse=mse, mae=mae)

# Save the per-feature metrics once all candidates have been evaluated.
with open('../data/stats_on_features_vanilla_transformer_hourly_encoder_decoder.pkl', 'wb') as file:
    pickle.dump(stats, file)


In [None]:
# Reload the per-feature metrics written by the ranking loop.
with open('../data/stats_on_features_vanilla_transformer_hourly_encoder_decoder.pkl', 'rb') as file:
    loaded_stats = pickle.load(file)

# Order the features from lowest to highest test MSE.
sorted_loaded_stats = OrderedDict(
    sorted(loaded_stats.items(), key=lambda entry: entry[1]['mse'])
)

# Keep the names of (at most) the ten best-scoring features.
top_feattures = list(sorted_loaded_stats)[:10]
print(top_feattures)

with open('../data/top_feattures_encoder_decoder.pkl', 'wb') as file:
    pickle.dump(top_feattures, file)

### Hyperparameter Tuning (Structural Tuning Too) for Top Features

In [None]:
# Window sizes: 36 input steps are used to predict a single output step.
input_seq_len_, output_seq_len_ = 36, 1

df = pd.read_csv("../data/Final_data_hourly.csv")
with open('../data/top_feattures_encoder_decoder.pkl', 'rb') as file:
    top_feattures = pickle.load(file)

# The stored list holds candidate features only; "close" is appended as the last column.
top_feattures.append("close")
df = df[top_feattures]

(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
    output_as_seq=False,
)

def objective(space):
    """Train one sampled transformer configuration and report its validation loss.

    Intended as the ``fn`` callback of ``hyperopt.fmin``. Relies on notebook-level
    globals defined before it is called: ``scaled_df``, ``scaler_close``, ``cuda_``
    and the train/val/test dataloaders.

    Args:
        space: dict sampled from the search space, providing the keys
            ``num_heads``, ``d_model_by_num_heads``, ``num_layers``, ``dff``,
            ``mlp_size``, ``dropout_rate``, ``mlp_dropout_rate`` and ``lr``.

    Returns:
        dict with ``loss`` (lowest validation loss recorded during training) and
        ``status`` (``STATUS_OK``), the result format hyperopt expects.
    """
    # Whole-second timestamp that is embedded in this trial's checkpoint file name.
    start_time = int(time.time())

    device = torch.device(cuda_ if torch.cuda.is_available() else 'cpu')
    num_heads = space['num_heads']
    # d_model is sampled per head and scaled up, so it is always a multiple of num_heads.
    d_model = space['d_model_by_num_heads'] * num_heads

    print(f"""
    device: {device},
    num_heads:  {num_heads},
    d_model: {d_model},
    num_layers: {space['num_layers']},
    dff: {space['dff']},
    mlp_size: {space['mlp_size']},
    dropout_rate: {space['dropout_rate']},
    mlp_dropout_rate: {space['mlp_dropout_rate']}""")

    params = {
        'num_features': int(len(scaled_df.columns)),
        'd_model': d_model,
        'num_layers': space['num_layers'],
        'dff': space['dff'],
        'dropout_rate': space['dropout_rate'],
        'mlp_size': space['mlp_size'],
        'mlp_dropout_rate': space['mlp_dropout_rate'],
        "num_heads": num_heads}
    model = VanillaTimeSeriesTransformer(**params)
    model = model.to(device)

    optimiser = Adam(model.parameters(), lr=space['lr'])
    # Multiply the learning rate by 0.9 after 5 epochs without improvement of the monitored value.
    scheduler = ReduceLROnPlateau(optimiser, 'min', factor=0.9, patience=5)
    criterion = MSELoss()

    model_trainer = Trainer(model=model,
                train_dataloader=train_dataloader,
                val_dataloader=val_dataloader,
                test_dataloader=test_dataloader,
                criterion=criterion,
                optimiser=optimiser,
                scheduler=scheduler,
                device=device,
                num_epochs=50,
                early_stopping_patience_limit=10,
                is_save_model=True,
                scaler=scaler_close,
                file_path = f"models/best_model_{start_time}.pt")
    train_losses, val_losses = model_trainer.train_loop()

    # Score the trial by its best epoch instead of the last one: with early stopping
    # the final entry can come several epochs after the best validation loss, which
    # would under-rate the configuration.
    # Assumes val_losses holds one comparable value per epoch -- TODO confirm against
    # Trainer.train_loop.
    return {'loss': min(val_losses), 'status': STATUS_OK}


# Search space handed to hyperopt. Integer-valued dimensions are sampled with
# hp.quniform and cast to int through scope.int.
space = {
    'num_layers': scope.int(hp.quniform('num_layers', 1, 8, 1)),
    'num_heads': scope.int(hp.quniform('num_heads', 1, 8, 1)),
    'd_model_by_num_heads': scope.int(hp.quniform('d_model_by_num_heads', 32, 64, 2)),
    # hp.quniform yields round(uniform(low, high) / q) * q, so a lower bound below
    # q / 2 can be rounded to 0. Starting the range at q keeps dff at 50 or above.
    'dff': scope.int(hp.quniform('dff', 50, 2048, 50)),
    'mlp_size': scope.int(hp.quniform('mlp_size', 32, 64, 2)),
    'dropout_rate': hp.uniform('dropout_rate', 0.1, 0.3),
    'mlp_dropout_rate': hp.uniform('mlp_dropout_rate', 0.1, 0.3),
    # Learning rate is sampled log-uniformly between 1e-4 and 1e-1.
    'lr': hp.loguniform('lr', np.log(0.0001), np.log(0.1))
}

# Run the optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print(best)

# Persist the best setting found so that the training cells can reload it.
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_encoder_decoder_hourly.pkl', 'wb') as file:
    pickle.dump(best, file)


### Training and Evaluation of Vanilla Transformer (Encoder-Decoder) on Close Price, Top 2, 5, and 10 Features

Close Price Only

In [None]:
# Train and evaluate the tuned model using the "close" column as the only feature.
df = pd.read_csv("../data/Final_data_hourly.csv")
# Select with a list so the result stays a one-column DataFrame (df["close"] would be
# a Series). This matches the close-only tuning cell, and num_features below is read
# from scaled_df.columns.
df = df[["close"]]
input_seq_len_ = 36
output_seq_len_ = 1
scaled_df, scaler_general, scaler_close, train_dataloader, val_dataloader, test_dataloader = preprocess_data(df,
                                                                                                             batch_size = 256,
                                                                                                             input_seq_len=input_seq_len_,
                                                                                                             output_seq_len=output_seq_len_,
                                                                                                             output_as_seq=False)
# NOTE(review): pickle.load can execute code embedded in the file; only load pickles
# produced by this notebook.
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_encoder_decoder_hourly.pkl', 'rb') as file:
    best = pickle.load(file)


# Values loaded from the pickle are cast back to int where the model needs integers.
params = {
    'num_features': int(len(scaled_df.columns)),
    'd_model': int(best['d_model_by_num_heads']) * int(best["num_heads"]),
    'num_layers': int(best["num_layers"]),
    'dff': int(best['dff']),
    'dropout_rate': round(best['dropout_rate'], 3),
    'mlp_size': int(best['mlp_size']),
    'mlp_dropout_rate': round(best['mlp_dropout_rate'], 3),
    "num_heads":int(best["num_heads"])}

model = VanillaTimeSeriesTransformer(**params)
# Use the configured GPU when CUDA is available, otherwise the CPU.
device = torch.device(cuda_ if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimiser = Adam(model.parameters(), lr=best['lr'])
scheduler = ReduceLROnPlateau(optimiser, 'min', factor=0.9, patience=5)
criterion = MSELoss()

model_trainer = Trainer(model=model,
            train_dataloader=train_dataloader,
            val_dataloader=val_dataloader,
            test_dataloader=test_dataloader,
            criterion=criterion,
            optimiser=optimiser,
            scheduler=scheduler,
            device=device,
            num_epochs=50,
            early_stopping_patience_limit=10,
            is_save_model=True,
            scaler=scaler_close,
            file_path = "models/best_model_.pt")

train_losses, val_losses = model_trainer.train_loop()
# testing
mse, mae = model_trainer.test_model()

Top 2 Features

In [None]:
# Train and evaluate the tuned model on the two best-ranked features plus "close".
input_seq_len_, output_seq_len_ = 36, 1

df = pd.read_csv("../data/Final_data_hourly.csv")
with open('../data/top_feattures_encoder_decoder.pkl', 'rb') as file:
    top_feattures = pickle.load(file)

# First two entries of the stored ranking, with "close" as the last column.
top_feattures = top_feattures[:2] + ["close"]
df = df[top_feattures]

(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
    output_as_seq=False,
)

with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_encoder_decoder_hourly.pkl', 'rb') as file:
    best = pickle.load(file)

# Values loaded from the pickle are cast back to int where the model needs integers.
params = dict(
    num_features=int(len(scaled_df.columns)),
    d_model=int(best['d_model_by_num_heads']) * int(best["num_heads"]),
    num_layers=int(best["num_layers"]),
    dff=int(best['dff']),
    dropout_rate=round(best['dropout_rate'], 3),
    mlp_size=int(best['mlp_size']),
    mlp_dropout_rate=round(best['mlp_dropout_rate'], 3),
    num_heads=int(best["num_heads"]),
)

device = torch.device(cuda_ if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer(**params).to(device)

optimiser = Adam(model.parameters(), lr=best['lr'])
scheduler = ReduceLROnPlateau(optimiser, 'min', factor=0.9, patience=5)
criterion = MSELoss()

model_trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=50,
    early_stopping_patience_limit=10,
    is_save_model=True,
    scaler=scaler_close,
    file_path="models/best_model_.pt",
)

train_losses, val_losses = model_trainer.train_loop()
# testing
mse, mae = model_trainer.test_model()

Top 5 Features

In [None]:
# Train and evaluate the tuned model on the five best-ranked features plus "close".
input_seq_len_, output_seq_len_ = 36, 1

df = pd.read_csv("../data/Final_data_hourly.csv")
with open('../data/top_feattures_encoder_decoder.pkl', 'rb') as file:
    top_feattures = pickle.load(file)

# First five entries of the stored ranking, with "close" as the last column.
top_feattures = top_feattures[:5] + ["close"]
df = df[top_feattures]

(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
    output_as_seq=False,
)

with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_encoder_decoder_hourly.pkl', 'rb') as file:
    best = pickle.load(file)

# Values loaded from the pickle are cast back to int where the model needs integers.
params = dict(
    num_features=int(len(scaled_df.columns)),
    d_model=int(best['d_model_by_num_heads']) * int(best["num_heads"]),
    num_layers=int(best["num_layers"]),
    dff=int(best['dff']),
    dropout_rate=round(best['dropout_rate'], 3),
    mlp_size=int(best['mlp_size']),
    mlp_dropout_rate=round(best['mlp_dropout_rate'], 3),
    num_heads=int(best["num_heads"]),
)

device = torch.device(cuda_ if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer(**params).to(device)

optimiser = Adam(model.parameters(), lr=best['lr'])
scheduler = ReduceLROnPlateau(optimiser, 'min', factor=0.9, patience=5)
criterion = MSELoss()

model_trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=50,
    early_stopping_patience_limit=10,
    is_save_model=True,
    scaler=scaler_close,
    file_path="models/best_model_.pt",
)

train_losses, val_losses = model_trainer.train_loop()
# testing
mse, mae = model_trainer.test_model()

Top 10 Features

In [None]:
# Train and evaluate the tuned model on every stored top feature plus "close".
input_seq_len_, output_seq_len_ = 36, 1

df = pd.read_csv("../data/Final_data_hourly.csv")
with open('../data/top_feattures_encoder_decoder.pkl', 'rb') as file:
    top_feattures = pickle.load(file)

# No slicing here: the whole stored list is used, with "close" as the last column.
top_feattures = [*top_feattures, "close"]
df = df[top_feattures]

(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
    output_as_seq=False,
)

with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_encoder_decoder_hourly.pkl', 'rb') as file:
    best = pickle.load(file)

# Values loaded from the pickle are cast back to int where the model needs integers.
params = dict(
    num_features=int(len(scaled_df.columns)),
    d_model=int(best['d_model_by_num_heads']) * int(best["num_heads"]),
    num_layers=int(best["num_layers"]),
    dff=int(best['dff']),
    dropout_rate=round(best['dropout_rate'], 3),
    mlp_size=int(best['mlp_size']),
    mlp_dropout_rate=round(best['mlp_dropout_rate'], 3),
    num_heads=int(best["num_heads"]),
)

device = torch.device(cuda_ if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer(**params).to(device)

optimiser = Adam(model.parameters(), lr=best['lr'])
scheduler = ReduceLROnPlateau(optimiser, 'min', factor=0.9, patience=5)
criterion = MSELoss()

model_trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=50,
    early_stopping_patience_limit=10,
    is_save_model=True,
    scaler=scaler_close,
    file_path="models/best_model_.pt",
)

train_losses, val_losses = model_trainer.train_loop()
# testing
mse, mae = model_trainer.test_model()

# Vanilla Transformer (24th-Step-Ahead Prediction | Encoder-Decoder)