In [1]:
import torch
import pandas as pd
import numpy as np
from transformer import VanillaTimeSeriesTransformer_EncoderOnly
from utils import Trainer, preprocess_data, seed_everything
from torch.optim import Adam
from torch.nn import MSELoss, L1Loss
from torch.optim.lr_scheduler import ReduceLROnPlateau
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll import scope
import pickle
from collections import OrderedDict
import torch.nn as nn

# Seed before any data loading or model construction so runs are repeatable.
# NOTE(review): seed_everything comes from utils; presumably it seeds the
# python/numpy/torch RNGs with a default seed — confirm there.
seed_everything()

# Vanilla Transformer (Next Step Prediction | Encoder only)

### Hyperparameter Tuning (Structural Tuning Too) for 2 Features

In [2]:
# Load the full hourly dataset; the bare `df` below renders it as the cell output.
df = pd.read_csv("../data/Final_data_hourly.csv")
df

Unnamed: 0,close,high,low,open,forex_sentiment_score,forex_total,stock_sentiment_score,stock_total,btc_sentiment_score,btc_Total,...,NG=F,SI=F,ZW=F,DFF,CPIAUCSL,SMA_24_hourly,RSI_24_hourly,MACD_hourly,SMA_168_hourly,RSI_168_hourly
0,4.58,4.58,4.58,4.58,0.00,0.0,0.000,0.0,0.000,0.0,...,2.7151,28.512000,648.925,0.04,227.842,4.580000,100.000000,0.105703,5.570774,71.315357
1,4.58,4.58,4.58,4.58,0.00,0.0,0.000,0.0,0.000,0.0,...,2.7151,28.512000,648.925,0.04,227.842,4.580000,100.000000,0.105703,5.570774,71.315357
2,4.58,4.58,4.58,4.58,0.00,0.0,0.000,0.0,0.000,0.0,...,2.7151,28.512000,648.925,0.04,227.842,4.580000,100.000000,0.105703,5.570774,71.315357
3,4.58,4.58,4.58,4.58,0.00,0.0,0.000,0.0,0.000,0.0,...,2.7151,28.512000,648.925,0.04,227.842,4.580000,100.000000,0.105703,5.570774,71.315357
4,4.58,4.58,4.58,4.58,0.00,0.0,0.000,0.0,0.000,0.0,...,2.7151,28.512000,648.925,0.04,227.842,4.580000,100.000000,0.105703,5.570774,71.315357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99788,27025.00,27097.00,27016.00,27082.00,-0.27,139.0,0.278,2365.0,0.407,118.0,...,2.3180,25.009001,603.750,4.83,303.294,26954.071429,57.095624,40.279134,27033.416667,48.544716
99789,27045.00,27060.00,26983.00,27023.00,-0.27,139.0,0.278,2365.0,0.407,118.0,...,2.3180,25.009001,603.750,4.83,303.294,26967.071429,58.517086,42.195293,27034.452381,48.636546
99790,27077.00,27082.00,27022.00,27041.00,-0.27,139.0,0.278,2365.0,0.407,118.0,...,2.3180,25.009001,603.750,4.83,303.294,26978.428571,60.757339,45.768407,27035.464286,48.783669
99791,27115.00,27139.00,27061.00,27073.00,-0.27,139.0,0.278,2365.0,0.407,118.0,...,2.3180,25.009001,603.750,4.83,303.294,26994.500000,63.292474,51.077616,27037.404762,48.958322


In [3]:
# First tuning round: the target plus a single companion feature.
df = df[["close", "open"]]

# Window sizes handed to preprocess_data (36 input steps -> 1 output step).
input_seq_len_, output_seq_len_ = 36, 1

(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
)
def objective(params):
    """Hyperopt objective: train one encoder-only transformer and report its loss.

    Args:
        params: One sample from the search space. Keys read here are
            'num_heads', 'd_model_by_num_heads', 'num_layers', 'dff',
            'mlp_size', 'dropout_rate', 'mlp_dropout_rate' and 'lr'.

    Returns:
        dict with 'loss' (the last element of the validation losses returned
        by ``Trainer.train_loop``) and 'status' (``STATUS_OK``).

    Uses the notebook globals produced by ``preprocess_data``: ``scaled_df``,
    the train/val/test dataloaders and ``scaler_close``.
    """
    # Log the sampled configuration so each trial can be identified in the output.
    print(f"""
    device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')},
    num_heads:  {params['num_heads']},
    d_model: {params['d_model_by_num_heads'] * params['num_heads']},
    num_layers: {params['num_layers']},
    dff: {params['dff']},
    mlp_size: {params['mlp_size']},
    dropout_rate: {params['dropout_rate']},
    mlp_dropout_rate: {params['mlp_dropout_rate']}""")

    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # d_model is built as (per-head width * head count), so it is always
    # divisible by the number of heads.
    head_count = params['num_heads']
    network = VanillaTimeSeriesTransformer_EncoderOnly(
        num_features=int(len(scaled_df.columns)),
        num_layers=params['num_layers'],
        d_model=params['d_model_by_num_heads'] * head_count,
        num_heads=head_count,
        dff=params['dff'],
        input_seq_len=36,
        output_seq_len=1,
        mlp_size=params['mlp_size'],
        dropout_rate=params['dropout_rate'],
        mlp_dropout_rate=params['mlp_dropout_rate'],
    ).to(run_device)

    adam = Adam(network.parameters(), lr=params['lr'])
    trainer_settings = dict(
        model=network,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        test_dataloader=test_dataloader,
        criterion=MSELoss(),
        optimiser=adam,
        scheduler=ReduceLROnPlateau(adam, 'min', factor=0.9, patience=5),
        device=run_device,
        num_epochs=50,
        early_stopping_patience_limit=10,
        is_save_model=True,
        scaler=scaler_close,
    )
    _, val_history = Trainer(**trainer_settings).train_loop()

    # NOTE(review): this reports the final validation loss, not the best one;
    # if early stopping ends the run this may not be the minimum — confirm
    # against Trainer whether min(val_history) is the intended score.
    return {'loss': val_history[-1], 'status': STATUS_OK}

def _quantised_int(label, low, high, step):
    """Integer-valued search dimension: a quniform draw cast with scope.int."""
    return scope.int(hp.quniform(label, low, high, step))


# Search space: structural sizes are integers, dropout rates are uniform and
# the learning rate is sampled log-uniformly between 1e-4 and 1e-1.
space = dict(
    num_layers=_quantised_int('num_layers', 1, 8, 1),
    num_heads=_quantised_int('num_heads', 1, 8, 1),
    d_model_by_num_heads=_quantised_int('d_model_by_num_heads', 32, 64, 2),
    dff=_quantised_int('dff', 2, 2048, 1),
    mlp_size=_quantised_int('mlp_size', 32, 64, 1),
    dropout_rate=hp.uniform('dropout_rate', 0.1, 0.3),
    mlp_dropout_rate=hp.uniform('mlp_dropout_rate', 0.1, 0.3),
    lr=hp.loguniform('lr', np.log(1e-4), np.log(1e-1)),
)

# Run 100 TPE trials; `trials` keeps the per-trial history.
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials)

print(best)

# Persist the best point so later cells can rebuild the model from it.
with open('../data/stats_on_hyperparam_for_two_cols_vanilla_transformer_hourly_encoder_only.pkl', 'wb') as fh:
    pickle.dump(best, fh)


torch.Size([89781, 36, 2])
torch.Size([89781, 1, 1])
                                                       
    device: cuda,
    num_heads:  8,
    d_model: 336,
    num_layers: 4,
    dff: 1030,
    mlp_size: 47,
    dropout_rate: 0.27140072343193467,
    mlp_dropout_rate: 0.1761659400132718
Current MAE 0.2634594228474159                         
Best MAE inf                                           
Model saved. MAE: 0.2634594228474159                   
Current MAE 0.2434671028384994                         
Best MAE 0.2634594228474159                            
Model saved. MAE: 0.2434671028384994                   
Current MAE 0.25996684491538774                        
Best MAE 0.2434671028384994                            
  0%|          | 0/100 [03:09<?, ?trial/s, best loss=?]

### Finding Top Features

In [None]:
# Re-seed, then reload the raw data and the best point found by the
# two-feature hyperparameter search.
seed_everything()
df = pd.read_csv("../data/Final_data_hourly.csv")
with open(
    '../data/stats_on_hyperparam_for_two_cols_vanilla_transformer_hourly_encoder_only.pkl', 'rb'
) as fh:
    best = pickle.load(fh)

In [None]:
# Rank every non-target column by how well a model trained on
# ("close", column) performs on the test set.
input_seq_len_, output_seq_len_ = 36, 1

# The tuned hyperparameters and the device are the same for every candidate
# feature, so decode them once before the loop.
num_layers = int(best["num_layers"])
num_heads = int(best["num_heads"])
d_model = int(best['d_model_by_num_heads']) * num_heads
dff = int(best['dff'])
mlp_size = int(best['mlp_size'])
dropout_rate = round(best['dropout_rate'], 3)
mlp_dropout_rate = round(best['mlp_dropout_rate'], 3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

stats = {}
for feature_name in df.columns:
    if feature_name == "close":
        continue  # the target is paired with each other column, never with itself
    print(feature_name)

    # Two-column frame: target + candidate feature.
    temp_df = df[["close", feature_name]]
    (
        scaled_df,
        scaler_general,
        scaler_close,
        train_dataloader,
        val_dataloader,
        test_dataloader,
    ) = preprocess_data(
        temp_df,
        batch_size=256,
        input_seq_len=input_seq_len_,
        output_seq_len=output_seq_len_,
    )
    num_features = len(scaled_df.columns)

    # Fresh model, optimiser and scheduler for every candidate.
    model = VanillaTimeSeriesTransformer_EncoderOnly(
        num_features, num_layers, d_model, num_heads, dff,
        input_seq_len_, output_seq_len_,
        mlp_size, dropout_rate, mlp_dropout_rate,
    ).to(device)
    criterion = MSELoss()
    optimiser = Adam(model.parameters(), lr=round(best['lr'], 6))
    scheduler = ReduceLROnPlateau(optimiser, mode='min', factor=0.9, patience=5)

    model_trainer = Trainer(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        test_dataloader=test_dataloader,
        criterion=criterion,
        optimiser=optimiser,
        scheduler=scheduler,
        device=device,
        num_epochs=50,
        early_stopping_patience_limit=10,
        is_save_model=True,
        scaler=scaler_close,
    )
    train_losses, val_losses = model_trainer.train_loop()
    mse, mae = model_trainer.test_model()

    stats[feature_name] = dict(mse=mse, mae=mae)

# Written only after all columns finish.
with open('../data/stats_on_features_vanilla_transformer_hourly.pkl', 'wb') as fh:
    pickle.dump(stats, fh)

In [None]:
# Load the per-feature test metrics and order them by ascending MSE.
with open('../data/stats_on_features_vanilla_transformer_hourly.pkl', 'rb') as fh:
    loaded_stats = pickle.load(fh)
sorted_loaded_stats = OrderedDict(
    sorted(loaded_stats.items(), key=lambda entry: entry[1]['mse'])
)

# Keep the ten best-scoring feature names (fewer if fewer were evaluated).
top_feattures = list(sorted_loaded_stats)[:10]
count = len(top_feattures)
print(top_feattures)

with open('../data/top_feattures.pkl', 'wb') as fh:
    pickle.dump(top_feattures, fh)


### Hyperparameter Tuning (Structural Tuning Too) for Top Features

In [None]:
# Restrict the dataset to the ranked features with the target column appended last.
with open('../data/top_feattures.pkl', 'rb') as fh:
    top_feattures = [*pickle.load(fh), "close"]
df = pd.read_csv("../data/Final_data_hourly.csv")[top_feattures]

In [None]:
# Window sizes handed to preprocess_data (36 input steps -> 1 output step).
input_seq_len_, output_seq_len_ = 36, 1

(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
)
def objective(params):
    """Hyperopt objective for the top-feature model.

    Builds an encoder-only transformer from one sampled configuration, trains
    it with the notebook-level dataloaders and reports its validation loss.

    Args:
        params: One sample from the search space. Keys read here are
            'num_heads', 'd_model_by_num_heads', 'num_layers', 'dff',
            'mlp_size', 'dropout_rate', 'mlp_dropout_rate' and 'lr'.

    Returns:
        dict with 'loss' (the last element of the validation losses returned
        by ``Trainer.train_loop``) and 'status' (``STATUS_OK``).

    Uses the notebook globals produced by ``preprocess_data``: ``scaled_df``,
    the train/val/test dataloaders and ``scaler_close``.
    """
    # Log the sampled configuration so each trial can be identified in the output.
    print(f"""
    device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')},
    num_heads:  {params['num_heads']},
    d_model: {params['d_model_by_num_heads'] * params['num_heads']},
    num_layers: {params['num_layers']},
    dff: {params['dff']},
    mlp_size: {params['mlp_size']},
    dropout_rate: {params['dropout_rate']},
    mlp_dropout_rate: {params['mlp_dropout_rate']}""")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # d_model is built as (per-head width * head count), so it is always
    # divisible by the number of heads.
    num_heads = params['num_heads']
    d_model = params['d_model_by_num_heads'] * num_heads
    model = VanillaTimeSeriesTransformer_EncoderOnly(
        num_features=int(len(scaled_df.columns)),
        num_layers=params['num_layers'],
        d_model=d_model,
        num_heads=num_heads,
        dff=params['dff'],
        input_seq_len=36,
        output_seq_len=1,
        mlp_size=params['mlp_size'],
        dropout_rate=params['dropout_rate'],
        mlp_dropout_rate=params['mlp_dropout_rate']
    )
    model = model.to(device)

    optimiser = Adam(model.parameters(), lr=params['lr'])
    # Fix: the scheduler must wrap `optimiser`; the previous `optimizer`
    # spelling was an undefined name and raised NameError on every trial.
    scheduler = ReduceLROnPlateau(optimiser, 'min', factor=0.9, patience=5)
    criterion = MSELoss()

    model_trainer = Trainer(model=model,
                train_dataloader=train_dataloader,
                val_dataloader=val_dataloader,
                test_dataloader=test_dataloader,
                criterion=criterion,
                optimiser=optimiser,
                scheduler=scheduler,
                device=device,
                num_epochs=50,
                early_stopping_patience_limit=10,
                is_save_model=True,
                scaler=scaler_close)

    train_losses, val_losses = model_trainer.train_loop()

    # Return the last validation loss from the training loop
    return {'loss': val_losses[-1], 'status': STATUS_OK}

# Integer-valued dimensions as (low, high, step); each becomes a quniform
# draw cast to int.
int_dims = {
    'num_layers': (1, 8, 1),
    'num_heads': (1, 8, 1),
    'd_model_by_num_heads': (32, 64, 2),
    'dff': (2, 2048, 1),
    'mlp_size': (32, 64, 1),
}
space = {
    label: scope.int(hp.quniform(label, *bounds))
    for label, bounds in int_dims.items()
}
# Continuous dimensions: dropout rates are uniform, learning rate is log-uniform.
space['dropout_rate'] = hp.uniform('dropout_rate', 0.1, 0.3)
space['mlp_dropout_rate'] = hp.uniform('mlp_dropout_rate', 0.1, 0.3)
space['lr'] = hp.loguniform('lr', np.log(1e-4), np.log(1e-1))

# Run 100 TPE trials; `trials` keeps the per-trial history.
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials)

print(best)

# Persist the best point so the training cells can rebuild the model from it.
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_hourly.pkl', 'wb') as fh:
    pickle.dump(best, fh)


### Training and Evaluation of Vanilla Transformer (Encoder only) on Close Price, Top 2, 5, 10

Close Price Only

In [None]:
# ---- data: target column only ----
df = pd.read_csv("../data/Final_data_hourly.csv")
with open('../data/top_feattures.pkl', 'rb') as fh:
    top_feattures = pickle.load(fh)
# NOTE(review): the ranked list is loaded and extended here but not used by
# this run, which keeps only the "close" column.
top_feattures.append("close")
df = df[["close"]]

input_seq_len_, output_seq_len_ = 36, 1
(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
)

# ---- tuned hyperparameters (from the top-feature search) ----
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_hourly.pkl', 'rb') as fh:
    best = pickle.load(fh)

num_features = len(scaled_df.columns)
num_layers, num_heads, dff, mlp_size = (
    int(best[key]) for key in ("num_layers", "num_heads", "dff", "mlp_size")
)
d_model = int(best['d_model_by_num_heads']) * num_heads
dropout_rate, mlp_dropout_rate = (
    round(best[key], 3) for key in ('dropout_rate', 'mlp_dropout_rate')
)

# ---- model on GPU when available ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer_EncoderOnly(
    num_features, num_layers, d_model, num_heads, dff,
    input_seq_len_, output_seq_len_,
    mlp_size, dropout_rate, mlp_dropout_rate,
).to(device)

criterion = MSELoss()
optimiser = Adam(model.parameters(), lr=round(best['lr'], 6))
scheduler = ReduceLROnPlateau(optimiser, mode='min', factor=0.9, patience=5)

# ---- train, then evaluate on the test split ----
trainer_settings = dict(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=20,
    early_stopping_patience_limit=10,
    is_save_model=True,
    scaler=scaler_close,
)
model_trainer = Trainer(**trainer_settings)
train_losses, val_losses = model_trainer.train_loop(is_plot=True, is_plot_and_plot_test=True)
mse, mae = model_trainer.test_model()


Top 2 features

In [None]:
# ---- data: two best-ranked features plus the target ----
with open('../data/top_feattures.pkl', 'rb') as fh:
    top_feattures = pickle.load(fh)[:2] + ["close"]
df = pd.read_csv("../data/Final_data_hourly.csv")[top_feattures]

input_seq_len_, output_seq_len_ = 36, 1
(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
)

# ---- tuned hyperparameters (from the top-feature search) ----
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_hourly.pkl', 'rb') as fh:
    best = pickle.load(fh)

num_features = len(scaled_df.columns)
num_layers, num_heads, dff, mlp_size = (
    int(best[key]) for key in ("num_layers", "num_heads", "dff", "mlp_size")
)
d_model = int(best['d_model_by_num_heads']) * num_heads
dropout_rate, mlp_dropout_rate = (
    round(best[key], 3) for key in ('dropout_rate', 'mlp_dropout_rate')
)

# ---- model on GPU when available ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer_EncoderOnly(
    num_features, num_layers, d_model, num_heads, dff,
    input_seq_len_, output_seq_len_,
    mlp_size, dropout_rate, mlp_dropout_rate,
).to(device)

criterion = MSELoss()
optimiser = Adam(model.parameters(), lr=round(best['lr'], 6))
scheduler = ReduceLROnPlateau(optimiser, mode='min', factor=0.9, patience=5)

# ---- train, then evaluate on the test split ----
trainer_settings = dict(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=20,
    early_stopping_patience_limit=10,
    is_save_model=True,
    scaler=scaler_close,
)
model_trainer = Trainer(**trainer_settings)
train_losses, val_losses = model_trainer.train_loop(is_plot=True, is_plot_and_plot_test=True)
mse, mae = model_trainer.test_model()


Top 5 Features

In [None]:
# ---- data: five best-ranked features plus the target ----
with open('../data/top_feattures.pkl', 'rb') as fh:
    top_feattures = pickle.load(fh)[:5] + ["close"]
df = pd.read_csv("../data/Final_data_hourly.csv")[top_feattures]

input_seq_len_, output_seq_len_ = 36, 1
(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
)

# ---- tuned hyperparameters (from the top-feature search) ----
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_hourly.pkl', 'rb') as fh:
    best = pickle.load(fh)

num_features = len(scaled_df.columns)
num_layers, num_heads, dff, mlp_size = (
    int(best[key]) for key in ("num_layers", "num_heads", "dff", "mlp_size")
)
d_model = int(best['d_model_by_num_heads']) * num_heads
dropout_rate, mlp_dropout_rate = (
    round(best[key], 3) for key in ('dropout_rate', 'mlp_dropout_rate')
)

# ---- model on GPU when available ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer_EncoderOnly(
    num_features, num_layers, d_model, num_heads, dff,
    input_seq_len_, output_seq_len_,
    mlp_size, dropout_rate, mlp_dropout_rate,
).to(device)

criterion = MSELoss()
optimiser = Adam(model.parameters(), lr=round(best['lr'], 6))
scheduler = ReduceLROnPlateau(optimiser, mode='min', factor=0.9, patience=5)

# ---- train, then evaluate on the test split ----
trainer_settings = dict(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=20,
    early_stopping_patience_limit=10,
    is_save_model=True,
    scaler=scaler_close,
)
model_trainer = Trainer(**trainer_settings)
train_losses, val_losses = model_trainer.train_loop(is_plot=True, is_plot_and_plot_test=True)
mse, mae = model_trainer.test_model()


Top 10 Features

In [None]:
# ---- data: every saved top feature plus the target ----
with open('../data/top_feattures.pkl', 'rb') as fh:
    top_feattures = [*pickle.load(fh), "close"]
df = pd.read_csv("../data/Final_data_hourly.csv")[top_feattures]

input_seq_len_, output_seq_len_ = 36, 1
(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
)

# ---- tuned hyperparameters (from the top-feature search) ----
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_hourly.pkl', 'rb') as fh:
    best = pickle.load(fh)

num_features = len(scaled_df.columns)
num_layers, num_heads, dff, mlp_size = (
    int(best[key]) for key in ("num_layers", "num_heads", "dff", "mlp_size")
)
d_model = int(best['d_model_by_num_heads']) * num_heads
dropout_rate, mlp_dropout_rate = (
    round(best[key], 3) for key in ('dropout_rate', 'mlp_dropout_rate')
)

# ---- model on GPU when available ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer_EncoderOnly(
    num_features, num_layers, d_model, num_heads, dff,
    input_seq_len_, output_seq_len_,
    mlp_size, dropout_rate, mlp_dropout_rate,
).to(device)

criterion = MSELoss()
optimiser = Adam(model.parameters(), lr=round(best['lr'], 6))
scheduler = ReduceLROnPlateau(optimiser, mode='min', factor=0.9, patience=5)

# ---- train, then evaluate on the test split ----
trainer_settings = dict(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=20,
    early_stopping_patience_limit=10,
    is_save_model=True,
    scaler=scaler_close,
)
model_trainer = Trainer(**trainer_settings)
train_losses, val_losses = model_trainer.train_loop(is_plot=True, is_plot_and_plot_test=True)
mse, mae = model_trainer.test_model()


# Vanilla Transformer (Next 24th Step Prediction | Encoder only)

In [None]:
# ---- data: target column only, 168-step input window, 24-step output ----
df = pd.read_csv("../data/Final_data_hourly.csv")
with open('../data/top_feattures.pkl', 'rb') as fh:
    # NOTE(review): loaded but not used below; this run keeps only "close".
    top_feattures = pickle.load(fh)
df = df[["close"]]

input_seq_len_, output_seq_len_ = 168, 24
(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
    output_as_seq=False,
)

# ---- tuned hyperparameters (from the top-feature search) ----
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_hourly.pkl', 'rb') as fh:
    best = pickle.load(fh)

num_features = len(scaled_df.columns)
num_layers, num_heads, dff, mlp_size = (
    int(best[key]) for key in ("num_layers", "num_heads", "dff", "mlp_size")
)
d_model = int(best['d_model_by_num_heads']) * num_heads
dropout_rate, mlp_dropout_rate = (
    round(best[key], 3) for key in ('dropout_rate', 'mlp_dropout_rate')
)

# ---- model on GPU when available ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer_EncoderOnly(
    num_features, num_layers, d_model, num_heads, dff,
    input_seq_len_, output_seq_len_,
    mlp_size, dropout_rate, mlp_dropout_rate,
).to(device)

criterion = MSELoss()
optimiser = Adam(model.parameters(), lr=round(best['lr'], 6))
scheduler = ReduceLROnPlateau(optimiser, mode='min', factor=0.9, patience=5)

# ---- train, then evaluate on the test split ----
trainer_settings = dict(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=20,
    early_stopping_patience_limit=10,
    is_save_model=True,
    scaler=scaler_close,
)
model_trainer = Trainer(**trainer_settings)
train_losses, val_losses = model_trainer.train_loop(is_plot=True, is_plot_and_plot_test=True)
mse, mae = model_trainer.test_model()



In [None]:
# TODO: delete this cell afterwards (scratch run).
# ---- data: target column only, 168-step input window, single-step output ----
df = pd.read_csv("../data/Final_data_hourly.csv")
with open('../data/top_feattures.pkl', 'rb') as fh:
    # NOTE(review): loaded but not used below; this run keeps only "close".
    top_feattures = pickle.load(fh)
df = df[["close"]]

input_seq_len_, output_seq_len_ = 168, 1
(
    scaled_df,
    scaler_general,
    scaler_close,
    train_dataloader,
    val_dataloader,
    test_dataloader,
) = preprocess_data(
    df,
    batch_size=256,
    input_seq_len=input_seq_len_,
    output_seq_len=output_seq_len_,
    output_as_seq=False,
)

# ---- tuned hyperparameters (from the top-feature search) ----
with open('../data/best_on_hyperparam_for_top_Feature_Combo_vanilla_transformer_hourly.pkl', 'rb') as fh:
    best = pickle.load(fh)

num_features = len(scaled_df.columns)
num_layers, num_heads, dff, mlp_size = (
    int(best[key]) for key in ("num_layers", "num_heads", "dff", "mlp_size")
)
d_model = int(best['d_model_by_num_heads']) * num_heads
dropout_rate, mlp_dropout_rate = (
    round(best[key], 3) for key in ('dropout_rate', 'mlp_dropout_rate')
)

# ---- model on GPU when available ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VanillaTimeSeriesTransformer_EncoderOnly(
    num_features, num_layers, d_model, num_heads, dff,
    input_seq_len_, output_seq_len_,
    mlp_size, dropout_rate, mlp_dropout_rate,
).to(device)

criterion = MSELoss()
optimiser = Adam(model.parameters(), lr=round(best['lr'], 6))
scheduler = ReduceLROnPlateau(optimiser, mode='min', factor=0.9, patience=5)

# ---- train (up to 50 epochs, patience 5), then evaluate on the test split ----
trainer_settings = dict(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    criterion=criterion,
    optimiser=optimiser,
    scheduler=scheduler,
    device=device,
    num_epochs=50,
    early_stopping_patience_limit=5,
    is_save_model=True,
    scaler=scaler_close,
)
model_trainer = Trainer(**trainer_settings)
train_losses, val_losses = model_trainer.train_loop(is_plot=True, is_plot_and_plot_test=True)
mse, mae = model_trainer.test_model()

