### **DeepLOB-Attention**

In [None]:
# load packages

from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from tensorflow import keras
from torch.utils import data
from tqdm import tqdm

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

### **Data**
The dataset in the `Dataset` folder is the FI-2010 dataset, zipped and normalized.

As in the original paper, I used the first 7 days for training and validation (80% / 20% split) and the remaining 3 days for testing.

In [None]:
# please change the data_path to your local path and unzip the file
data_path = '/published/BenchmarkDatasets/BenchmarkDatasets/NoAuction/1.NoAuction_Zscore'

# Training file: its columns are split 80% / 20% into training and validation.
dec_data = np.loadtxt(f'{data_path}/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_7.txt')
split_col = int(dec_data.shape[1] * 0.8)
dec_train = dec_data[:, :split_col]
dec_val = dec_data[:, split_col:]

# Test files 7, 8 and 9 are loaded separately and joined along the column axis.
dec_test1, dec_test2, dec_test3 = (
    np.loadtxt(f'{data_path}/NoAuction_Zscore_Testing/Test_Dst_NoAuction_ZScore_CF_{fold}.txt')
    for fold in (7, 8, 9)
)
dec_test = np.hstack((dec_test1, dec_test2, dec_test3))

# The last `horizon` rows of each file hold the labels (one row per horizon).
horizon = 5

# Dataset.__getitem__ pairs the window x[i:i+dim] with y[i]; with dim = 50
# (hyperparameter cell) dropping the first 49 labels aligns y[i] with the last
# row of window i. Keep this value equal to dim - 1 if dim is changed.
first_labelled_row = 49

# After transposing, rows are samples and columns are horizons. The "- 1" shifts
# the label values down by one (presumably from 1..3 to 0..2, which is what the
# 3-class one-hot encoding in get_label needs -- verify against the dataset).
y_train = dec_train[-horizon:, :].T[first_labelled_row:] - 1
y_val = dec_val[-horizon:, :].T[first_labelled_row:] - 1
y_test = dec_test[-horizon:, :].T[first_labelled_row:] - 1

# The first 40 rows are the model inputs; transpose so rows are samples.
dec_train = dec_train[:40, :].T
dec_val = dec_val[:40, :].T
dec_test = dec_test[:40, :].T

In [None]:
def get_label(data, num_classes=3):
    """One-hot encode the class labels of every prediction horizon.

    Args:
        data: 2-D array-like of shape (n_samples, n_horizons) holding class
            indices in ``[0, num_classes)``. Float values are truncated to
            integers before encoding.
        num_classes: number of classes to encode; defaults to 3.

    Returns:
        ``np.ndarray`` of shape (n_samples, n_horizons, num_classes) and dtype
        float32, with a single 1 along the last axis for each label.

    Raises:
        ValueError: if ``data`` is not 2-D or a label lies outside
            ``[0, num_classes)``.
    """
    class_ids = np.asarray(data).astype(np.int64)
    if class_ids.ndim != 2:
        raise ValueError(
            f"expected labels of shape (n_samples, n_horizons), got {class_ids.shape}"
        )
    # Reject out-of-range labels explicitly: a negative index would otherwise
    # silently select a class counted from the end.
    if class_ids.size and (class_ids.min() < 0 or class_ids.max() >= num_classes):
        raise ValueError(
            f"labels must lie in [0, {num_classes}), got range "
            f"[{class_ids.min()}, {class_ids.max()}]"
        )

    # Row k of the identity matrix is the one-hot vector of class k, so indexing
    # it with the whole label array encodes every horizon in one step.
    return np.eye(num_classes, dtype=np.float32)[class_ids]


class Dataset(data.Dataset):
    """Sliding-window dataset over a 2-D feature array.

    Sample ``i`` consists of the window ``x[i:i+dim]``, the one-hot labels
    ``y[i]`` and the decoder seed ``decoder_input[i]``.
    """

    def __init__(self, x, y, decoder_input, num_classes, dim):
        """Store the tensors and compute the number of samples.

        Args:
            x: 2-D NumPy array of features, one row per time step.
            y: labels handed to ``get_label`` for one-hot encoding.
            decoder_input: indexable object providing one decoder seed per sample.
            num_classes: stored on the instance; not otherwise used here.
            dim: number of consecutive rows of ``x`` in each window.
        """
        self.num_classes = num_classes
        self.dim = dim
        self.decoder_input = decoder_input

        one_hot = get_label(y)
        # NOTE: T is read from module scope, so it must be defined before any
        # Dataset is constructed.
        self.length = x.shape[0] - T - self.dim + 1
        print(one_hot.shape)

        # Insert a singleton axis so every row of x becomes (1, n_features).
        self.x = torch.from_numpy(x).unsqueeze(1)
        self.y = torch.from_numpy(one_hot).float()

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, i):
        """Return ``(window, labels, decoder_seed)`` for sample ``i``.

        The window is sliced as (dim, 1, n_features) and permuted to
        (1, dim, n_features).
        """
        window = self.x[i:i + self.dim].permute(1, 0, 2)
        return window, self.y[i], self.decoder_input[i]

In [None]:
#Hyperparameters

batch_size = 32
epochs = 50
T = 10
num_classes = 3
dim = 50


def _initial_decoder_input(n_rows):
    """Build the decoder seed for a split with ``n_rows`` time steps.

    Returns a tensor of shape (n_rows - dim + 1, 1, 3) that is all zeros except
    for a 1 in the first position of the last axis.
    """
    seed = torch.zeros(n_rows - dim + 1, 1, 3)
    seed[:, 0, 0] = 1
    return seed


decoder_input_train = _initial_decoder_input(dec_train.shape[0])
decoder_input_val = _initial_decoder_input(dec_val.shape[0])
decoder_input_test = _initial_decoder_input(dec_test.shape[0])

# Each constructor prints the shape of its one-hot labels; the order
# (validation, test, train) is kept so the printed output is unchanged.
dataset_val = Dataset(dec_val, y_val, decoder_input_val, num_classes, dim)
dataset_test = Dataset(dec_test, y_test, decoder_input_test, num_classes, dim)
dataset_train = Dataset(dec_train, y_train, decoder_input_train, num_classes, dim)

# Only the training loader shuffles; validation and test keep their order.
train_loader = data.DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(dataset=dataset_val, batch_size=batch_size, shuffle=False)
test_loader = data.DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False)

(50901, 5, 3)
(139538, 5, 3)
(203751, 5, 3)


### **Model Architecture**
The Architecture is described in the paper

In [None]:
class DeepLOB_Attention(nn.Module):
  """Convolution + inception encoder with an attention-based LSTM decoder.

  The encoder reduces the feature axis of the input with three convolution
  blocks, applies three parallel inception branches and feeds the result to an
  LSTM. The decoder LSTM is then unrolled once per prediction horizon, attending
  over the encoder outputs at every step.

  Input ``x`` has shape (batch, 1, seq_len, 40): the feature axis must be 40
  wide so that the width-reducing convolutions (40 -> 20 -> 10 -> 1) collapse it
  to a single column. ``decoder_inputs`` has shape (batch, 1, 3). The output has
  shape (batch, 3, 5): softmax class probabilities for each of the 5 horizons.
  """

  HIDDEN_SIZE = 64   # hidden size of both LSTMs
  NUM_CLASSES = 3    # size of the output distribution
  NUM_HORIZONS = 5   # number of decoder steps

  def __init__(self):
    super().__init__()

    # convolution blocks
    self.conv1 = self._conv_block(
        nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(1,2), stride=(1,2)))
    self.conv2 = self._conv_block(
        nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,2), stride=(1,2)))
    self.conv3 = self._conv_block(
        nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,10)))

    # inception modules
    self.inp1 = self._inception_branch(kernel_height=3)
    self.inp2 = self._inception_branch(kernel_height=5)
    self.inp3 = nn.Sequential(
        nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)),
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
        nn.LeakyReLU(negative_slope=0.01),
    )

    # lstm layers: the encoder consumes the 3 x 64 concatenated inception channels
    self.lstm = nn.LSTM(input_size=192, hidden_size=self.HIDDEN_SIZE, num_layers=1, batch_first=True)

    # the decoder input is [previous output (3), context or encoder state (64)]
    self.decoder_lstm = nn.LSTM(input_size=self.HIDDEN_SIZE + self.NUM_CLASSES,
                                hidden_size=self.HIDDEN_SIZE, num_layers=1, batch_first=True)
    # the classifier sees [context vector (64), decoder output (64)]
    self.fc1 = nn.Linear(2 * self.HIDDEN_SIZE, self.NUM_CLASSES)
    self.BN = nn.BatchNorm1d(1, momentum=0.6)

  @staticmethod
  def _conv_block(first_conv):
    """Return ``first_conv`` followed by two (4,1) 'same' convolutions, each with LeakyReLU."""
    return nn.Sequential(
        first_conv,
        nn.LeakyReLU(negative_slope=0.01),
        nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1), padding='same'),
        nn.LeakyReLU(negative_slope=0.01),
        nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1), padding='same'),
        nn.LeakyReLU(negative_slope=0.01),
    )

  @staticmethod
  def _inception_branch(kernel_height):
    """Return a (1,1) convolution followed by a (kernel_height,1) convolution, 32 -> 64 channels."""
    return nn.Sequential(
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
        nn.LeakyReLU(negative_slope=0.01),
        nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(kernel_height,1), padding='same'),
        nn.LeakyReLU(negative_slope=0.01),
    )

  def forward(self, x, decoder_inputs):
    """Run the encoder once and unroll the decoder for every horizon.

    Args:
      x: tensor of shape (batch, 1, seq_len, 40).
      decoder_inputs: tensor of shape (batch, 1, 3) fed to the first decoder step.

    Returns:
      Tensor of shape (batch, 3, 5) with softmax probabilities per horizon.
    """
    # Allocate the initial LSTM state on the input's device rather than on a
    # module-level global, so the model works wherever it has been moved.
    h0 = torch.zeros(1, x.size(0), self.HIDDEN_SIZE, device=x.device, dtype=x.dtype)
    c0 = torch.zeros_like(h0)

    x = self.conv1(x)
    x = self.conv2(x)
    x = self.conv3(x)

    x = torch.cat((self.inp1(x), self.inp2(x), self.inp3(x)), dim=1)

    # (batch, 192, seq_len, 1) -> (batch, seq_len, 192)
    x = x.permute(0, 2, 1, 3)
    x = torch.reshape(x, (-1, x.shape[1], x.shape[2]))

    encoder_outputs, (h_n, c_n) = self.lstm(x, (h0, c0))
    states = (h_n, c_n)

    # (1, batch, 64) -> (batch, 1, 64) so it can be concatenated with decoder_inputs
    encoder_state_h = h_n.permute(1, 0, 2)
    inputs = torch.cat([decoder_inputs, encoder_state_h], dim=2)

    # transposed once; reused by the attention scores at every step
    encoder_outputs_t = encoder_outputs.permute(0, 2, 1)

    step_outputs = []

    #we iterate for every horizon (10, 20, 30, 50, 100)
    for _ in range(self.NUM_HORIZONS):

      #we pass in input to the decoder the context vector, the last decoder's output and the last decoder's hidden state
      output, states = self.decoder_lstm(inputs, states)

      #computing the attention for the next time step
      attention = torch.softmax(torch.bmm(output, encoder_outputs_t), dim=2)

      #computing the context vector
      c_v = self.BN(torch.bmm(attention, encoder_outputs))

      #computing the distribution for the output with the context vector and the decoder's output
      output = torch.softmax(self.fc1(torch.cat([c_v, output], dim=2)), dim=2)

      # squeeze only the length-1 step axis; a bare squeeze() would also drop
      # the batch axis when the batch holds a single sample
      step_outputs.append(output.squeeze(1))
      inputs = torch.cat([output, c_v], dim=2)

    # stack the (batch, 3) step outputs along a new last axis -> (batch, 3, 5)
    return torch.stack(step_outputs, dim=2)

### **Model Training**

In [None]:
model = DeepLOB_Attention()
model.to(device)
lr = 0.001


def probability_cross_entropy(probs, targets, eps=1e-7):
    """Cross-entropy between predicted probabilities and target distributions.

    Args:
        probs: tensor of shape (batch, classes, horizons) holding probabilities
            (the model's forward pass already ends in a softmax).
        targets: tensor of the same shape holding one-hot (or soft) targets.
        eps: lower clamp applied to ``probs`` so the logarithm stays finite.

    Returns:
        Scalar tensor: the loss summed over classes and averaged over batch and
        horizons.
    """
    log_probs = torch.log(probs.clamp(min=eps))
    return -(targets * log_probs).sum(dim=1).mean()


# nn.CrossEntropyLoss expects unnormalized logits and applies log-softmax
# itself. The model returns softmax probabilities, so using it here would apply
# softmax twice; the loss below consumes the probabilities directly.
criterion = probability_cross_entropy

optimizer = torch.optim.Adam(model.parameters(), lr, eps=1e-07)

def batch_gd(model, criterion, optimizer, epochs):
    """Train ``model`` and keep the checkpoint with the lowest validation loss.

    Batches come from the module-level ``train_loader`` and ``val_loader`` and
    are moved to the module-level ``device``. Whenever the mean validation loss
    improves, the whole model is saved to '/best_model_Attention'.

    Args:
        model: network called as ``model(inputs, decoder_inputs)``.
        criterion: callable ``criterion(outputs, targets)`` returning a scalar loss.
        optimizer: optimizer stepping the parameters of ``model``.
        epochs: number of passes over ``train_loader``.

    Returns:
        Tuple ``(train_losses, test_losses)`` of NumPy arrays of length
        ``epochs`` holding the mean training and validation loss per epoch.
    """

    def _prepare(inputs, targets, decoder_inputs):
        """Permute targets to (batch, classes, horizons) and move the batch to ``device``."""
        targets = torch.permute(targets, (0, 2, 1))
        inputs, targets = inputs.to(device, dtype=torch.float), targets.to(device)
        return inputs, targets, decoder_inputs.to(device)

    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    best_test_loss = np.inf
    best_test_epoch = 0

    for it in tqdm(range(epochs)):

        model.train()
        t0 = datetime.now()
        train_loss = []
        for batch in train_loader:
            inputs, targets, decoder_inputs = _prepare(*batch)

            # zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs, decoder_inputs)
            loss = criterion(outputs, targets)

            # Backward and optimize
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        # Get train loss and test loss
        train_loss = np.mean(train_loss)

        model.eval()
        test_loss = []
        # No gradients are needed for validation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for batch in val_loader:
                inputs, targets, decoder_inputs = _prepare(*batch)
                outputs = model(inputs, decoder_inputs)
                loss = criterion(outputs, targets)
                test_loss.append(loss.item())
        test_loss = np.mean(test_loss)

        # Save losses
        train_losses[it] = train_loss
        test_losses[it] = test_loss

        #We save the best model
        if test_loss < best_test_loss:
            torch.save(model, '/best_model_Attention')
            best_test_loss = test_loss
            best_test_epoch = it
            print('model saved')

        dt = datetime.now() - t0
        print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
          Validation Loss: {test_loss:.4f}, Duration: {dt}, Best Val Epoch: {best_test_epoch}')

    return train_losses, test_losses

In [None]:
print("------- List Hyper Parameters -------")
print("epochs   ->   " + str(epochs))
print("batch size   ->    " + str(batch_size))
print("Optimizer   ->    " + str(optimizer))

train_losses, val_losses = batch_gd(model, criterion, optimizer, epochs)

# Plot both loss curves on explicit axes, labelled so the figure stands alone.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(train_losses, label='train loss')
ax.plot(val_losses, label='validation loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Training and validation loss per epoch')
ax.legend()
plt.show()

### **Model Testing**

In [None]:
# The checkpoint is a fully pickled model object (written with torch.save(model, ...)),
# so it must be loaded with weights_only=False; recent PyTorch versions default
# to weights_only=True and refuse such files.
# NOTE(security): unpickling executes arbitrary code -- only load checkpoints you trust.
model = torch.load('/best_model_Attention', map_location=device, weights_only=False)
model.eval()  # make inference mode explicit instead of relying on the pickled flag

n_correct = 0.
n_total = 0.
cont = 0  # number of test batches processed

# Per-horizon lists of per-batch arrays, concatenated once after the loop
# instead of re-concatenating the growing result on every batch.
target_chunks = [[] for _ in range(5)]
prediction_chunks = [[] for _ in range(5)]

with torch.no_grad():  # inference only: do not build autograd graphs
    for inputs, targets, decoder_inputs in test_loader:
        # Move to GPU
        targets = torch.permute(targets, (0, 2, 1))
        inputs, targets = inputs.to(device, dtype=torch.float), targets.to(device)
        decoder_inputs = decoder_inputs.to(device)
        cont += 1

        # Forward pass
        outputs = model(inputs, decoder_inputs)

        #Get prediction: index of the largest value along the class axis,
        # giving one class id per sample and horizon
        batch_predictions = outputs.argmax(dim=1).cpu().numpy()
        batch_targets = targets.argmax(dim=1).cpu().numpy()

        for i in range(5):
            target_chunks[i].append(batch_targets[:, i])
            prediction_chunks[i].append(batch_predictions[:, i])

# all_targets[i] / all_predictions[i]: 1-D arrays of class ids for horizon i.
all_targets = [
    np.concatenate(chunks) if chunks else np.array([], dtype=np.int64)
    for chunks in target_chunks
]
all_predictions = [
    np.concatenate(chunks) if chunks else np.array([], dtype=np.int64)
    for chunks in prediction_chunks
]

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# For each of the 5 horizons: print the classification report, then draw the
# confusion matrix normalised over the true labels (each row sums to 1).
for i in range(5):
  horizon_targets, horizon_predictions = all_targets[i], all_predictions[i]
  print(classification_report(horizon_targets, horizon_predictions, digits=4))
  ConfusionMatrixDisplay(
      confusion_matrix(horizon_targets, horizon_predictions, normalize="true")
  ).plot()
  plt.show()