In [1]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from tqdm.auto import tqdm

from sklearn.preprocessing import OneHotEncoder
import matplotlib as plt

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
datapath = "/Users/tommydenezza/Desktop/DeepLearning/LOBnet/data/BenchmarkDatasets_csv/BenchmarkDatasets/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training"
csv_files = [f for f in os.listdir(datapath) if f.endswith('.csv')]

In [5]:
def split_into_windows(df, window_size=100):
    """
    Split a Pandas DataFrame into non-overlapping windows.
    
    Parameters:
    df (pandas.DataFrame): The input DataFrame to be split.
    window_size (int): The size of each window (number of columns).
    
    Returns:
    list of pandas.DataFrame: A list of DataFrame windows.
    """
    num_windows = df.shape[1] // window_size
    windows = []
    for i in range(num_windows):
        start = i * window_size
        end = start + window_size
        windows.append(df.iloc[:, start:end])
    return windows

In [6]:
X_train_lst = []
Y_train_lst = []
for i,f in enumerate(csv_files):
    print(f"Working on file {i}")
    df = pd.read_csv(os.path.join(datapath, f), header=None)
    X_train_lst += split_into_windows(df.iloc[0:40,:], window_size=100)
    Y_train_lst += split_into_windows(df.iloc[-5:,:]) 

Working on file 0
Working on file 1
Working on file 2
Working on file 3
Working on file 4
Working on file 5
Working on file 6
Working on file 7
Working on file 8


In [9]:
# X_train = np.array(X_train_lst).swapaxes(1,2)

# # Naively ignore all information about the orderbook in order to get a baseline. Here, we are looking at the deviation from the end of the 
# # window to a point 10 events into the future. See pg 13 of https://arxiv.org/pdf/1705.03233
# Y_train = np.array(Y_train_lst)[:,-1,-1] 

# encoder = OneHotEncoder(sparse_output=False)
# Y_train = encoder.fit_transform(Y_train.reshape(-1,1)) 

X_data = np.array(X_train_lst).swapaxes(1,2)

# Naively ignore all information about the orderbook in order to get a baseline. Here, we are looking at the deviation from the end of the 
# window to a point 10 events into the future. See pg 13 of https://arxiv.org/pdf/1705.03233
Y_data = np.array(Y_train_lst)[:,-1,-1] 

encoder = OneHotEncoder(sparse_output=False)
Y_data = encoder.fit_transform(Y_data.reshape(-1,1)) 

split_point = int(0.75*len(X_data))
X_train, X_test = X_data[split_point:], X_data[:split_point]
Y_train, Y_test = Y_data[split_point:], Y_data[:split_point]


In [20]:
class NaiveLOBCNN(nn.Module):
    def __init__(self):
        super(NaiveLOBCNN, self).__init__()

        # Conv block see page 5 of https://arxiv.org/pdf/1808.03668
        self.conv1 = nn.Conv2d(in_channels=1,out_channels=16,kernel_size=(1, 2),stride=2)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels=16,out_channels=32,kernel_size=(1, 2),stride=2)
        self.relu2 = nn.ReLU()
        
        self.dropout1 = nn.Dropout2d(p=0.3) # Should have a conversation about what our desired dropout is, set to 3/10 rn

        self.flatten_size = 32 * 10 * 25
        
        # FC 
        self.fc1 = nn.Linear(self.flatten_size, 64)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(64, 3)  # 3 classes
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Add channel dimension if not present
        if len(x.shape) == 3:
            x = x.unsqueeze(1)
            
        # Convolution blocks
        x = self.relu1(self.conv1(x))
        x = self.relu2(self.conv2(x))
        
        # Flatten
        x = x.view(-1, self.flatten_size)
        
        # Fully connected layers
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        x = self.softmax(x)
        
        return x


In [11]:
class OrderBookDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = OrderBookDataset(X_train, Y_train)
test_dataset = OrderBookDataset(X_test, Y_test)

batch_size = 32
dataloader = DataLoader(
    train_dataset, 
    batch_size=batch_size,
    shuffle=False,
    num_workers=0  # Increase if you need parallel loading
)

In [13]:
model = NaiveLOBCNN()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    pbar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}')
    
    for batch_X, batch_y in pbar:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        _, predicted = torch.max(outputs.data, 1)
        _, labels = torch.max(batch_y.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        running_loss += loss.item()
        
        pbar.set_postfix({
            'loss': f'{running_loss/len(pbar):.4f}',
            'acc': f'{100 * correct/total:.2f}%'
        })
    
    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100 * correct / total
    print(f'\nEpoch {epoch+1} Summary:')
    print(f'Average Loss: {epoch_loss:.4f}')
    print(f'Accuracy: {epoch_acc:.2f}%\n')

Epoch 1/10: 100%|██████████| 133/133 [00:01<00:00, 103.74it/s, loss=1.0907, acc=38.41%]



Epoch 1 Summary:
Average Loss: 1.0907
Accuracy: 38.41%



Epoch 2/10: 100%|██████████| 133/133 [00:01<00:00, 104.93it/s, loss=1.0771, acc=42.36%]



Epoch 2 Summary:
Average Loss: 1.0771
Accuracy: 42.36%



Epoch 3/10: 100%|██████████| 133/133 [00:01<00:00, 101.59it/s, loss=1.0511, acc=43.33%]



Epoch 3 Summary:
Average Loss: 1.0511
Accuracy: 43.33%



Epoch 4/10: 100%|██████████| 133/133 [00:01<00:00, 104.28it/s, loss=1.0371, acc=46.78%]



Epoch 4 Summary:
Average Loss: 1.0371
Accuracy: 46.78%



Epoch 5/10: 100%|██████████| 133/133 [00:01<00:00, 109.33it/s, loss=1.0229, acc=47.96%]



Epoch 5 Summary:
Average Loss: 1.0229
Accuracy: 47.96%



Epoch 6/10: 100%|██████████| 133/133 [00:01<00:00, 110.76it/s, loss=1.0087, acc=50.05%]



Epoch 6 Summary:
Average Loss: 1.0087
Accuracy: 50.05%



Epoch 7/10: 100%|██████████| 133/133 [00:01<00:00, 110.95it/s, loss=0.9946, acc=52.39%]



Epoch 7 Summary:
Average Loss: 0.9946
Accuracy: 52.39%



Epoch 8/10: 100%|██████████| 133/133 [00:01<00:00, 111.87it/s, loss=0.9930, acc=52.22%]



Epoch 8 Summary:
Average Loss: 0.9930
Accuracy: 52.22%



Epoch 9/10: 100%|██████████| 133/133 [00:01<00:00, 103.08it/s, loss=1.0050, acc=52.08%]



Epoch 9 Summary:
Average Loss: 1.0050
Accuracy: 52.08%



Epoch 10/10: 100%|██████████| 133/133 [00:01<00:00, 109.91it/s, loss=0.9689, acc=56.37%]


Epoch 10 Summary:
Average Loss: 0.9689
Accuracy: 56.37%






In [16]:
test_dataloader = DataLoader(
    test_dataset, 
    batch_size=batch_size,
    shuffle=False,
    num_workers=0  # Increase if needed for parallel data loading
)

model.eval()
pred_batches = tqdm(test_dataloader, desc='Running Test Predictions')

all_predictions = []
all_labels = []

for X_batch, y_batch in pred_batches:
    
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)
    
    test_output = model(X_batch)
    _, test_pred = torch.max(test_output.data, 1)  # Predicted class indices
    
    if y_batch.ndim > 1:
        _, test_labels = torch.max(y_batch.data, 1)
    else:
        test_labels = y_batch 
    
    all_predictions.extend(test_pred.cpu().numpy())
    all_labels.extend(test_labels.cpu().numpy())

all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

accuracy = (all_predictions == all_labels).mean() * 100
print(f"Test Accuracy: {accuracy:.2f}%")



Running Test Predictions: 100%|██████████| 397/397 [00:00<00:00, 546.68it/s]


Test Accuracy: 40.74%


In [18]:
print(all_predictions)
print(all_labels)

[0 2 0 ... 1 1 1]
[0 2 2 ... 0 2 1]
