In [44]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from tqdm.auto import tqdm

from sklearn.preprocessing import OneHotEncoder

In [45]:
datapath = "../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training"
csv_files = [f for f in os.listdir(datapath) if f.endswith('.csv')]

In [46]:
def split_into_windows(df, window_size=100):
    """
    Split a Pandas DataFrame into non-overlapping windows.
    
    Parameters:
    df (pandas.DataFrame): The input DataFrame to be split.
    window_size (int): The size of each window (number of columns).
    
    Returns:
    list of pandas.DataFrame: A list of DataFrame windows.
    """
    num_windows = df.shape[1] // window_size
    windows = []
    for i in range(num_windows):
        start = i * window_size
        end = start + window_size
        windows.append(df.iloc[:, start:end])
    return windows

In [47]:
X_train_lst = []
Y_train_lst = []
for i,f in enumerate(csv_files):
    print(f"Working on file {i}")
    df = pd.read_csv(os.path.join(datapath, f), header=None)
    X_train_lst += split_into_windows(df.iloc[0:40,:], window_size=100)
    Y_train_lst += split_into_windows(df.iloc[-5:,:]) 

Working on file 0
Working on file 1
Working on file 2
Working on file 3
Working on file 4
Working on file 5
Working on file 6
Working on file 7
Working on file 8


In [48]:
X_train = np.array(X_train_lst).swapaxes(1,2)

# Naively ignore all information about the orderbook in order to get a baseline. Here, we are looking at the deviation from the end of the 
# window to a point 10 events into the future. See pg 13 of https://arxiv.org/pdf/1705.03233
Y_train = np.array(Y_train_lst)[:,-1,-1] 
    

In [49]:
encoder = OneHotEncoder(sparse_output=False)
Y_train = encoder.fit_transform(Y_train.reshape(-1,1))

Y_train

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [50]:
print(X_train.shape)
print(Y_train.shape)

(16901, 100, 40)
(16901, 3)


In [54]:
class NaiveLOBCNN(nn.Module):
    def __init__(self):
        super(NaiveLOBCNN, self).__init__()

        # Conv block see page 5 of https://arxiv.org/pdf/1808.03668
        self.conv1 = nn.Conv2d(in_channels=1,out_channels=16,kernel_size=(1, 2),stride=2)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels=16,out_channels=32,kernel_size=(1, 2),stride=2)
        self.relu2 = nn.ReLU()
        
        self.flatten_size = 32 * 10 * 25
        
        # FC 
        self.fc1 = nn.Linear(self.flatten_size, 64)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(64, 3)  # 3 classes
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Add channel dimension if not present
        if len(x.shape) == 3:
            x = x.unsqueeze(1)
            
        # Convolution blocks
        x = self.relu1(self.conv1(x))
        x = self.relu2(self.conv2(x))
        
        # Flatten
        x = x.view(-1, self.flatten_size)
        
        # Fully connected layers
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        x = self.softmax(x)
        
        return x

In [55]:
class OrderBookDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = OrderBookDataset(X_train, Y_train)

batch_size = 32
dataloader = DataLoader(
    dataset, 
    batch_size=batch_size,
    shuffle=True,
    num_workers=0  # Increase if you need parallel loading
)

In [56]:
model = NaiveLOBCNN()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    pbar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}')
    
    for batch_X, batch_y in pbar:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        _, predicted = torch.max(outputs.data, 1)
        _, labels = torch.max(batch_y.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        running_loss += loss.item()
        
        pbar.set_postfix({
            'loss': f'{running_loss/len(pbar):.4f}',
            'acc': f'{100 * correct/total:.2f}%'
        })
    
    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100 * correct / total
    print(f'\nEpoch {epoch+1} Summary:')
    print(f'Average Loss: {epoch_loss:.4f}')
    print(f'Accuracy: {epoch_acc:.2f}%\n')

Epoch 1/10: 100%|██████████| 529/529 [00:02<00:00, 202.21it/s, loss=1.0579, acc=42.18%]



Epoch 1 Summary:
Average Loss: 1.0579
Accuracy: 42.18%



Epoch 2/10: 100%|██████████| 529/529 [00:02<00:00, 208.05it/s, loss=1.0228, acc=48.82%]



Epoch 2 Summary:
Average Loss: 1.0228
Accuracy: 48.82%



Epoch 3/10: 100%|██████████| 529/529 [00:02<00:00, 207.18it/s, loss=0.9916, acc=53.06%]



Epoch 3 Summary:
Average Loss: 0.9916
Accuracy: 53.06%



Epoch 4/10: 100%|██████████| 529/529 [00:02<00:00, 206.16it/s, loss=0.9633, acc=56.76%]



Epoch 4 Summary:
Average Loss: 0.9633
Accuracy: 56.76%



Epoch 5/10: 100%|██████████| 529/529 [00:02<00:00, 206.39it/s, loss=0.9338, acc=60.22%]



Epoch 5 Summary:
Average Loss: 0.9338
Accuracy: 60.22%



Epoch 6/10: 100%|██████████| 529/529 [00:02<00:00, 205.52it/s, loss=0.9054, acc=63.51%]



Epoch 6 Summary:
Average Loss: 0.9054
Accuracy: 63.51%



Epoch 7/10: 100%|██████████| 529/529 [00:02<00:00, 202.44it/s, loss=0.8820, acc=66.32%]



Epoch 7 Summary:
Average Loss: 0.8820
Accuracy: 66.32%



Epoch 8/10: 100%|██████████| 529/529 [00:02<00:00, 179.81it/s, loss=0.8625, acc=68.19%]



Epoch 8 Summary:
Average Loss: 0.8625
Accuracy: 68.19%



Epoch 9/10: 100%|██████████| 529/529 [00:02<00:00, 199.72it/s, loss=0.8473, acc=69.97%]



Epoch 9 Summary:
Average Loss: 0.8473
Accuracy: 69.97%



Epoch 10/10: 100%|██████████| 529/529 [00:02<00:00, 195.32it/s, loss=0.8311, acc=71.71%]


Epoch 10 Summary:
Average Loss: 0.8311
Accuracy: 71.71%




