In [None]:
%pip install pandas numpy torch tqdm seaborn matplotlib



In [None]:
import sys
sys.path.append('..')

import torch
import os
import torch.nn as nn
import pandas as pd 
import numpy as np
from torch.utils.data import random_split, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from LOBnet.Models import DeepLOB_Network_v1
from LOBnet.Orderbooks import FI2010_Orderbook_Optimized

In [None]:
# Notebook-wide configuration, consumed by the dataset constructor below.

# Root directory of the benchmark CSV files (passed as `root_path`).
datapath = "../data/BenchmarkDatasets_csv"
# Window length handed to the dataset as `window_size`.
window_size = 100
overlap_percent = 25  # 25% overlap between windows
# Worker count handed to the dataset constructor; the DataLoaders further
# down set their own `num_workers` independently of this value.
num_workers = 6


In [None]:
# Build the training dataset from the settings defined in the config cell.
# The options are collected in one mapping so they can be inspected or
# logged before the (slow) construction step runs.
dataset_options = {
    "root_path": datapath,
    "venue": "NoAuction",
    "normalization_type": "Zscore",
    "split": "Training",
    "window_size": window_size,
    "overlap_percent": overlap_percent,
    "num_workers": num_workers,
    "verbose": True,
}
dataset = FI2010_Orderbook_Optimized(**dataset_options)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux

Processing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_4.csvProcessing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_1.csv

Processing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_7.csv
Processing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_6.csv
Processing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_2.csv
Processing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_3.csv


Processing CSV Files:  11%|█         | 1/9 [00:09<01:17,  9.73s/it]

Processing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_5.csv


Processing CSV Files:  22%|██▏       | 2/9 [00:24<01:29, 12.82s/it]

Processing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_8.csv


Processing CSV Files:  33%|███▎      | 3/9 [00:33<01:05, 10.97s/it]

Processing file: ../data/BenchmarkDatasets_csv/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_9.csv


Processing CSV Files: 100%|██████████| 9/9 [11:14<00:00, 74.97s/it] 



Raw price movement statistics:
Mean: 1.9929
Std: 0.8862
Min: 1.0000
Max: 3.0000

Label distribution:
0    8931
1    4838
2    8771
Name: count, dtype: int64


In [None]:
# create train/test splits (80/20)
# NOTE(review): the dataset is built with overlapping windows
# (`overlap_percent`), so a random split can place windows that share rows in
# both subsets. If that is confirmed, a contiguous/chronological split would
# give a more honest test score.
total_size = len(dataset)
train_size = int(0.8 * total_size)
test_size = total_size - train_size

train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42)  # for reproducibility
)
assert len(train_dataset) + len(test_dataset) == total_size


def collate_lob_batch(batch):
    """Stack (features, target) samples into one batch for the network.

    Args:
        batch: sequence of `(features, target)` tensor pairs as yielded by
            the dataset.

    Returns:
        Tuple `(features, targets)`. The stacked features have dims 1 and 2
        swapped and a singleton dim inserted at position 1; the targets are
        stacked along a new leading dim.
    """
    features = torch.stack([sample[0] for sample in batch])
    features = features.transpose(1, 2).unsqueeze(1)
    targets = torch.stack([sample[1] for sample in batch])
    return features, targets


# Create DataLoaders
batch_size = 32
# Pinned host memory is only useful when batches are copied to a GPU.
pin_memory = torch.cuda.is_available()

# A named module-level collate function (instead of two identical lambdas)
# keeps both loaders in sync and stays picklable if num_workers is raised.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=pin_memory,
    collate_fn=collate_lob_batch,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=pin_memory,
    collate_fn=collate_lob_batch,
)

print(f"Total dataset size: {total_size}")
print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Total dataset size: 22540
Training set size: 18032
Test set size: 4508


In [None]:
# look at first batch
batch = next(iter(train_loader))
batch_X, batch_y = batch  # unpack batch tuple

# The collate step transposes each stacked sample and inserts a singleton
# dim at position 1, so X is 4-D: (batch_size, 1, <dim>, <dim>).
assert batch_X.dim() == 4 and batch_X.shape[1] == 1, batch_X.shape

print(f"Batch X shape: {batch_X.shape}")
print(f"Batch y shape: {batch_y.shape}")

Batch X shape: torch.Size([32, 1, 100, 40])
Batch y shape: torch.Size([32, 3])


In [None]:
from LOBnet.Models import DeepLOB_Network_v2

In [None]:
# Train DeepLOB_Network_v2 on `train_loader` and keep the checkpoint with the
# lowest mean training loss in 'best_model_1.pth'.
#
# NOTE(review): the recorded run of this cell aborted on the first batch with
# "AttributeError: 'tuple' object has no attribute 'dim'". The traceback is
# truncated, so the origin cannot be confirmed from the notebook; presumably
# it is raised inside the model's forward pass or by the loss call. Check
# LOBnet.Models.DeepLOB_Network_v2 before relying on this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DeepLOB_Network_v2(y_len=3, device=device).to(device)
# The loss is fed class indices (see `labels` below), which is the target
# format `ignore_index` applies to.
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Training parameters
num_epochs = 25
best_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')

    for batches_seen, (batch_X, batch_y) in enumerate(pbar, start=1):
        batch_X = batch_X.float().to(device)
        batch_y = batch_y.to(device)
        # Targets arrive as (batch, 3) rows; assuming they are one-hot, the
        # argmax is the class index and the loss value is unchanged.
        labels = batch_y.argmax(dim=1)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Calculate accuracy (no gradient tracking needed for the metric)
        with torch.no_grad():
            predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        running_loss += loss.item()

        # Average over the batches processed so far, not over the whole
        # epoch, so the displayed loss is meaningful mid-epoch.
        pbar.set_postfix({
            'loss': f'{running_loss / batches_seen:.4f}',
            'acc': f'{100 * correct/total:.2f}%'
        })

    # Epoch statistics
    epoch_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch+1} Loss: {epoch_loss:.4f}')

    # Save best model.
    # NOTE(review): "best" is judged on training loss; a held-out validation
    # loss would be a safer selection criterion.
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_loss,
        }, 'best_model_1.pth')

print('Training finished!')

Epoch 1/25:   0%|          | 0/564 [00:01<?, ?it/s]


AttributeError: 'tuple' object has no attribute 'dim'

In [None]:
def evaluate_model(model, test_loader, device):
    """Run `model` over `test_loader` and collect class predictions.

    The model is switched to evaluation mode (it is left in that mode on
    return) and inputs are cast to float32, mirroring the training loop.

    Args:
        model: network mapping a feature batch to per-class scores of shape
            (batch, n_classes).
        test_loader: iterable yielding `(batch_X, batch_y)` tensor pairs,
            where `batch_y` holds one row of per-class targets per sample.
        device: torch device the batches are moved to before the forward pass.

    Returns:
        Tuple `(all_predictions, all_labels)` of equal-length lists holding
        the argmax class index of the model output and of the target row for
        every sample, in loader order.
    """
    all_predictions = []
    all_labels = []

    # Disable dropout / use running batch-norm statistics during scoring.
    model.eval()

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            # Same float cast as in training, so non-float32 inputs do not
            # trigger a dtype mismatch inside the model.
            batch_X = batch_X.float().to(device)
            batch_y = batch_y.to(device)
            outputs = model(batch_X)

            # convert per-class scores / one-hot targets to class indices
            predictions = outputs.argmax(dim=1)
            true_labels = batch_y.argmax(dim=1)

            all_predictions.extend(predictions.cpu().tolist())
            all_labels.extend(true_labels.cpu().tolist())

    return all_predictions, all_labels

In [None]:
# Load the best checkpoint, score the held-out split and plot the results.
# Requires the import, dataset and DataLoader cells to have been run first.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = 'best_model_1.pth'
if not os.path.exists(model_path):
    # Raise instead of calling exit(): exit() is meant for terminating an
    # interpreter session, not for stopping a notebook cell.
    raise FileNotFoundError(
        f"No saved model found at {model_path}. Please train the model first."
    )

print(f"Loading saved model from {model_path}")
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
checkpoint = torch.load(model_path, map_location=device)
# Rebuild the network here so this cell does not rely on a `model` variable
# left over from the training cell; the arguments must match the ones used
# for training.
model = DeepLOB_Network_v2(y_len=3, device=device).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # set to evaluation mode

predictions, true_labels = evaluate_model(model, test_loader, device)

# NOTE(review): display names are assumed to correspond to class indices
# 0, 1, 2 in this order — confirm against the dataset's label encoding.
class_names = ['No Movement', 'Up', 'Down']
# Passing the label ids explicitly keeps the report and the matrix 3x3 even
# when a class never appears in the predictions or the test labels.
class_ids = list(range(len(class_names)))

print("\nClassification Report:")
print(classification_report(true_labels, predictions,
                            labels=class_ids,
                            target_names=class_names))

cm = confusion_matrix(true_labels, predictions, labels=class_ids)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

NameError: name 'torch' is not defined