<a href="https://colab.research.google.com/github/MassilGG/deep-lob-gan/blob/main/DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, math, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("DEVICE:", DEVICE)

# Seed every random number generator this notebook touches:
# torch (CPU and all CUDA devices), NumPy's global RNG and the stdlib RNG.
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)

DEVICE: cuda


In [3]:
# Fetch the project repository; git creates a `deep-lob-gan` directory in the current working directory.
# NOTE(review): the loading cell reads its CSVs directly from /content/, not from inside this clone — confirm where the data files live.
!git clone https://github.com/MassilGG/deep-lob-gan

Cloning into 'deep-lob-gan'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 8 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (8/8), 2.41 MiB | 4.94 MiB/s, done.
Resolving deltas: 100% (1/1), done.


1) Load LOBSTER files

In [2]:
# --- Data location ---------------------------------------------------------
# Common stem of the two LOBSTER files (orderbook + message) to load.
name = "AMZN_2012-06-21_34200000_57600000"
# Directory holding both CSVs; change this when the files live elsewhere.
DATA_DIR = "/content"

# Number of price levels per side. It also appears in the file names, so the
# paths below are derived from it instead of hard-coding the "_5" suffix.
LEVELS = 5

book_path = f"{DATA_DIR}/{name}_orderbook_{LEVELS}.csv"
# The message file is required: event timestamps are read from its first column.
msg_path = f"{DATA_DIR}/{name}_message_{LEVELS}.csv"

# --- Sampling --------------------------------------------------------------
DT_SEC = 10  # resampling step in seconds (paper uses 10s; with a single day, consider 1-5s if too few samples)
DROP_OPEN_MIN = 30   # minutes discarded at the start of the resampled grid
DROP_CLOSE_MIN = 30  # minutes discarded at the end of the resampled grid


def read_lobster_book(path, levels=5, sep=","):
    """
    Load a header-less LOBSTER orderbook file and validate its width.

    Every level contributes four columns (ask_price_i, ask_size_i,
    bid_price_i, bid_size_i for i=1..levels), so the parsed frame must have
    exactly 4*levels columns.

    Parameters: `path` is the CSV location, `levels` the number of book
    levels expected in the file, `sep` the delimiter handed to pandas.
    Returns the parsed DataFrame with integer column labels.
    Raises ValueError when the column count does not equal 4*levels.
    """
    df = pd.read_csv(path, sep=sep, header=None)
    expected = levels * 4
    # Happy path first: the width matches, hand the frame back untouched.
    if df.shape[1] == expected:
        return df
    raise ValueError(f"Expected {expected} columns for {levels} levels, got {df.shape[1]}. "
                     "Check sep delimiter or file format.")

def read_lobster_message(path, sep=",", min_cols=6):
    """
    Load a header-less LOBSTER message file and check that it parsed sensibly.

    Typical LOBSTER message columns:
    0: time (seconds after midnight, float)
    1: event type
    2: order id
    3: size
    4: price
    5: direction

    Parameters: `path` is the CSV location, `sep` the delimiter handed to
    pandas, `min_cols` the minimum number of columns a correctly parsed file
    must have (defaults to the six columns listed above).
    Returns the parsed DataFrame with integer column labels.
    Raises ValueError when fewer than `min_cols` columns were parsed.
    """
    df = pd.read_csv(path, header=None, sep=sep)
    # A wrong delimiter does not fail in pandas: it yields a single wide text
    # column. Checking against the expected width turns that into an error, so
    # callers that retry with another delimiter actually get the chance to.
    if df.shape[1] < min_cols:
        raise ValueError(
            f"Message file seems empty or badly parsed: expected at least {min_cols} "
            f"columns, got {df.shape[1]}. Check sep delimiter or file format."
        )
    return df

# Delimiter auto-detection: comma first, then any run of whitespace.
def smart_read_book(path, levels=5):
    """
    Read an orderbook file without knowing its delimiter in advance.

    The comma-separated parse is attempted first. If it raises for any reason,
    the file is parsed again as whitespace-separated, and whatever that second
    attempt returns or raises is what the caller sees.
    """
    try:
        return read_lobster_book(path, levels=levels, sep=",")
    except Exception:
        # Not swallowed for good: the whitespace attempt below decides the outcome.
        pass
    return read_lobster_book(path, levels=levels, sep=r"\s+")

def smart_read_msg(path):
    """
    Read a message file, trying "," and then whitespace as the delimiter.

    The first delimiter whose parse does not raise wins. The second delimiter
    is attempted only if the first attempt raises. If both attempts raise,
    the exception from the last attempt is re-raised.
    """
    failure = None
    for delimiter in (",", r"\s+"):
        try:
            parsed = read_lobster_message(path, sep=delimiter)
        except Exception as err:
            failure = err
        else:
            return parsed
    raise failure

# Messages are loaded first because they carry the event clock:
# column 0 of the message file is used as the timestamp (in seconds) of each row.
msg_df = smart_read_msg(msg_path)
time_sec = msg_df.iloc[:, 0].to_numpy(dtype=float, copy=True)

# The orderbook snapshots are validated against the configured number of levels.
book_df = smart_read_book(book_path, levels=LEVELS)

print("book_df shape:", book_df.shape)
print("time range (sec):", float(time_sec[0]), "->", float(time_sec[-1]))


FileNotFoundError: [Errno 2] No such file or directory: '/content/AMZN_2012-06-21_34200000_57600000_message_5.csv'

2) Resample to regular Δt grid

In [None]:
def resample_book_last_in_bin(book_df, time_sec, dt_sec=10):
    """
    Sample the order book on a regular clock grid.

    The grid starts at the first timestamp and advances in steps of `dt_sec`
    up to the last timestamp. Each grid point receives the most recent row of
    `book_df` whose timestamp is <= that grid time. `time_sec` has to be
    sorted in ascending order, as np.searchsorted requires.

    Returns (resampled, grid): the selected rows with a fresh 0..n-1 index,
    and the array of grid times.
    """
    start, stop = float(time_sec[0]), float(time_sec[-1])
    # The tiny epsilon keeps the end point when it falls exactly on the grid.
    grid = np.arange(start, stop + 1e-9, dt_sec)
    # side="right" counts the events with time <= grid point; minus one turns
    # that count into the position of the last such event.
    positions = np.searchsorted(time_sec, grid, side="right") - 1
    # Clamp into the valid row range of book_df.
    positions = np.clip(positions, 0, len(book_df) - 1)
    resampled = book_df.iloc[positions].reset_index(drop=True)
    return resampled, grid


book_res, grid_sec = resample_book_last_in_bin(book_df, time_sec, dt_sec=DT_SEC)

# Trim the edges of the session (paper-style): discard DROP_OPEN_MIN minutes
# after the first grid point and DROP_CLOSE_MIN minutes before the last one.
# The cut is done purely in clock time on the resampled grid.
open_cut, close_cut = 60 * DROP_OPEN_MIN, 60 * DROP_CLOSE_MIN
mask = np.logical_and(
    grid_sec >= grid_sec[0] + open_cut,
    grid_sec <= grid_sec[-1] - close_cut,
)
# Apply the same mask to the snapshots and to their timestamps.
book_res = book_res.loc[mask].reset_index(drop=True)
grid_sec = grid_sec[mask]

print("Resampled shape:", book_res.shape)

3) Build "centered snapshot" vector X_t of length 2k (k=LEVELS) using a price grid fixed at time t:
- grid_prices_t = [bidP_k..bidP1, askP1..askP_k]
- X_t = [ -bidS_k..-bidS1, +askS1..+askS_k ]
- X_{t+dt} is computed on the SAME grid_prices_t by looking up sizes at t+dt (missing => 0).


In [None]:
def parse_level_columns(row, levels=5):
    """
    Split one orderbook row into per-side price and size arrays.

    The row is read in the interleaved layout
      [askP1, askS1, bidP1, bidS1, askP2, askS2, bidP2, bidS2, ...]
    so level i occupies positions 4*i .. 4*i+3. Adapt the offsets here if
    your file uses a different layout.

    Returns (askP, askS, bidP, bidS), four float arrays of length `levels`.
    """
    values = row.values.astype(float)
    # Position of the first field (ask price) of every level.
    level_start = 4 * np.arange(levels)
    askP = values[level_start]
    askS = values[level_start + 1]
    bidP = values[level_start + 2]
    bidS = values[level_start + 3]
    return askP, askS, bidP, bidS

def snapshot_on_own_grid(row, levels=5):
    """
    Build the signed-size vector of a snapshot on its own price grid.

    Returns (grid_prices, x), both of length 2*levels:
      grid_prices = [bidP_k..bidP_1, askP_1..askP_k]
      x           = [-bidS_k..-bidS_1, +askS_1..+askS_k]
    i.e. bids are listed from the deepest level to the best one and carry a
    negative sign, asks from the best level to the deepest with a positive sign.
    """
    askP, askS, bidP, bidS = parse_level_columns(row, levels=levels)
    # Reverse the bid side so the grid follows the order documented above.
    bid_prices_deep_first = np.flip(bidP)
    bid_sizes_deep_first = np.flip(bidS)
    grid_prices = np.concatenate((bid_prices_deep_first, askP))
    x = np.concatenate((np.negative(bid_sizes_deep_first), askS))
    return grid_prices, x

def snapshot_on_given_grid(row, grid_prices, levels=5):
    """
    Project a snapshot onto an externally supplied price grid.

    For every price in `grid_prices` the result holds the signed size quoted
    at that exact price in `row` (bids negative, asks positive), or 0.0 when
    the price is not among the top `levels` levels of `row`.

    Returns a float array with one entry per grid price.
    """
    askP, askS, bidP, bidS = parse_level_columns(row, levels=levels)
    # Price -> signed size lookup, filled level by level (bid first, then ask).
    signed_size_at = {}
    for bid_price, bid_size, ask_price, ask_size in zip(bidP, bidS, askP, askS):
        signed_size_at[float(bid_price)] = -float(bid_size)
        signed_size_at[float(ask_price)] = float(ask_size)
    sizes_on_grid = [signed_size_at.get(float(price), 0.0) for price in grid_prices]
    return np.array(sizes_on_grid, dtype=float)

# Assemble the transition pairs (S_t, X_{t+1}) with S_t = X_t (Markov):
# the state at t lives on its own price grid, and the next snapshot is
# re-expressed on that same grid so both vectors are comparable.
X_list, Y_list = [], []
n_snapshots = len(book_res)
for t in range(n_snapshots - 1):
    grid_prices_t, x_t = snapshot_on_own_grid(book_res.iloc[t], levels=LEVELS)
    X_list.append(x_t)
    x_next_on_grid_t = snapshot_on_given_grid(book_res.iloc[t + 1], grid_prices_t, levels=LEVELS)
    Y_list.append(x_next_on_grid_t)

# Both arrays have shape (N-1, 2k).
X, Y = (np.stack(rows) for rows in (X_list, Y_list))
print("X,Y shapes:", X.shape, Y.shape)

3bis) Visualisation of the X, Y data

In [None]:
# Index of the resampled snapshot to draw; it must be a valid row of book_res.
t = 100

grid_prices, x = snapshot_on_own_grid(book_res.iloc[t], levels=LEVELS)
k = len(grid_prices) // 2  # first k grid entries are bids, last k are asks

# One row per grid price. Sizes are stored as positive numbers per side:
# x holds bids with a negative sign, hence the sign flip on the first half.
lob = pd.DataFrame({
    "price": grid_prices,
    "bid_size": np.r_[ -x[:k], np.zeros(k) ],
    "ask_size": np.r_[ np.zeros(k), x[k:] ]
})

# Smallest gap between distinct prices, used to size the bars so they do not overlap.
tick = np.min(np.diff(np.unique(lob["price"])))

fig, ax = plt.subplots(figsize=(6, 4))

# Bids extend to the left of zero, asks to the right.
ax.barh(lob["price"], -lob["bid_size"], height=0.8 * tick, label="Bids")
ax.barh(lob["price"],  lob["ask_size"], height=0.8 * tick, label="Asks")

ax.axvline(0, color="black", linewidth=1)
ax.set_xlabel("Size (bids left, asks right)")
ax.set_ylabel("Price")
ax.set_title(f"Limit Order Book — t={t}")
ax.grid(alpha=0.3)
# The bars carry labels; without this call they are never shown.
ax.legend()

plt.tight_layout()
plt.show()


4) Signed-sqrt normalization (paper Eq. (18))

x_norm = sign(x)*sqrt(|x|)/c

In [None]:
def signed_sqrt_norm(x, c):
    """
    Signed square-root normalisation: sign(x) * sqrt(|x|) / c.

    Works element-wise on scalars and NumPy arrays; `c` is the scale constant.
    """
    magnitude = np.sqrt(np.abs(x))
    return np.sign(x) * magnitude / c

def signed_sqrt_inv(xn, c):
    """
    Undo signed_sqrt_norm: sign(xn) * (|xn| * c)**2.

    Works element-wise on scalars and NumPy arrays; `c` must be the same
    scale constant that was used for the forward transform.
    """
    root = np.abs(xn) * c
    return np.sign(xn) * root**2

# Scale constant c: the 99.5th percentile of sqrt(|value|) taken over every
# entry of X and Y, plus a tiny epsilon so the division can never be by zero.
# NOTE(review): c is computed on all pairs, i.e. before the train/val split,
# so rows that later end up in validation also contribute to it.
all_values = np.concatenate([X.ravel(), Y.ravel()])
abs_sqrt = np.sqrt(np.abs(all_values))
c = np.percentile(abs_sqrt, 99.5) + 1e-8
print("Normalization constant c =", c)

# float32 copies of the normalised inputs and targets.
Xn, Yn = (signed_sqrt_norm(arr, c).astype(np.float32) for arr in (X, Y))

In [None]:
# Compare the distribution of signed volumes before and after normalisation.
raw = np.concatenate([X.flatten(), Y.flatten()])
normed = signed_sqrt_norm(raw, c)

fig, axs = plt.subplots(1, 2, figsize=(10, 4))
ax_raw, ax_norm = axs

# Left panel: raw values, with a log count axis.
ax_raw.hist(raw, bins=200)
ax_raw.set_yscale("log")
ax_raw.set_title("Raw signed volumes")
ax_raw.grid(alpha=0.3)

# Right panel: normalised values on a linear count axis.
ax_norm.hist(normed, bins=200)
ax_norm.set_title("After signed sqrt normalization")
ax_norm.grid(alpha=0.3)

plt.tight_layout()
plt.show()


5) Train/val split

In [None]:
# Random 90% / 10% split of the transition pairs, drawn from NumPy's global RNG.
N = len(Xn)
split = int(0.9 * N)
perm = np.random.permutation(N)
tr_idx, va_idx = np.split(perm, [split])

# The same indices are applied to inputs and targets so pairs stay aligned.
Xtr, Xva = Xn[tr_idx], Xn[va_idx]
Ytr, Yva = Yn[tr_idx], Yn[va_idx]
print("Train:", Xtr.shape, "Val:", Xva.shape)

6) Torch Dataset

In [None]:
class LobTransitionDataset(Dataset):
    """
    Map-style dataset of (condition, target) pairs.

    Item i is the tuple (Xcond[i], Ytarget[i]) as torch tensors. The length of
    the dataset is the number of rows in Xcond.
    """

    def __init__(self, Xcond, Ytarget):
        # torch.from_numpy wraps the NumPy buffers without copying them.
        self.X = torch.from_numpy(Xcond)
        self.Y = torch.from_numpy(Ytarget)

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, i):
        condition = self.X[i]
        target = self.Y[i]
        return condition, target

# Smaller batches when the training set has fewer than 2048 pairs.
BATCH_SIZE = 128 if len(Xtr) < 2048 else 512

# Training: reshuffled every epoch, incomplete final batch dropped.
# NOTE(review): with drop_last=True a training set smaller than BATCH_SIZE yields no batch at all.
train_loader = DataLoader(
    LobTransitionDataset(Xtr, Ytr),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
# Validation: fixed order, every sample kept.
val_loader = DataLoader(
    LobTransitionDataset(Xva, Yva),
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
)
