In [2]:
import os

# List all files in the dataset folder
base_path = "/kaggle/input/crypto-order-book/crypto_order_book"
for root, dirs, files in os.walk(base_path):
    print(root)
    for f in files:
        print("  ", f)


/kaggle/input/crypto-order-book/crypto_order_book
/kaggle/input/crypto-order-book/crypto_order_book/ETH
   ETH_1min.csv
   ETH_5min.csv
   ETH_1sec.csv
/kaggle/input/crypto-order-book/crypto_order_book/ADA
   ADA_5min.csv
   ADA_1sec.csv
   ADA_1min.csv
/kaggle/input/crypto-order-book/crypto_order_book/BTC
   BTC_5min.csv
   BTC_1sec.csv
   BTC_1min.csv


In [3]:
import pandas as pd

# File paths
files = [
    "/kaggle/input/crypto-order-book/crypto_order_book/ETH/ETH_1sec.csv",
    "/kaggle/input/crypto-order-book/crypto_order_book/ADA/ADA_1sec.csv",
    "/kaggle/input/crypto-order-book/crypto_order_book/BTC/BTC_1sec.csv"
]

# Load and concatenate
dfs = [pd.read_csv(f) for f in files]
df_all = pd.concat(dfs, ignore_index=True)

print("Combined DataFrame shape:", df_all.shape)
print(df_all.head())


Combined DataFrame shape: (3092036, 156)
   Unnamed: 0                       system_time  midpoint  spread  \
0           0  2021-04-07 11:32:50.861733+00:00  1970.985    0.15   
1           1  2021-04-07 11:32:51.861733+00:00  1970.985    0.15   
2           2  2021-04-07 11:32:52.861733+00:00  1970.985    0.15   
3           3  2021-04-07 11:32:53.861733+00:00  1970.985    0.15   
4           4  2021-04-07 11:32:54.861733+00:00  1971.135    0.03   

          buys     sells  bids_distance_0  bids_distance_1  bids_distance_2  \
0     0.000000  0.000000        -0.000038        -0.000495        -0.000500   
1     0.000000  0.000000        -0.000038        -0.000495        -0.000500   
2     0.000000  0.000000        -0.000038        -0.000495        -0.000500   
3     0.000000  0.000000        -0.000038        -0.000495        -0.000500   
4  5832.007602  1.962869        -0.000008        -0.000571        -0.000576   

   bids_distance_3  ...  asks_market_notional_5  asks_market_notional

In [4]:
# Check columns of your combined dataset
print(df_all.columns)


Index(['Unnamed: 0', 'system_time', 'midpoint', 'spread', 'buys', 'sells',
       'bids_distance_0', 'bids_distance_1', 'bids_distance_2',
       'bids_distance_3',
       ...
       'asks_market_notional_5', 'asks_market_notional_6',
       'asks_market_notional_7', 'asks_market_notional_8',
       'asks_market_notional_9', 'asks_market_notional_10',
       'asks_market_notional_11', 'asks_market_notional_12',
       'asks_market_notional_13', 'asks_market_notional_14'],
      dtype='object', length=156)


In [5]:
import numpy as np

SEQ_LEN = 50  # sequence length

# Select a manageable set of features (you had 156 total, but let's use fewer first)
feature_cols = [
    "midpoint", "spread", "buys", "sells",
    "bids_distance_0", "bids_distance_1", "bids_distance_2",
    "asks_distance_0", "asks_distance_1", "asks_distance_2"
]
df_features = df_all[feature_cols].fillna(0).values

# Target: future mid-price movement
df_target = np.sign(df_all["midpoint"].shift(-1) - df_all["midpoint"])
df_target = (df_target > 0).astype(int).values  # 1 = up, 0 = down/flat

# Build sequences
X_sequences, y_sequences = [], []
for i in range(len(df_features) - SEQ_LEN):
    X_sequences.append(df_features[i:i+SEQ_LEN])
    y_sequences.append(df_target[i+SEQ_LEN])

X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)

print("Sequences shape:", X_sequences.shape, "Labels shape:", y_sequences.shape)


  result = getattr(ufunc, method)(*inputs, **kwargs)


Sequences shape: (3091986, 50, 10) Labels shape: (3091986,)


In [7]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# ======================
# 1. Sample data (to fit in RAM/GPU)""
# ======================
N_SAMPLES = 200_000   # keep this safe for Kaggle P100, increase later
idx = np.random.choice(len(X_sequences), size=N_SAMPLES, replace=False)
X_sample = X_sequences[idx]
y_sample = y_sequences[idx]

# ======================
# 2. Normalize features
# ======================
scaler = StandardScaler()
X_reshaped = X_sample.reshape(-1, X_sample.shape[2])  # (N*seq_len, features)
X_scaled = scaler.fit_transform(X_reshaped)
X_sample = X_scaled.reshape(X_sample.shape)  # back to (N, seq_len, features)

# ======================
# 3. Train/val split
# ======================
split = int(0.8 * len(X_sample))
X_train, X_val = X_sample[:split], X_sample[split:]
y_train, y_val = y_sample[:split], y_sample[split:]

# Convert to torch
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
X_val_t   = torch.tensor(X_val, dtype=torch.float32)
y_val_t   = torch.tensor(y_val, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=64, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=64)

# ======================
# 4. Model (TCN + Attention)
# ======================
class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size
    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()

class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super().__init__()
        self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size, stride=stride,
                               padding=padding, dilation=dilation)
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size, stride=stride,
                               padding=padding, dilation=dilation)
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)

class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size=3, dropout=0.2):
        super().__init__()
        layers = []
        for i in range(len(num_channels)):
            dilation_size = 2 ** i
            in_ch = num_inputs if i == 0 else num_channels[i-1]
            out_ch = num_channels[i]
            layers.append(TemporalBlock(in_ch, out_ch, kernel_size, stride=1,
                                        dilation=dilation_size, padding=(kernel_size-1)*dilation_size,
                                        dropout=dropout))
        self.network = nn.Sequential(*layers)
    def forward(self, x):
        return self.network(x)

class TCN_Attention(nn.Module):
    def __init__(self, num_features, num_classes=2):
        super().__init__()
        self.tcn = TemporalConvNet(num_features, [64, 64, 64], kernel_size=3, dropout=0.2)
        self.attention = nn.Sequential(
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, 1)
        )
        self.fc = nn.Linear(64, num_classes)
    def forward(self, x):
        x = x.transpose(1, 2)              # (batch, features, seq_len)
        tcn_out = self.tcn(x)              # (batch, channels, seq_len)
        tcn_out = tcn_out.transpose(1, 2)  # (batch, seq_len, channels)
        attn_weights = torch.softmax(self.attention(tcn_out), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * tcn_out, dim=1)            # (batch, channels)
        return self.fc(context)

# ======================
# 5. Training
# ======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TCN_Attention(num_features=X_train.shape[2]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 20
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            preds = torch.argmax(out, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(yb.cpu().numpy())
    val_acc = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average='macro')
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")


Epoch 1/20, Loss: 0.5783, Val Acc: 0.7191, Val F1: 0.4188
Epoch 2/20, Loss: 0.5743, Val Acc: 0.7191, Val F1: 0.4196
Epoch 3/20, Loss: 0.5728, Val Acc: 0.7191, Val F1: 0.4184
Epoch 4/20, Loss: 0.5718, Val Acc: 0.7188, Val F1: 0.4210
Epoch 5/20, Loss: 0.5711, Val Acc: 0.7182, Val F1: 0.4240
Epoch 6/20, Loss: 0.5707, Val Acc: 0.7185, Val F1: 0.4206
Epoch 7/20, Loss: 0.5700, Val Acc: 0.7190, Val F1: 0.4198
Epoch 8/20, Loss: 0.5692, Val Acc: 0.7191, Val F1: 0.4191
Epoch 9/20, Loss: 0.5688, Val Acc: 0.7192, Val F1: 0.4195
Epoch 10/20, Loss: 0.5686, Val Acc: 0.7182, Val F1: 0.4257
Epoch 11/20, Loss: 0.5680, Val Acc: 0.7186, Val F1: 0.4201
Epoch 12/20, Loss: 0.5678, Val Acc: 0.7188, Val F1: 0.4206
Epoch 13/20, Loss: 0.5676, Val Acc: 0.7183, Val F1: 0.4233
Epoch 14/20, Loss: 0.5670, Val Acc: 0.7175, Val F1: 0.4240
Epoch 15/20, Loss: 0.5669, Val Acc: 0.7189, Val F1: 0.4214
Epoch 16/20, Loss: 0.5667, Val Acc: 0.7183, Val F1: 0.4253
Epoch 17/20, Loss: 0.5663, Val Acc: 0.7186, Val F1: 0.4216
Epoch 

In [None]:
import numpy as np

sequence_length = 50
sequences = []
labels = []

for i in range(len(X) - sequence_length):
    sequences.append(X[i:i+sequence_length])
    labels.append(y[i+sequence_length])

X_seq = np.array(sequences)
y_seq = np.array(labels)

print("Sequences shape:", X_seq.shape, "Labels shape:", y_seq.shape)
# Output: (num_samples, sequence_length, num_features)


In [1]:
# ===============================================
# Low-Latency Microstructure Prediction Engine
# Dataset: Crypto Limit Order Book (LOB) Example
# Model: TCN + Attention
# ===============================================

# -----------------------------
# 1. Imports
# -----------------------------
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# -----------------------------
# 2. Dataset Loader & Feature Engineering
# -----------------------------
class LOBDataset(Dataset):
    def __init__(self, df, sequence_length=50):
        self.sequence_length = sequence_length
        self.features, self.labels = self.preprocess(df)
    
    def preprocess(self, df):
        # Feature Engineering
        # Mid-price
        df['mid_price'] = (df['bid_1'] + df['ask_1']) / 2
        df['mid_price_shift'] = df['mid_price'].shift(-1)
        # Label: 1 = up, 0 = down, 2 = neutral
        df['label'] = np.where(df['mid_price_shift'] > df['mid_price'], 1,
                               np.where(df['mid_price_shift'] < df['mid_price'], 0, 2))
        
        # Select features
        feature_cols = ['bid_1','ask_1','bid_2','ask_2','bid_3','ask_3',
                        'bid_1_size','ask_1_size','bid_2_size','ask_2_size',
                        'bid_3_size','ask_3_size','mid_price']
        X = df[feature_cols].fillna(0).values
        y = df['label'].fillna(2).astype(int).values
        
        # Normalize features
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        
        # Create sequences
        sequences = []
        seq_labels = []
        for i in range(len(X) - self.sequence_length):
            sequences.append(X[i:i+self.sequence_length])
            seq_labels.append(y[i+self.sequence_length])
        return torch.tensor(np.array(sequences), dtype=torch.float32), torch.tensor(seq_labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# -----------------------------
# 3. Model: TCN + Attention
# -----------------------------
class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size
    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()

class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super().__init__()
        self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size,
                               stride=stride, padding=padding, dilation=dilation)
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)
        
        self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size,
                               stride=stride, padding=padding, dilation=dilation)
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)
        
        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
    
    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)

class TCN(nn.Module):
    def __init__(self, input_size, output_size, num_channels, kernel_size=3, dropout=0.2):
        super().__init__()
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = input_size if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers.append(TemporalBlock(in_channels, out_channels, kernel_size, stride=1,
                                        dilation=dilation_size, padding=(kernel_size-1)*dilation_size,
                                        dropout=dropout))
        self.tcn = nn.Sequential(*layers)
        self.attention = nn.Sequential(
            nn.Linear(num_channels[-1], 64),
            nn.Tanh(),
            nn.Linear(64, 1),
            nn.Softmax(dim=1)
        )
        self.fc = nn.Linear(num_channels[-1], output_size)
    
    def forward(self, x):
        # x shape: batch, seq_len, features -> TCN expects batch, features, seq_len
        x = x.transpose(1,2)
        y1 = self.tcn(x)  # batch, channels, seq_len
        y1 = y1.transpose(1,2)  # batch, seq_len, channels
        attn_weights = self.attention(y1)  # batch, seq_len, 1
        y = torch.sum(attn_weights * y1, dim=1)  # weighted sum over sequence
        out = self.fc(y)
        return out

# -----------------------------
# 4. Load Dataset
# -----------------------------
# Example: Using a synthetic CSV (replace with real LOB data)
# CSV should have columns: bid_1, ask_1, bid_2, ask_2, bid_3, ask_3, bid_1_size, ask_1_size, ...
df = pd.read_csv("/kaggle/input/crypto-lob-sample.csv")
train_df, val_df = train_test_split(df, test_size=0.2, shuffle=False)

train_dataset = LOBDataset(train_df)
val_dataset = LOBDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# -----------------------------
# 5. Train the Model
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TCN(input_size=train_dataset.features.shape[2],
            output_size=3,
            num_channels=[64,64,64,64]).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}")

# -----------------------------
# 6. Evaluate
# -----------------------------
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average='weighted')
print(f"Validation Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")

# -----------------------------
# 7. Optional: TorchScript Export for Low-Latency
# -----------------------------
scripted_model = torch.jit.script(model)
scripted_model.save("tcn_attn_model.pt")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/crypto-lob-sample.csv'