In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import math
import matplotlib.pyplot as plt
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from sklearn.preprocessing import RobustScaler
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset, DataLoader



In [2]:
df = pd.read_parquet('arbitrage_features.parquet')

In [3]:
df['spread1'] = df['near_price'] - df['far_price']
raw_features = df[['spread1', 'turnover_ratio', 'vol_curve', 'tick_imbalance']].dropna()
df['future_spread_change'] = df['spread'].shift(-5) - df['spread']
df['signal'] = np.where(df['future_spread_change'] > 0.002, 1,
                 np.where(df['future_spread_change'] < -0.002, -1, 0))

In [4]:
scaler = RobustScaler(quantile_range=(25, 75))
scaled_features = scaler.fit_transform(raw_features)

In [5]:
def create_sequences(data, window_size=60, step_size=10):
    sequences = []
    for i in range(0, len(data) - window_size, step_size):
        sequences.append(data[i:i+window_size])
    return np.array(sequences)

# 60 timestamps（1min ticks）
X = create_sequences(scaled_features, window_size=60)
y = df['signal'][60:-5:10].values

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_tensor = torch.FloatTensor(X)
y_tensor = torch.LongTensor(y)

split_idx = int(len(X_tensor) * 0.8)
train_X, val_X = X_tensor[:split_idx], X_tensor[split_idx:]
train_y, val_y = y_tensor[:split_idx], y_tensor[split_idx:]

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64)

In [7]:
class ArbitrageTransformer(nn.Module):
    def __init__(self, input_dim=4, d_model=64, nhead=8, num_layers=4, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward=128, dropout=dropout)
        self.transformer = TransformerEncoder(encoder_layer, num_layers)
        self.decoder = nn.Linear(d_model, 3)  # 3 outputs：-1, 0, 1

    def forward(self, src, src_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer(src, src_mask)
        last_output = output[-1]
        return self.decoder(last_output)
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=1000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # create (max_len, d_model) position encoding matrix
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))  # (d_model/2,)

        pe = torch.zeros(max_len, 1, d_model)  # (max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)  # even position
        pe[:, 0, 1::2] = torch.cos(position * div_term)  # odd position

        self.register_buffer('pe', pe)  

    def forward(self, x):
        x = x + self.pe[:x.size(0)] 
        return self.dropout(x)

In [8]:
def create_mask(seq, volatility_threshold=0.1):
    seq_len, _ = seq.shape
    mask = torch.ones(seq_len, seq_len, dtype=torch.bool)

    for i in range(seq_len):
        if seq[i].mean().item() > volatility_threshold:
            mask[i, max(0, i-10):i+1] = False
        else:
            mask[i, :i+1] = False 
    
    return mask.to(seq.device)


In [9]:
model = ArbitrageTransformer().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3,
                                                steps_per_epoch=len(train_loader), epochs=20)
loss_fn = nn.CrossEntropyLoss()



In [10]:
def train_epoch(model, loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch_X, batch_y in loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        mask = create_mask(batch_X[:, :, 1].transpose(0, 1))
        batch_X = batch_X.transpose(0, 1)
        optimizer.zero_grad()
        output = model(batch_X, src_mask=mask)
        loss = loss_fn(output, batch_y + 1)  # shift label from [-1,0,1] to [0,1,2]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            mask = create_mask(batch_X[:, :, 1].transpose(0, 1))
            output = model(batch_X.transpose(0, 1), src_mask=mask)
            loss = loss_fn(output, batch_y + 1)
            total_loss += loss.item()
            pred = output.argmax(dim=1)
            correct += (pred == (batch_y + 1)).sum().item()
    return total_loss / len(loader), correct / len(loader.dataset)

In [11]:
best_val_loss = float('inf')
for epoch in range(10):
    train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
    val_loss, val_acc = evaluate(model, val_loader, loss_fn, device)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model_class3.pth")
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")

Epoch 1: Train Loss=1.0748, Val Loss=1.0814
Epoch 2: Train Loss=1.0715, Val Loss=1.0817
Epoch 3: Train Loss=1.0706, Val Loss=1.0816
Epoch 4: Train Loss=1.0697, Val Loss=1.0815
Epoch 5: Train Loss=1.0693, Val Loss=1.0816
Epoch 6: Train Loss=1.0692, Val Loss=1.0816
Epoch 7: Train Loss=1.0692, Val Loss=1.0817
Epoch 8: Train Loss=1.0692, Val Loss=1.0817
Epoch 9: Train Loss=1.0692, Val Loss=1.0817
Epoch 10: Train Loss=1.0691, Val Loss=1.0817


In [12]:
model = ArbitrageTransformer().to(device)
model.load_state_dict(torch.load("best_model_class3.pth"))
model.eval()

all_preds = []
with torch.no_grad():
    for i in range(0, len(X_tensor), 256):
        batch = X_tensor[i:i+256].transpose(0, 1)  # shape: (seq_len, batch, features)
        output = model(batch)  # shape: (batch, 3)
        pred = output.argmax(dim=1).cpu().numpy() - 1  # map back to [-1, 0, 1]
        all_preds.extend(pred)

signals = np.array(all_preds)

backtest_df = df.iloc[60:-5:10].copy() 
backtest_df = backtest_df.iloc[:len(signals)].copy()
backtest_df['signal'] = signals
spread_change = backtest_df['spread1'].pct_change(5).shift(-5)  
backtest_df['strategy_return'] = backtest_df['signal'] * spread_change
backtest_df['strategy_return'] = backtest_df['strategy_return'].fillna(0)

cumulative_return = (1 + backtest_df['strategy_return']).cumprod()
max_drawdown = (cumulative_return / cumulative_return.cummax() - 1).min()
annual_return = cumulative_return.iloc[-1]**(252*24*6 / len(cumulative_return)) - 1
volatility = backtest_df['strategy_return'].std() * np.sqrt(252*24*6)
sharpe_ratio = annual_return / volatility

print(f"Retuen: {(cumulative_return.iloc[-1]-1):.2%}")
print(f"Max Drawdown: {max_drawdown:.2%}")
print(f"Sharpe: {sharpe_ratio:.2f}")



Return: 8.42%
Max Drawdown: 9.14%
Sharpe: 0.86


  sharpe_ratio = annual_return / volatility
