<a href="https://colab.research.google.com/github/MasterBeard/EigenCluster-Tokenization-for-Financial-Transformers/blob/main/ICLR_review3_VQ_VAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import yfinance as yf

# Define index codes - smaller set for train/val
train_val_tickers = {
    'SPX': '^GSPC',     # S&P 500
    'IXIC': '^IXIC',    # NASDAQ Composite
    'HSI': '^HSI',      # Hang Seng Index
    'DJI': '^DJI',      # Dow Jones Industrial Average
    'FCHI': '^FCHI',    # CAC 40
    'DAXI': '^GDAXI',   # DAX
    'N225': '^N225',    # Nikkei 225
    'KS11': '^KS11',    # KOSPI
    'SENSEX': '^BSESN', # BSE Sensex
    'STOXX50': '^STOXX50E',  # EURO STOXX 50
    #'SSEC': '000001.SS', # Shanghai Composite
    #'SZSC': '399001.SZ', # Shenzhen Component
}

# Full set of indices for test
test_tickers = {
    # North America
    'SPX': '^GSPC',     # S&P 500
    'IXIC': '^IXIC',    # NASDAQ Composite
    'DJI': '^DJI',      # Dow Jones Industrial Average
    'RUT': '^RUT',      # Russell 2000
    #'VIX': '^VIX',      # CBOE Volatility Index
    'TSX': '^GSPTSE',   # S&P/TSX Composite (Canada)

    # Europe
    'FTSE': '^FTSE',    # FTSE 100 (UK)
    'DAXI': '^GDAXI',   # DAX (Germany)
    'CAC': '^FCHI',     # CAC 40 (France)
    'STOXX50': '^STOXX50E', # EURO STOXX 50
    'IBEX': '^IBEX',    # IBEX 35 (Spain)
    'FTMIB': 'FTSEMIB.MI', # FTSE MIB (Italy)
    'SMI': '^SSMI',     # Swiss Market Index

    # Asia
    'HSI': '^HSI',      # Hang Seng (Hong Kong)
    'N225': '^N225',    # Nikkei 225 (Japan)
    'KS11': '^KS11',    # KOSPI (South Korea)
    'TWII': '^TWII',    # TSEC weighted index (Taiwan)
    'STI': '^STI',      # Straits Times Index (Singapore)
    'JKSE': '^JKSE',    # Jakarta Composite (Indonesia)
    'SET': '^SETI',     # SET Index (Thailand)
    'NIFTY50': '^NSEI', # NIFTY 50 (India)
    'SENSEX': '^BSESN', # BSE Sensex (India)

    # Oceania
    'AXJO': '^AXJO',    # S&P/ASX 200 (Australia)
    'NZ50': '^NZ50',    # S&P/NZX 50 (New Zealand)

    # Emerging Markets
    'MERV': '^MERV',    # MERVAL (Argentina)
    'BOVESPA': '^BVSP', # Bovespa (Brazil)
    'IPC': '^MXX',      # IPC (Mexico)

    # China
    'SSEC': '000001.SS', # Shanghai Composite
    'SZSC': '399001.SZ', # Shenzhen Component

    # Global/Regional
    'EEM': 'EEM',       # MSCI Emerging Markets ETF
}

idm = -2

date_ranges = {
    'train': ("2000-01-01", "2009-12-31"),
    'val': ("2010-01-02", "2010-12-31"),
    'test': ("2011-01-01", "2020-12-31")
}
#date_ranges = {
    #'train': ("2004-01-01", "2013-12-31"),
    #'val': ("2014-01-02", "2014-12-31"),
    #'test': ("2015-01-01", "2024-12-31")
#}

# Initialize data storage
data_splits = {split: {'features': [], 'norm_features': [], 'labels': []} for split in date_ranges}

# Window length
window_size = 11

# Process train and val data (using smaller set of indices)
for split in ['train', 'val']:
    start_date, end_date = date_ranges[split]
    index_data = {}
    for name, ticker in train_val_tickers.items():
        if ticker.endswith('.SZ') or ticker.endswith('.SS'):
            index_data[name] = yf.Ticker(ticker).history(start=start_date, end=end_date, auto_adjust=True)
        else:
            index_data[name] = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)

    # Create feature vectors and labels
    for index_name, data in index_data.items():
        if data.empty:
            continue

        # Fix multi-level column names if needed
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)

        open_values = data['Open'].dropna().values
        close_values = data['Close'].dropna().values
        low_values = data['Low'].dropna().values
        high_values = data['High'].dropna().values

        for start in range(len(data) - window_size + 1):
            open_row = open_values[start:start + window_size]
            low_row = low_values[start:start + window_size]
            high_row = high_values[start:start + window_size]
            close_row = close_values[start:start + window_size]

            # Build feature vector (unnormalized)
            combined = np.array([
                val for i in range(window_size)
                for val in (open_row[i], low_row[i], high_row[i], close_row[i])
            ])

            # Normalized feature vector
            norm_combined = np.array([
                (open_row[i] / close_row[idm],
                low_row[i] / close_row[idm],
                high_row[i] / close_row[idm],
                close_row[i] / close_row[idm])
                for i in range(window_size)
            ]).flatten()

            label = 1 if close_row[-1] > close_row[idm] else 0

            data_splits[split]['features'].append(combined)
            data_splits[split]['norm_features'].append(norm_combined)
            data_splits[split]['labels'].append(label)

# Process test data (using full set of indices)
start_date, end_date = date_ranges['test']
index_data = {}
for name, ticker in test_tickers.items():
    if ticker.endswith('.SZ') or ticker.endswith('.SS'):
        index_data[name] = yf.Ticker(ticker).history(start=start_date, end=end_date, auto_adjust=True)
    else:
        index_data[name] = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)

# Create feature vectors and labels for test data
for index_name, data in index_data.items():
    if data.empty:
        continue

    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    open_values = data['Open'].dropna().values
    close_values = data['Close'].dropna().values
    low_values = data['Low'].dropna().values
    high_values = data['High'].dropna().values

    for start in range(len(data) - window_size + 1):
        open_row = open_values[start:start + window_size]
        low_row = low_values[start:start + window_size]
        high_row = high_values[start:start + window_size]
        close_row = close_values[start:start + window_size]

        combined = np.array([
            val for i in range(window_size)
            for val in (open_row[i], low_row[i], high_row[i], close_row[i])
        ])

        norm_combined = np.array([
            (open_row[i] / close_row[idm],
            low_row[i] / close_row[idm],
            high_row[i] / close_row[idm],
            close_row[i] / close_row[idm])
            for i in range(window_size)
        ]).flatten()

        label = 1 if close_row[-1] > close_row[idm] else 0

        data_splits['test']['features'].append(combined)
        data_splits['test']['norm_features'].append(norm_combined)
        data_splits['test']['labels'].append(label)

# Convert to NumPy arrays
train_features = np.array(data_splits['train']['features'])
train_norm_features = np.array(data_splits['train']['norm_features'])
train_labels = np.array(data_splits['train']['labels'])

val_features = np.array(data_splits['val']['features'])
val_norm_features = np.array(data_splits['val']['norm_features'])
val_labels = np.array(data_splits['val']['labels'])

test_features = np.array(data_splits['test']['features'])
test_norm_features = np.array(data_splits['test']['norm_features'])
test_labels = np.array(data_splits['test']['labels'])

# Print shapes
print(f"Train features shape: {train_features.shape}")
print(f"Train norm_features shape: {train_norm_features.shape}")
print(f"Train labels shape: {train_labels.shape}")

print(f"Validation features shape: {val_features.shape}")
print(f"Validation norm_features shape: {val_norm_features.shape}")
print(f"Validation labels shape: {val_labels.shape}")

print(f"Test features shape: {test_features.shape}")
print(f"Test norm_features shape: {test_norm_features.shape}")
print(f"Test labels shape: {test_labels.shape}")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Train features shape: (23092, 44)
Train norm_features shape: (23092, 44)
Train labels shape: (23092,)
Validation features shape: (2410, 44)
Validation norm_features shape: (2410, 44)
Validation labels shape: (2410,)
Test features shape: (69348, 44)
Test norm_features shape: (69348, 44)
Test labels shape: (69348,)


In [2]:
import numpy as np
from sklearn.cluster import KMeans

# ============================================

# ============================================
def extract_normalized_daily_ohlc(matrices, epsilon=1e-6):
    """

    """
    all_windows = []

    for vec in matrices:
        reshaped = vec.reshape(-1, 4)  # shape: (11, 4)
        for i in range(1, reshaped.shape[0] - 10 + 1):
            window = reshaped[i:i + 10]  # shape: (5, 4)
            flat = window.flatten()   # shape: (20,)

            denominator = flat[-5]  #
            if abs(denominator) < epsilon:
                denominator = 1.0  #

            normalized = flat / denominator
            all_windows.append(normalized)

    return np.array(all_windows)  # shape: [N_samples × 7, 20]

train_days = extract_normalized_daily_ohlc(train_features)
val_days = extract_normalized_daily_ohlc(val_features)
test_days = extract_normalized_daily_ohlc(test_features)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# -------------------------
# Vector Quantizer
# -------------------------
class VectorQuantizer(nn.Module):
    def __init__(self, num_embeddings=256, embedding_dim=16):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim

        # Codebook K × D
        self.embedding = nn.Parameter(torch.randn(num_embeddings, embedding_dim))

    def forward(self, z_e):
        # z_e: (B, 40, D)
        B, T, D = z_e.shape
        flat = z_e.reshape(-1, D)   # (B*40, D)

        # Compute L2 distances
        distances = (
            flat.pow(2).sum(1, keepdim=True)
            - 2 * flat @ self.embedding.t()
            + self.embedding.pow(2).sum(1)
        )

        # nearest codebook entry
        encoding_inds = distances.argmin(dim=1)   # (B*40,)

        # quantized vectors
        z_q = self.embedding[encoding_inds].view(B, T, D)

        # straight-through estimator
        z_q_st = z_e + (z_q - z_e).detach()

        # commitment loss
        commit_loss = F.mse_loss(z_e, z_q.detach())

        return z_q_st, encoding_inds.view(B, T), commit_loss



# -------------------------
# Encoder: 40 → 40 tokens
# -------------------------
class Encoder(nn.Module):
    def __init__(self, embed_dim=16):
        super().__init__()
        self.linear = nn.Linear(1, embed_dim)

    def forward(self, x):
        # x: (B, 40)
        x = x.unsqueeze(-1)    # (B, 40, 1)
        return self.linear(x)  # (B, 40, D)



# -------------------------
# VQ Tokenizer (trained)
# -------------------------
class VQTokenizer(nn.Module):
    def __init__(self, num_embeddings=256, embed_dim=64):
        super().__init__()
        self.encoder = Encoder(embed_dim)
        self.vq = VectorQuantizer(num_embeddings, embed_dim)

    def forward(self, x):
        z_e = self.encoder(x)                # (B, 40, D)
        z_q, token_ids, loss = self.vq(z_e)  # (B, 40), scalar loss
        return token_ids, loss

In [None]:
# data: (23092, 40)
data = torch.tensor(train_days, dtype=torch.float32)

model = VQTokenizer(256, 64)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

batch_size = 128

for step in range(2000):
    idx = torch.randint(0, len(data), (batch_size,))
    x = data[idx]

    optimizer.zero_grad()
    tokens, commit_loss = model(x)

    loss = commit_loss

    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"step {step}, commit_loss = {loss.item():.6f}")

step 0, commit_loss = 1.180495
step 100, commit_loss = 0.868355
step 200, commit_loss = 0.639742
step 300, commit_loss = 0.471031
step 400, commit_loss = 0.345981
step 500, commit_loss = 0.253924
step 600, commit_loss = 0.184158
step 700, commit_loss = 0.133614
step 800, commit_loss = 0.096130
step 900, commit_loss = 0.068217
step 1000, commit_loss = 0.048288
step 1100, commit_loss = 0.033921
step 1200, commit_loss = 0.023306
step 1300, commit_loss = 0.016256
step 1400, commit_loss = 0.010887
step 1500, commit_loss = 0.007637
step 1600, commit_loss = 0.005222
step 1700, commit_loss = 0.003448
step 1800, commit_loss = 0.002445
step 1900, commit_loss = 0.001705


In [None]:
def to_tokens(model, x):
    with torch.no_grad():
        z_e = model.encoder(x)
        _, token_ids, _ = model.vq(z_e)
    return token_ids


tokens = to_tokens(model, data)
print(tokens.shape)       # torch.Size([23092, 40])
print(tokens[0])

torch.Size([23092, 40])
tensor([183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183,
        183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183,
        183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183])


In [None]:
tokens.max(),tokens.min()

(tensor(183), tensor(183))

In [None]:
print(model.vq.embedding.shape)

torch.Size([256, 64])


In [None]:
# simple_discrete_autoencoder_colab.py
# -------------------------------------------------------------
# A minimal discrete autoencoder for tabular data (N, 40) -> tokens.
# Turns continuous vectors of length 40 into discrete tokens via
# Gumbel-Softmax straight-through estimator.
#
# This is intentionally simple and Colab-friendly. Just paste and run.
# -------------------------------------------------------------

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# --------------------------
# Hyperparams
# --------------------------
input_dim = 40       # each row has 40 dims
hidden_dim = 128
z_dim = 64           # continuous embedding dim before quantization
num_tokens = 256     # discrete vocabulary size
batch_size = 64
lr = 1e-3
epochs = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --------------------------
# 1. Simple MLP encoder
# --------------------------
class Encoder(nn.Module):
    def __init__(self, in_dim, hidden, z_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, z_dim)
        )

    def forward(self, x):
        return self.net(x)

# --------------------------
# 2. Gumbel-Softmax quantizer
# --------------------------
class GumbelQuantizer(nn.Module):
    def __init__(self, z_dim, num_tokens):
        super().__init__()
        self.logits = nn.Linear(z_dim, num_tokens)   # map latent -> token logits
        self.embed = nn.Parameter(torch.randn(num_tokens, z_dim) * 0.1)

    def forward(self, z, temp=1.0, hard=True):
        # z: (B, z_dim)
        logits = self.logits(z)            # (B, num_tokens)
        g = F.gumbel_softmax(logits, tau=temp, hard=hard, dim=-1)
        # convert one-hot to embedding
        z_q = g @ self.embed               # (B, z_dim)
        return z_q, logits                 # return continuous quantized and logits

# --------------------------
# 3. Decoder
# --------------------------
class Decoder(nn.Module):
    def __init__(self, z_dim, hidden, out_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, out_dim)
        )
    def forward(self, x):
        return self.net(x)

# --------------------------
# 4. Discrete Autoencoder wrapper
# --------------------------
class DiscreteAutoencoder(nn.Module):
    def __init__(self, in_dim=40):
        super().__init__()
        self.encoder = Encoder(in_dim, hidden_dim, z_dim)
        self.quant = GumbelQuantizer(z_dim, num_tokens)
        self.decoder = Decoder(z_dim, hidden_dim, in_dim)

    def forward(self, x, temp=1.0):
        z = self.encoder(x)
        z_q, logits = self.quant(z, temp=temp, hard=True)
        x_hat = self.decoder(z_q)
        return x_hat, logits

# --------------------------

# --------------------------
# Suppose your real data is a torch.tensor of shape (23092, 40)
real_data = torch.tensor(train_days, dtype=torch.float32)

dataset = TensorDataset(real_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# --------------------------
# Training
# --------------------------
model = DiscreteAutoencoder(input_dim).to(device)
opt = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for (x,) in dataloader:
        x = x.to(device)
        opt.zero_grad()
        x_hat, logits = model(x, temp=0.7)
        loss = F.mse_loss(x_hat, x)
        loss.backward()
        opt.step()
        total_loss += loss.item() * x.size(0)

    print(f"Epoch {epoch+1}, Loss = {total_loss/len(dataset):.6f}")

print("Training finished.")

# --------------------------
# Convert the 40-d vectors into discrete tokens
# --------------------------
# After training, we can obtain discrete token ids by argmax over logits.

def encode_to_tokens(model, data):
    """
    data: (N, 40)
    return: token_ids: (N,) integers in [0, num_tokens-1]
    """
    model.eval()
    with torch.no_grad():
        z = model.encoder(data.to(device))
        logits = model.quant.logits(z)  # (N, num_tokens)
        token_ids = torch.argmax(logits, dim=-1)
        return token_ids.cpu()

# Example: get token ids for the first 5 rows
sample = real_data[:5]
tokens = encode_to_tokens(model, sample)
print("Discrete tokens:", tokens)

Epoch 1, Loss = 0.069419
Epoch 2, Loss = 0.001040
Epoch 3, Loss = 0.001040
Epoch 4, Loss = 0.001042
Epoch 5, Loss = 0.001045
Epoch 6, Loss = 0.001047
Epoch 7, Loss = 0.001058
Epoch 8, Loss = 0.001060
Epoch 9, Loss = 0.001062
Epoch 10, Loss = 0.001075
Training finished.
Discrete tokens: tensor([130, 130, 130, 130, 130])


In [None]:
tokens = encode_to_tokens(model, real_data)

In [None]:
tokens.max(),tokens.min()

(tensor(130), tensor(130))