$$HDC\ Text\ Baseline: European\ Languages$$

# Setup

In [1]:
!pip -q install torch-hd

import torch, torch.nn as nn, torch.nn.functional as F
import torch.utils.data as data
from torchhd import functional, embeddings
from torchhd.datasets import EuropeanLanguages as Languages
import re, time, os

def set_seed(seed=123):
    import random, numpy as np
    random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

set_seed(123)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/361.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m361.0/361.0 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25h

device(type='cuda')

In [2]:
DIMENSIONS     = 10_000
MAX_INPUT_SIZE = 128
BATCH_SIZE     = 256
PADDING_IDX    = 0
PRINT_EVERY    = 100

ASCII_A = ord("a")
ASCII_Z = ord("z")
ASCII_SPACE = ord(" ")
NUM_TOKENS = (ASCII_Z - ASCII_A + 1) + 1 + 1  # letters + space + PAD slot

def char2int(char: str) -> int:
    a = ord(char)
    if a == ASCII_SPACE:
        return (ASCII_Z - ASCII_A + 1)
    if ASCII_A <= a <= ASCII_Z:
        return a - ASCII_A
    return (ASCII_Z - ASCII_A + 1)  # map non a–z to space

def transform(x: str) -> torch.Tensor:
    x = x.lower()
    x = re.sub(r"\s+", " ", x)
    x = x[:MAX_INPUT_SIZE]
    ids = [char2int(ch) + 1 for ch in x]  # shift by +1 so PAD is 0
    if len(ids) < MAX_INPUT_SIZE:
        ids += [PADDING_IDX] * (MAX_INPUT_SIZE - len(ids))
    return torch.tensor(ids, dtype=torch.long)


# Data

In [3]:
train_ds = Languages("/content/data", train=True,  transform=transform, download=True)
test_ds  = Languages("/content/data", train=False, transform=transform, download=True)

train_ld = data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=(DEVICE.type=="cuda"))
test_ld  = data.DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=(DEVICE.type=="cuda"))

len(train_ds), len(test_ds), train_ds.classes


Downloading...
From: https://drive.google.com/uc?id=1zCvjPf0R5pOR46CNBNMM60b_LwQKvltI
To: /content/data/language-recognition/data.zip
100%|██████████| 10.3M/10.3M [00:00<00:00, 27.5MB/s]


Files already downloaded and verified


(210032,
 21000,
 ['Bulgarian',
  'Czech',
  'Danish',
  'Dutch',
  'German',
  'English',
  'Estonian',
  'Finnish',
  'French',
  'Greek',
  'Hungarian',
  'Italian',
  'Latvian',
  'Lithuanian',
  'Polish',
  'Portuguese',
  'Romanian',
  'Slovak',
  'Slovenian',
  'Spanish',
  'Swedish'])

# Model

## Initialize

In [4]:
class Model(nn.Module):
    def __init__(self, num_classes, vocab_size, dim, padding_idx=0):
        super().__init__()
        self.symbol = embeddings.Random(vocab_size, dim, padding_idx=padding_idx)
        self.classify = nn.Linear(dim, num_classes, bias=False)
        with torch.no_grad():
            self.classify.weight.zero_()

    @torch.no_grad()
    def encode(self, x_ids: torch.Tensor) -> torch.Tensor:
        # We rely on TorchHD's ngrams (n=3) and hard_quantize, identical to the example.
        symbols = self.symbol(x_ids)                 # [B, T, D]
        hv = functional.ngrams(symbols, n=3)         # [B, D]
        hv = functional.hard_quantize(hv)            # sign -> {-1,+1}
        return hv

    def forward(self, x_ids: torch.Tensor) -> torch.Tensor:
        enc = self.encode(x_ids)                     # [B, D]
        return self.classify(enc)                    # [B, C]

model = Model(len(train_ds.classes), NUM_TOKENS, DIMENSIONS, padding_idx=PADDING_IDX).to(DEVICE)

## Train prototypes

In [6]:
# Train: single-pass prototype accumulation
t0 = time.time()
with torch.no_grad():
    for bi, (samples, labels) in enumerate(train_ld, 1):
        samples = samples.to(DEVICE, non_blocking=True)
        labels  = labels.to(DEVICE, non_blocking=True)
        samples_hv = model.encode(samples)                          # [B, D], bipolar
        model.classify.weight.index_add_(0, labels, samples_hv)     # accumulate into class rows
        if bi % PRINT_EVERY == 0:
            print(f"[train] {bi}/{len(train_ld)}")
            print(f"  |  elapsed: {time.time() - t0:.1f}s")

    # Normalize class rows (cosine-like scoring)
    model.classify.weight[:] = F.normalize(model.classify.weight, dim=1)
    print(f"Total Time Elapsed: {time.time() - t0:.1f}s")

  hv = functional.hard_quantize(hv)            # sign -> {-1,+1}


[train] 100/821
  |  elapsed: 11.1s
[train] 200/821
  |  elapsed: 22.1s
[train] 300/821
  |  elapsed: 33.1s
[train] 400/821
  |  elapsed: 44.1s
[train] 500/821
  |  elapsed: 55.0s
[train] 600/821
  |  elapsed: 66.0s
[train] 700/821
  |  elapsed: 77.0s
[train] 800/821
  |  elapsed: 87.9s
Total Time Elapsed: 90.3s


In [9]:
prototypes = model.classify.weight
print("Prototypes shape:", prototypes.shape)
print("First prototype (for the first class):", prototypes[0])

Prototypes shape: torch.Size([21, 10000])
First prototype (for the first class): tensor([ 0.0043, -0.0015, -0.0012,  ..., -0.0026,  0.0051,  0.0028],
       device='cuda:0', grad_fn=<SelectBackward0>)


In [15]:
# torch.save(prototypes, "prototypes_EULanguage.pt")

## Evaluate

In [None]:
# torch.load("prototypes_EULanguage.pt")

In [8]:
# Test
correct = total = 0
with torch.no_grad():
    t1 = time.time()
    for bi, (samples, labels) in enumerate(test_ld, 1):
        samples = samples.to(DEVICE, non_blocking=True)
        labels  = labels.to(DEVICE, non_blocking=True)
        outputs = model(samples)
        preds   = outputs.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total   += labels.numel()
        if bi % PRINT_EVERY == 0:
            print(f"[test ] {bi}/{len(test_ld)}")
acc = correct / total
print(f"Test accuracy (EU languages): {acc:.4f}  |  elapsed: {time.time() - t0:.1f}s")


  hv = functional.hard_quantize(hv)            # sign -> {-1,+1}


Test accuracy (EU languages): 0.9733  |  elapsed: 250.6s
