Cell 1 – Imports & basic config

In [1]:
import re
from collections import Counter
from pathlib import Path

import numpy as np
from scipy.stats import spearmanr
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

import json
import pickle

# Import vocab building function from standalone script
from get_vocab_info import build_vocab_info

Cell 2 – Load THINGS similarity data


In [2]:
def load_things_words(things_words_path: Path):
    """
    Read the THINGS word list from a UTF-8 text file.

    The file holds one entry per line. Surrounding whitespace is stripped
    and lines that end up empty are skipped.

    Returns the entries as a list of strings, in file order.
    """
    with things_words_path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Materialise inside the `with` so the file is still open while reading
        return [entry for entry in stripped_lines if entry]


def load_things_triplets(triplets_path: Path):
    """
    Read THINGS triplets from a whitespace-separated UTF-8 text file.

    Each usable line holds exactly three integer indices
    (word1_idx, word2_idx, word3_idx); per the dataset convention used in
    this notebook, word1 and word2 are the similar pair and word3 is the
    odd one out. Lines with any other number of fields are skipped.

    Returns a list of (word1_idx, word2_idx, word3_idx) integer tuples.
    """
    rows = []
    with triplets_path.open("r", encoding="utf-8") as handle:
        for raw in handle:
            # split() without arguments already ignores leading/trailing whitespace
            fields = raw.split()
            if len(fields) != 3:
                continue
            first, second, odd_one_out = (int(field) for field in fields)
            rows.append((first, second, odd_one_out))
    return rows


def _match_things_word(things_word, vocab_word2idx):
    """
    Return the vocabulary index for one THINGS word, or None if nothing matches.

    Candidates are tried in this order and the first hit wins:
    1. the lowercased word as-is,
    2. underscores replaced by spaces,
    3. underscores removed (compound written as one word),
    4. the first underscore-separated part (e.g. "air_conditioner" -> "air").
    """
    lowered = things_word.lower()
    candidates = (
        lowered,
        lowered.replace("_", " "),
        lowered.replace("_", ""),
        lowered.split("_")[0],
    )
    for candidate in candidates:
        if candidate in vocab_word2idx:
            return vocab_word2idx[candidate]
    return None


def build_things_similarity_matrix(triplets, things_words, vocab_word2idx, vocab_size,
                                   as_sparse=False):
    """
    Build a similarity matrix over the corpus vocabulary from THINGS triplets.

    Every triplet whose first two words both map into the vocabulary adds one
    vote for that (unordered) pair. Votes are divided by the largest pair count,
    so scores lie in (0, 1] for observed pairs and are 0 everywhere else.

    Pairs whose two THINGS words resolve to the *same* vocabulary index (possible
    because several THINGS words can share a fallback match) are ignored: they
    carry no pairwise information and would otherwise put counts on the diagonal
    and distort the normalisation.

    Parameters:
    - triplets: iterable of (word1_idx, word2_idx, word3_idx) THINGS indices;
      word1/word2 are the similar pair, word3 is unused here.
    - things_words: list of THINGS words; list position is the THINGS index.
    - vocab_word2idx: mapping from vocabulary word to vocabulary index.
    - vocab_size: size of the vocabulary (matrix is vocab_size x vocab_size).
    - as_sparse: when True, return a scipy.sparse CSR matrix instead of a dense
      array. The dense form needs vocab_size**2 * 8 bytes, which is not
      allocatable for large vocabularies; only pass True when the consumer
      accepts SciPy sparse input.

    Returns:
    - similarity_matrix: symmetric (vocab_size, vocab_size) float64 matrix
      (numpy array by default, CSR matrix when as_sparse=True).
    - things_word_to_vocab_idx: mapping from THINGS word index to vocab index.
    """
    # Map THINGS indices to vocabulary indices
    things_word_to_vocab_idx = {}
    for things_idx, things_word in enumerate(things_words):
        vocab_idx = _match_things_word(things_word, vocab_word2idx)
        if vocab_idx is not None:
            things_word_to_vocab_idx[things_idx] = vocab_idx

    # Count votes per unordered vocab pair; this stays small (at most one entry
    # per observed pair) regardless of vocab_size.
    pair_counts = Counter()
    lookup = things_word_to_vocab_idx.get
    for word1_idx, word2_idx, _odd_one_out in triplets:
        v1 = lookup(word1_idx)
        v2 = lookup(word2_idx)
        if v1 is None or v2 is None or v1 == v2:
            continue
        pair_counts[(v1, v2) if v1 < v2 else (v2, v1)] += 1

    if pair_counts:
        pair_index = np.array(list(pair_counts.keys()), dtype=np.intp)
        counts = np.array(list(pair_counts.values()), dtype=np.float64)
        rows, cols = pair_index[:, 0], pair_index[:, 1]
        scores = counts / counts.max()
    else:
        rows = np.empty(0, dtype=np.intp)
        cols = np.empty(0, dtype=np.intp)
        scores = np.empty(0, dtype=np.float64)

    if as_sparse:
        # Local import: only needed for the opt-in sparse output
        from scipy import sparse as sp_sparse

        similarity_matrix = sp_sparse.coo_matrix(
            (
                np.concatenate([scores, scores]),
                (np.concatenate([rows, cols]), np.concatenate([cols, rows])),
            ),
            shape=(vocab_size, vocab_size),
            dtype=np.float64,
        ).tocsr()
    else:
        # Single dense allocation, filled with already-normalised scores
        # (no second vocab_size**2 temporary for the normalisation step).
        similarity_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float64)
        similarity_matrix[rows, cols] = scores
        similarity_matrix[cols, rows] = scores

    return similarity_matrix, things_word_to_vocab_idx


Cell 2.5 – Tokenizer & corpus loader

In [3]:
# Anything that is neither an ASCII lowercase letter nor whitespace.
# Note the single backslash: inside a raw string "\\s" would mean a literal
# backslash plus the letter "s", which would let backslashes survive.
_NON_ALPHA_RE = re.compile(r"[^a-z\s]")


def simple_tokenize(text: str):
    """
    Very basic tokenizer:
    - lowercase
    - replace every character that is not an ASCII letter or whitespace
      with a space (digits, punctuation, backslashes, non-ASCII letters)
    - split on whitespace

    Returns a list of lowercase alphabetic tokens (possibly empty).
    """
    lowered = text.lower()
    cleaned = _NON_ALPHA_RE.sub(" ", lowered)
    return cleaned.split()


def load_corpus(corpus_path: Path):
    """
    Read a text corpus and tokenize it line by line.

    The file is decoded as UTF-8 with undecodable bytes ignored. Blank lines
    and lines that yield no tokens are dropped. Prints the number of
    sentences kept.

    Returns a list of token lists, one per retained line.
    """
    sentences = []
    with corpus_path.open("r", encoding="utf-8", errors="ignore") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                tokens = simple_tokenize(stripped)
                if tokens:
                    sentences.append(tokens)
    print(f"Loaded {len(sentences)} sentences.")
    return sentences


Cell 3 – Word2VecRSR class (batched skip-gram + negative sampling + RSR)

In [4]:
class Word2VecRSR:
    """
    Skip-gram word2vec with negative sampling plus an RSR term that nudges the
    cosine-similarity structure of the input embeddings towards a THINGS
    similarity matrix.

    The RSR computations only touch words that have at least one positive
    THINGS similarity, so no vocab_size x vocab_size model-similarity matrix
    is ever formed during training.
    """

    # Extra damping multiplied into reg_strength for the RSR update
    # (kept from the original "small scale for stability" constant).
    _RSR_GRAD_DAMPING = 0.01

    def __init__( # default hardcoded values
        self,
        embedding_dim=100,
        window_size=2,          # up to 2 words on each side
        min_count=1,
        negative_samples=5,
        lr=0.025,
        epochs=3,
        batch_size=4,           # mini-batch size
        seed=42,
        reg_strength=0.1,        # scales the RSR update; 0 disables RSR (skip-gram update is not down-weighted)
        things_similarity_matrix=None,  # (vocab_size x vocab_size) numpy array or scipy.sparse matrix
        rsr_every=1,             # recompute the RSR (Spearman) loss every N batches
    ):
        if rsr_every < 1:
            raise ValueError("rsr_every must be >= 1")

        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.min_count = min_count
        self.negative_samples = negative_samples
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.reg_strength = reg_strength
        self.things_similarity_matrix = things_similarity_matrix
        self.rsr_every = rsr_every
        self.rng = np.random.default_rng(seed)

        # will be set in build_vocab
        self.word2idx = {}
        self.idx2word = []
        self.vocab_size = 0

        # will be set in init_weights
        self.W_in = None   # shape (vocab_size, embedding_dim)
        self.W_out = None  # shape (vocab_size, embedding_dim)

        # negative sampling distribution
        self.neg_sampling_probs = None

        # RSR bookkeeping: cached THINGS targets and loss schedule
        self._rsr_cache = None
        self._rsr_step = 0
        self._rsr_last_loss = 0.0

    # ---------- PREP: VOCAB + TRAINING PAIRS ----------

    def build_vocab(self, sentences):
        """
        Build the vocabulary from a list of token lists.

        Words with fewer than min_count occurrences are dropped; the remaining
        words are sorted alphabetically and indexed in that order. Also builds
        the negative-sampling distribution, P(w) proportional to count(w)^0.75.
        """
        counts = Counter()
        for sent in sentences:
            counts.update(sent)

        # filter by min_count
        filtered = [w for w, c in counts.items() if c >= self.min_count]

        self.idx2word = sorted(filtered)
        self.word2idx = {w: i for i, w in enumerate(self.idx2word)}
        self.vocab_size = len(self.idx2word)
        print(f"Vocab size: {self.vocab_size}")

        freqs = np.array([counts[w] for w in self.idx2word], dtype=np.float64)
        freqs = freqs ** 0.75
        self.neg_sampling_probs = freqs / freqs.sum()

    def sentences_to_indices(self, sentences):
        """
        Map tokens to vocabulary indices, dropping out-of-vocab words.
        Sentences left with fewer than two in-vocab tokens are discarded
        because they cannot produce a (center, context) pair.
        """
        idx_sentences = []
        for sent in sentences:
            idxs = [self.word2idx[w] for w in sent if w in self.word2idx]
            if len(idxs) > 1:
                idx_sentences.append(idxs)
        return idx_sentences

    def generate_skipgram_pairs(self, idx_sentences):
        """
        Generate (center, context) index pairs for skip-gram.
        window_size = 2 means up to 2 words on each side of the center word.
        """
        pairs = []
        w = self.window_size
        for sent in idx_sentences:
            n = len(sent)
            for i, center in enumerate(sent):
                start = max(0, i - w)
                end = min(n, i + w + 1)
                for j in range(start, end):
                    if j == i:
                        continue
                    pairs.append((center, sent[j]))
        return pairs

    # ---------- MODEL INIT ----------

    def init_weights(self):
        """
        Initialise input and output embeddings with small Gaussian noise.
        """
        self.W_in = 0.01 * self.rng.standard_normal(
            (self.vocab_size, self.embedding_dim)
        )
        self.W_out = 0.01 * self.rng.standard_normal(
            (self.vocab_size, self.embedding_dim)
        )

    # ---------- NEGATIVE SAMPLING + TRAINING ----------

    def sample_negatives(self, batch_size):
        """
        Sample negative word indices according to the unigram^0.75 distribution.
        Returns shape (batch_size, negative_samples).
        """
        return self.rng.choice(
            self.vocab_size,
            size=batch_size * self.negative_samples,
            replace=True,
            p=self.neg_sampling_probs,
        ).reshape(batch_size, self.negative_samples)

    @staticmethod
    def _sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # ---------- RSR HELPERS ----------

    def _rsr_enabled(self):
        """True when an RSR term should be applied during training."""
        return self.reg_strength > 0 and self.things_similarity_matrix is not None

    def _rsr_targets(self):
        """
        Return the cached description of the THINGS targets.

        The cache holds:
        - "active": sorted vocab indices that take part in at least one
          positive THINGS similarity,
        - "sub": dense (A, A) THINGS similarities restricted to those indices,
        - "pair_rows"/"pair_cols": positions (into "active") of the upper-triangle
          pairs with positive THINGS similarity,
        - "pair_targets": the THINGS similarity of those pairs.

        It is rebuilt whenever the matrix object or the vocabulary size changes.
        Raises ValueError if the matrix is not (vocab_size, vocab_size).
        """
        matrix = self.things_similarity_matrix
        cache = getattr(self, "_rsr_cache", None)
        if (
            cache is not None
            and cache["source"] is matrix
            and cache["vocab_size"] == self.vocab_size
        ):
            return cache

        # Local import: only needed to recognise sparse input
        from scipy import sparse as sp_sparse

        is_sparse = sp_sparse.issparse(matrix)
        things = matrix.tocsr() if is_sparse else np.asarray(matrix)

        expected_shape = (self.vocab_size, self.vocab_size)
        if tuple(things.shape) != expected_shape:
            raise ValueError(
                f"things_similarity_matrix has shape {tuple(things.shape)}, "
                f"expected {expected_shape} to match the model vocabulary."
            )

        if is_sparse:
            coo = things.tocoo()
            positive = coo.data > 0
            rows, cols = coo.row[positive], coo.col[positive]
        else:
            rows, cols = np.nonzero(things > 0)
        active = np.union1d(rows, cols).astype(np.intp)

        if active.size == 0:
            sub = np.zeros((0, 0), dtype=np.float64)
        elif is_sparse:
            sub = np.asarray(things[active][:, active].toarray(), dtype=np.float64)
        else:
            sub = np.asarray(things[np.ix_(active, active)], dtype=np.float64)

        # Same pair set as "upper triangle of the full matrix where THINGS > 0",
        # expressed in the compact index space of `active`.
        iu_rows, iu_cols = np.triu_indices(active.size, k=1)
        pair_targets = sub[iu_rows, iu_cols]
        valid = pair_targets > 0

        cache = {
            "source": matrix,
            "vocab_size": self.vocab_size,
            "active": active,
            "sub": sub,
            "pair_rows": iu_rows[valid],
            "pair_cols": iu_cols[valid],
            "pair_targets": pair_targets[valid],
        }
        self._rsr_cache = cache
        return cache

    def _normalized_rows(self, idxs):
        """Return the L2-normalised input embeddings of the given vocab indices."""
        W = self.W_in[idxs]
        return W / (np.linalg.norm(W, axis=1, keepdims=True) + 1e-10)

    def regularization(self):
        """
        Compute the RSR loss: 1 - Spearman correlation between model cosine
        similarities and THINGS similarities, over the upper-triangle pairs
        whose THINGS similarity is positive.

        Returns 0.0 when RSR is not applicable (no matrix, no weights, fewer
        than two valid pairs, undefined correlation).
        Raises ValueError if the matrix shape does not match the vocabulary.
        """
        if self.things_similarity_matrix is None or self.W_in is None:
            return 0.0

        targets = self._rsr_targets()
        things_sim_valid = targets["pair_targets"]
        if things_sim_valid.size < 2:
            return 0.0

        # Cosine similarities only among the THINGS-active words: (A, A)
        W_norm = self._normalized_rows(targets["active"])
        model_sim = W_norm @ W_norm.T
        model_sim_valid = model_sim[targets["pair_rows"], targets["pair_cols"]]

        try:
            corr, _ = spearmanr(model_sim_valid, things_sim_valid)
        except Exception:
            # Best effort: a failed correlation must not abort training, but
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            return 0.0
        if np.isnan(corr):
            return 0.0
        # 1 - correlation, so higher correlation = lower loss
        return float(1.0 - corr)

    def _scheduled_rsr_loss(self):
        """
        Return the RSR loss for the current batch, recomputing it only every
        `rsr_every` batches and reusing the last value in between.
        """
        step = getattr(self, "_rsr_step", 0)
        every = max(1, int(getattr(self, "rsr_every", 1)))
        if step % every == 0:
            self._rsr_last_loss = self.regularization()
        self._rsr_step = step + 1
        return getattr(self, "_rsr_last_loss", 0.0)

    def _rsr_gradient(self, center_idxs):
        """
        Approximate RSR update for the center words of a batch.

        Spearman correlation is not differentiable, so this uses the direction
        that reduces the squared difference between model cosine similarity
        and THINGS similarity, restricted to the pairs the loss is measured on
        (positive THINGS similarity, self-pair excluded).

        The result is expressed in the same convention as the skip-gram
        gradient in train_batch: it is *added* to W_in, so it points towards
        words the model under-rates and away from words it over-rates.

        Returns an array of shape (len(center_idxs), embedding_dim); rows for
        words without THINGS targets are zero.
        """
        centers = np.asarray(center_idxs)
        grad = np.zeros((centers.shape[0], self.embedding_dim), dtype=np.float64)

        targets = self._rsr_targets()
        active = targets["active"]
        if active.size == 0:
            return grad

        # Locate each center inside the sorted `active` index list
        pos = np.minimum(np.searchsorted(active, centers), active.size - 1)
        in_things = active[pos] == centers
        if not in_things.any():
            return grad

        rows = pos[in_things]
        W_norm = self._normalized_rows(active)            # (A, D)
        target = targets["sub"][rows]                     # (b, A)

        mask = target > 0
        mask[np.arange(rows.shape[0]), rows] = False      # cosine(w, w) is constant

        diff = np.where(mask, W_norm[rows] @ W_norm.T - target, 0.0)  # (b, A)
        denom = np.maximum(1.0, np.abs(diff).sum(axis=1, keepdims=True))
        scale = self.reg_strength * self._RSR_GRAD_DAMPING

        # Negative sign: descend on the similarity mismatch
        grad[in_things] = -scale * (diff @ W_norm) / denom
        return grad

    def train(self, sentences):
        """
        High-level training:
        - build vocab
        - convert sentences to indices
        - generate skip-gram pairs
        - train using negative sampling with mini-batches

        Raises ValueError if the THINGS matrix does not match the vocabulary
        size or if the corpus yields no training pairs.
        """
        print("=" * 60)
        print("Starting Word2Vec RSR Training")
        print("=" * 60)
        
        print("\n[Step 1/4] Building vocabulary...")
        self.build_vocab(sentences)
        if self._rsr_enabled():
            # Fail fast on a mismatched THINGS matrix, before the expensive steps
            self._rsr_targets()
        
        print("\n[Step 2/4] Converting sentences to indices...")
        idx_sentences = self.sentences_to_indices(sentences)
        
        print("\n[Step 3/4] Generating skip-gram pairs...")
        pairs = self.generate_skipgram_pairs(idx_sentences)
        print(f"Generated {len(pairs):,} training pairs")
        if not pairs:
            raise ValueError(
                "No skip-gram pairs were generated; check the corpus and min_count."
            )

        print("\n[Step 4/4] Initializing weights...")
        self.init_weights()
        self._rsr_step = 0
        self._rsr_last_loss = 0.0

        pairs = np.array(pairs, dtype=np.int64)
        n_pairs = len(pairs)
        n_batches = (n_pairs + self.batch_size - 1) // self.batch_size

        print(f"\n{'=' * 60}")
        print(f"Training Configuration:")
        print(f"  - Epochs: {self.epochs}")
        print(f"  - Batch size: {self.batch_size}")
        print(f"  - Total batches per epoch: {n_batches:,}")
        print(f"  - Learning rate: {self.lr}")
        print(f"  - RSR regularization strength: {self.reg_strength}")
        print(f"{'=' * 60}\n")

        for epoch in range(1, self.epochs + 1):
            print(f"\nEpoch {epoch}/{self.epochs}")
            self.rng.shuffle(pairs)
            total_loss = 0.0
            total_rsr_loss = 0.0
            batch_count = 0

            # Progress bar for batches
            batch_range = range(0, n_pairs, self.batch_size)
            with tqdm(batch_range, desc=f"  Epoch {epoch}", unit="batch", 
                     bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]') as pbar:
                for start in pbar:
                    batch = pairs[start:start + self.batch_size]
                    centers = batch[:, 0]
                    contexts = batch[:, 1]
                    batch_loss, rsr_loss = self.train_batch(centers, contexts)
                    total_loss += batch_loss * len(batch)
                    total_rsr_loss += rsr_loss
                    batch_count += 1
                    
                    # Update progress bar with current losses
                    current_avg_loss = total_loss / (start + len(batch))
                    current_avg_rsr_loss = total_rsr_loss / batch_count
                    pbar.set_postfix({
                        'loss': f'{current_avg_loss:.4f}',
                        'rsr': f'{current_avg_rsr_loss:.4f}'
                    })

            avg_loss = total_loss / n_pairs
            # Average over the batches actually run (includes a partial last batch)
            avg_rsr_loss = total_rsr_loss / max(1, batch_count)
            print(f"  Completed - Average loss: {avg_loss:.4f}, Average RSR loss: {avg_rsr_loss:.4f}")

        print(f"\n{'=' * 60}")
        print("Training completed!")
        print(f"{'=' * 60}")

    def train_batch(self, center_idxs, context_idxs):
        """
        Train on a batch of (center, context) pairs using negative sampling,
        optionally adding the RSR update for the center words.

        All gradients below are ascent directions (they are added to the
        weights, scaled by lr).

        Returns: (skipgram_loss, rsr_loss)
        """
        B = center_idxs.shape[0]

        v_t = self.W_in[center_idxs]       # (B, D)
        u_c = self.W_out[context_idxs]     # (B, D)

        # positive
        score_pos = np.sum(u_c * v_t, axis=1)      # (B,)
        sig_pos = self._sigmoid(score_pos)         # (B,)
        loss_pos = -np.log(sig_pos + 1e-10)        # (B,)

        # negatives
        neg_idxs = self.sample_negatives(B)        # (B, K)
        u_negs = self.W_out[neg_idxs]              # (B, K, D)
        scores_neg = np.einsum("bkd,bd->bk", u_negs, v_t)  # (B, K)
        sig_negs = self._sigmoid(-scores_neg)
        loss_neg = -np.sum(np.log(sig_negs + 1e-10), axis=1)  # (B,)

        skipgram_loss = np.mean(loss_pos + loss_neg)

        # gradients for skipgram loss
        grad_pos = (1 - sig_pos)                   # (B,)
        grad_u_c = grad_pos[:, None] * v_t         # (B, D)
        grad_v_pos = grad_pos[:, None] * u_c       # (B, D)

        sig_scores_neg = self._sigmoid(scores_neg) # σ(x)
        grad_negs = -sig_scores_neg                # (B, K)
        grad_u_negs = grad_negs[..., None] * v_t[:, None, :]  # (B, K, D)
        grad_v_neg = np.sum(grad_negs[..., None] * u_negs, axis=1)  # (B, D)

        grad_v = grad_v_pos + grad_v_neg           # (B, D)

        # RSR regularization: the Spearman loss is recomputed every
        # `rsr_every` batches; the surrogate gradient is applied whenever
        # the most recent loss is positive.
        rsr_loss = 0.0
        if self._rsr_enabled():
            rsr_loss = self._scheduled_rsr_loss()
            if rsr_loss > 0:
                grad_v = grad_v + self._rsr_gradient(center_idxs)

        # scatter-add updates (np.add.at accumulates repeated indices)
        np.add.at(self.W_out, context_idxs, self.lr * grad_u_c)

        neg_flat = neg_idxs.reshape(-1)
        grad_u_negs_flat = grad_u_negs.reshape(-1, self.embedding_dim)
        np.add.at(self.W_out, neg_flat, self.lr * grad_u_negs_flat)

        np.add.at(self.W_in, center_idxs, self.lr * grad_v)

        return skipgram_loss, rsr_loss

    # ---------- UTILITIES ----------

    def get_vector(self, word):
        """Return the input embedding of `word`; raises KeyError if unknown."""
        idx = self.word2idx.get(word)
        if idx is None:
            raise KeyError(f"Word '{word}' not in vocabulary.")
        return self.W_in[idx]

    def most_similar(self, word, topn=5):
        """
        Return up to `topn` (word, cosine similarity) tuples for the nearest
        neighbours of `word` among the input embeddings, best first, excluding
        the word itself. Raises KeyError if `word` is unknown.
        """
        if word not in self.word2idx:
            raise KeyError(f"Word '{word}' not in vocabulary.")
        if topn <= 0:
            return []
        idx = self.word2idx[word]
        v = self.W_in[idx]

        norms = np.linalg.norm(self.W_in, axis=1) + 1e-10
        sim = (self.W_in @ v) / norms / (np.linalg.norm(v) + 1e-10)

        best = np.argsort(-sim)
        result = []
        for i in best:
            if i == idx:
                continue
            result.append((self.idx2word[i], float(sim[i])))
            if len(result) >= topn:
                break
        return result


Cell 4 – Parameters & load mini wiki corpus

In [5]:
# Parameters for the RSR Word2Vec model.
# These module-level names are read again by the model-creation and export cells.
corpus_path = Path("data/AllCombined.txt")  # relative to the notebook's working directory

embedding_dim = 100  # dimensionality of each word vector
window_size = 2      # up to 2 words on each side
min_count = 5        # ignore very rare words
negative_samples = 5  # negatives drawn per (center, context) pair
learning_rate = 0.025
epochs = 3
batch_size = 4       # number of (center, context) pairs per update
# RSR regularization strength: 0 disables the RSR term.
# NOTE(review): the value only scales the RSR update; Word2VecRSR.train_batch does not
# down-weight the skip-gram gradient, so 1 does not mean "only RSR".
reg_strength = 0.1

# Load corpus (one tokenized sentence per non-empty line)
sentences = load_corpus(corpus_path)


Loaded 965518 sentences.


Cell 5 – Load THINGS similarity data and train RSR Word2Vec model

In [6]:
# Driver cell: build the vocabulary, load the THINGS data, derive the similarity
# matrix over the vocabulary, then create and train the RSR model.
print("=" * 60)
print("Preparing RSR Word2Vec Model")
print("=" * 60)

# Build vocabulary first (without creating a model)
# NOTE(review): Word2VecRSR.train() rebuilds its own vocabulary (sorted words with
# count >= min_count). The similarity matrix below is indexed by build_vocab_info's
# word2idx, so both must produce the same word -> index mapping - confirm.
print("\n[Step 1/5] Building vocabulary from corpus...")
word2idx, idx2word, vocab_size = build_vocab_info(sentences, min_count=min_count)
print(f"✓ Vocab size: {vocab_size:,}")

# Load THINGS data
things_words_path = Path("things_similarity/variables/unique_id.txt")
things_triplets_path = Path("things_similarity/data/triplet_dataset/trainset.txt")

print("\n[Step 2/5] Loading THINGS words...")
things_words = load_things_words(things_words_path)
print(f"✓ Loaded {len(things_words):,} THINGS words")

# Load triplets and build similarity matrix
print("\n[Step 3/5] Loading THINGS triplets...")
triplets = load_things_triplets(things_triplets_path)
print(f"✓ Loaded {len(triplets):,} triplets")

# NOTE(review): by default the matrix is a dense (vocab_size, vocab_size) float64
# array, i.e. vocab_size**2 * 8 bytes. With the vocabulary size recorded in this
# cell's output that is about 95 GiB, which is the MemoryError shown below.
# Lower the vocabulary size (e.g. a higher min_count) or avoid the dense layout.
print("\n[Step 4/5] Building THINGS similarity matrix...")
things_sim_matrix, things_word_to_vocab_idx = build_things_similarity_matrix(
    triplets, things_words, word2idx, vocab_size
)
# The matrix is symmetric, so this counts each unordered pair twice.
print(f"✓ Built similarity matrix with {np.sum(things_sim_matrix > 0):,} non-zero pairs")
print(f"✓ Mapped {len(things_word_to_vocab_idx):,} THINGS words to vocabulary")

print("\n[Step 5/5] Creating RSR model...")
# Create RSR model with similarity matrix
rsr_model = Word2VecRSR(
    embedding_dim=embedding_dim,
    window_size=window_size,
    min_count=min_count,
    negative_samples=negative_samples,
    lr=learning_rate,
    epochs=epochs,
    batch_size=batch_size,
    reg_strength=reg_strength,
    things_similarity_matrix=things_sim_matrix,
)
print("✓ Model created")

# Train the model (builds the vocabulary, generates pairs, runs all epochs)
print("\n")
rsr_model.train(sentences)

Preparing RSR Word2Vec Model

[Step 1/5] Building vocabulary from corpus...
✓ Vocab size: 112,970

[Step 2/5] Loading THINGS words...
✓ Loaded 1,854 THINGS words

[Step 3/5] Loading THINGS triplets...
✓ Loaded 4,120,663 triplets

[Step 4/5] Building THINGS similarity matrix...


MemoryError: Unable to allocate 95.1 GiB for an array with shape (112970, 112970) and data type float64

Cell 6 – Quick sanity check: similar words (RSR model)

In [None]:
# Probe word for a quick nearest-neighbour sanity check on the trained model
test_word = "dog"

if test_word not in rsr_model.word2idx:
    print(f"'{test_word}' not in vocabulary.")
else:
    print(f"Most similar to '{test_word}':")
    # Ten nearest neighbours by cosine similarity, best first
    neighbours = rsr_model.most_similar(test_word, topn=10)
    for w, score in neighbours:
        print(f"{w:<20} {score:.4f}")


NameError: name 'vanilla_model' is not defined

Cell 7 – Export the RSR Word2Vec model

In [None]:
# Export the trained model as four artifacts that share one base name.
# The models/ directory is created on first use.
export_dir = Path("models")
export_dir.mkdir(exist_ok=True)

base_name = f"rsr_w2v_model_reg{reg_strength}"

# Resolve every artifact path up front
embeddings_path = export_dir / f"{base_name}.npy"
vocab_txt_path = export_dir / f"{base_name}_vocab.txt"
vocab_json_path = export_dir / f"{base_name}_word2idx.json"
model_pkl_path = export_dir / f"{base_name}.pkl"

# 1) Embeddings matrix (input embeddings, one row per vocabulary word)
np.save(embeddings_path, rsr_model.W_in)
print(f"Saved embeddings to {embeddings_path}")

# 2) Vocabulary list (idx → word), one word per line in index order
with vocab_txt_path.open("w", encoding="utf-8") as f:
    for w in rsr_model.idx2word:
        f.write(f"{w}\n")
print(f"Saved vocabulary to {vocab_txt_path}")

# 3) word2idx mapping (word → index) as JSON
with vocab_json_path.open("w", encoding="utf-8") as f:
    json.dump(rsr_model.word2idx, f, ensure_ascii=False)
print(f"Saved word2idx to {vocab_json_path}")

# 4) Full model object (pickle); only load such files from trusted sources
with model_pkl_path.open("wb") as f:
    pickle.dump(rsr_model, f)
print(f"Saved full model to {model_pkl_path}")

