
# Word2Vec: Vanilla vs RSR (Human Behavioral Similarity)

## Experiment Pipeline

**Goal**: Compare two Word2Vec models on downstream category prediction:
1. **Vanilla**: Trained ONLY on Wikipedia corpus
2. **RSR**: Trained on Wikipedia + Human similarity judgments (4.7M triplets)

### Pipeline Steps:
1. Load Wikipedia corpus
2. Load human behavioral similarity matrix  
3. Load THINGS concepts & category labels
4. Train Word2Vec **VANILLA** (Wikipedia only)
5. Train Word2Vec **RSR** (Wikipedia + human similarity)
6. Compare both on THINGS category prediction task



## Step 1: Imports & Configuration

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import scipy.io as sio

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Configuration
BASE_DATA_DIR = "data"

# THINGS paths
THINGS_CONCEPTS_PATH = os.path.join(BASE_DATA_DIR, "02_object-level", "_concepts-metadata_things.tsv")
PROPERTY_RATINGS_PATH = os.path.join(BASE_DATA_DIR, "02_object-level", "_property-ratings.tsv")
THINGS_FEATURES_PATH = os.path.join(BASE_DATA_DIR, "03_category-level", "category27_manual.tsv")

# Behavioral similarity path
BEHAVIORAL_SIM_PATH = os.path.join(BASE_DATA_DIR, "osfstorage-archive", "data", "spose_similarity.mat")

# Preprocessed corpus path
CORPUS_PATH = os.path.join(BASE_DATA_DIR, "simplewiki_preprocessed.pkl")

# Device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


## Step 2: Load Wikipedia Corpus

In [2]:
# Load preprocessed Simple Wikipedia corpus
print("="*70)
print("STEP 2: Loading Wikipedia Corpus")
print("="*70)

with open(CORPUS_PATH, 'rb') as f:
    sentences = pickle.load(f)

print(f"  Source: Simple Wikipedia (preprocessed)")
print(f"  Sentences: {len(sentences):,}")
print(f"  Sample: {sentences[0][:10]}...")

STEP 2: Loading Wikipedia Corpus
  Source: Simple Wikipedia (preprocessed)
  Sentences: 1,688,343
  Sample: ['april', 'apr', 'is', 'the', 'fourth', 'month', 'of', 'the', 'year', 'in']...


## Step 3: Load Human Behavioral Similarity Matrix

In [3]:
print("="*70)
print("STEP 3: Loading Human Behavioral Similarity Matrix")
print("="*70)

# Load full behavioral similarity matrix (1854 x 1854 THINGS concepts)
behav_data = sio.loadmat(BEHAVIORAL_SIM_PATH)
behav_sim_full = behav_data['spose_sim']

print(f"  Source: 4.7 million human triplet judgments")
print(f"  Matrix shape: {behav_sim_full.shape}")
print(f"  Similarity range: [{behav_sim_full.min():.3f}, {behav_sim_full.max():.3f}]")
print(f"  Mean similarity: {behav_sim_full.mean():.3f}")

STEP 3: Loading Human Behavioral Similarity Matrix
  Source: 4.7 million human triplet judgments
  Matrix shape: (1854, 1854)
  Similarity range: [0.052, 1.000]
  Mean similarity: 0.334


## Step 4: Load THINGS Concepts & Category Labels

In [4]:
print("="*70)
print("STEP 4: Loading THINGS Concepts & Category Labels")
print("="*70)

# Load THINGS concepts
concepts_df = pd.read_csv(THINGS_CONCEPTS_PATH, sep="\t")
concepts = concepts_df["Word"].tolist()
print(f"  THINGS concepts: {len(concepts)}")

# Load Category27 labels (downstream task)
cats_df = pd.read_csv(THINGS_FEATURES_PATH, sep="\t")
feature_cols = cats_df.columns.tolist()
Y_all = cats_df[feature_cols].values.astype(np.float32)
print(f"  Category labels: {Y_all.shape} (27 binary categories)")

# Load property ratings (for filtering)
prop_df_raw = pd.read_csv(PROPERTY_RATINGS_PATH, sep="\t")
all_num_cols = prop_df_raw.select_dtypes(include=[np.number]).columns.tolist()
sem_cols = [c for c in all_num_cols if c.endswith("_mean") and not c.startswith("N_") and "work_time" not in c]
prop_df = prop_df_raw.groupby("Word")[sem_cols].mean()
print(f"  Property ratings: {prop_df.shape}")

STEP 4: Loading THINGS Concepts & Category Labels
  THINGS concepts: 1854
  Category labels: (1854, 27) (27 binary categories)
  Property ratings: (1823, 22)


## Step 5: Build Vocabulary & Align Data

In [5]:
print("="*70)
print("STEP 5: Building Vocabulary & Aligning Data")
print("="*70)

# Build vocabulary from corpus
MIN_COUNT = 5
print(f"\nBuilding vocabulary (min_count={MIN_COUNT})...")

word_counts = Counter()
for sent in tqdm(sentences, desc="Counting words"):
    word_counts.update(sent)

vocab = sorted([w for w, c in word_counts.items() if c >= MIN_COUNT])
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for i, w in enumerate(vocab)}
vocab_size = len(vocab)

# Compute unigram distribution for negative sampling
word_freqs = np.array([word_counts[w] for w in vocab], dtype=np.float32)
word_freqs = word_freqs ** 0.75
word_probs = word_freqs / word_freqs.sum()

print(f"  Vocabulary size: {vocab_size:,}")
print(f"  Total tokens: {sum(word_counts.values()):,}")

# Helper function to find word in vocabulary
def get_word_idx(word, word2idx):
    candidates = [word, word.lower(), word.replace(" ", "_"), 
                  word.lower().replace(" ", "_"), word.replace(" ", ""), 
                  word.lower().replace(" ", "")]
    for token in candidates:
        if token in word2idx:
            return word2idx[token]
    return None

# Align THINGS concepts with vocabulary
print(f"\nAligning THINGS concepts with vocabulary...")
valid_concepts = []
valid_word_indices = []
Y_rows = []
valid_things_indices = []

for idx, concept in enumerate(concepts):
    if concept not in prop_df.index:
        continue
    word_idx = get_word_idx(concept, word2idx)
    if word_idx is None:
        continue
    valid_word_indices.append(word_idx)
    Y_rows.append(Y_all[idx])
    valid_concepts.append(concept)
    valid_things_indices.append(idx)

valid_word_indices = np.array(valid_word_indices)
Y = np.stack(Y_rows, axis=0).astype(np.float32)

# Extract aligned similarity matrix
behav_sim_subset = behav_sim_full[np.ix_(valid_things_indices, valid_things_indices)]
behav_sim_target = torch.tensor(behav_sim_subset, dtype=torch.float32, device=DEVICE)

print(f"\n{'='*70}")
print(f"ALIGNED DATASET:")
print(f"{'='*70}")
print(f"  Valid THINGS concepts: {len(valid_concepts)}")
print(f"  Category labels Y: {Y.shape}")
print(f"  Similarity matrix: {behav_sim_subset.shape}")

STEP 5: Building Vocabulary & Aligning Data

Building vocabulary (min_count=5)...


Counting words: 100%|██████████| 1688343/1688343 [00:02<00:00, 758686.21it/s]


  Vocabulary size: 104,458
  Total tokens: 27,446,826

Aligning THINGS concepts with vocabulary...

ALIGNED DATASET:
  Valid THINGS concepts: 1386
  Category labels Y: (1386, 27)
  Similarity matrix: (1386, 1386)


## Step 6: Define Model & Training Functions

In [6]:
class SkipGramWord2Vec(nn.Module):
    """PyTorch Skip-gram Word2Vec with negative sampling."""
    
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        init_range = 0.5 / embedding_dim
        self.target_embeddings.weight.data.uniform_(-init_range, init_range)
        self.context_embeddings.weight.data.uniform_(-init_range, init_range)
    
    def forward(self, targets, contexts):
        t_emb = self.target_embeddings(targets)
        c_emb = self.context_embeddings(contexts)
        return torch.sum(t_emb * c_emb, dim=1)

# ============================================================================
# Sample pairs on-the-fly instead of pre-generating 220M pairs...
# ============================================================================

def preprocess_sentences(sentences, word2idx):
    """Convert sentences to index arrays (do once, reuse)."""
    indexed = []
    for sent in tqdm(sentences, desc="Indexing sentences"):
        indices = [word2idx[w] for w in sent if w in word2idx]
        if len(indices) >= 2:
            indexed.append(np.array(indices, dtype=np.int32))
    return indexed

def sample_batch(indexed_sentences, batch_size, window_size, vocab_size, neg_probs_np):
    """Sample a batch of (target, context, negatives) on-the-fly."""
    targets = []
    contexts = []
    
    # Sample random sentences and extract pairs
    sent_indices = np.random.randint(0, len(indexed_sentences), batch_size * 2)
    
    for sent_idx in sent_indices:
        sent = indexed_sentences[sent_idx]
        if len(sent) < 2:
            continue
        
        # Random position in sentence
        pos = np.random.randint(0, len(sent))
        target = sent[pos]
        
        # Random context within window
        start = max(0, pos - window_size)
        end = min(len(sent), pos + window_size + 1)
        context_positions = [j for j in range(start, end) if j != pos]
        
        if context_positions:
            ctx_pos = context_positions[np.random.randint(0, len(context_positions))]
            targets.append(target)
            contexts.append(sent[ctx_pos])
        
        if len(targets) >= batch_size:
            break
    
    return np.array(targets[:batch_size]), np.array(contexts[:batch_size])

# ============================================================================
# HYPERPARAMETERS
# ============================================================================
EMBEDDING_DIM = 300
WINDOW_SIZE = 5
NEG_SAMPLES = 5

# Batch size: Typical values are 32-256 for Word2Vec. Larger batches:
# - Pros: Better GPU utilisation, more stable gradients, fewer kernel launches
# - Cons: Less frequent updates, may need more epochs, less stochasticity
# 128 is a good middle ground - standard in practice and still fast
BATCH_SIZE = 128

# Number of batches per epoch. With batch_size=128, this gives ~1.28M samples/epoch
# Adjust this to control training time vs coverage (more batches = more samples seen)
BATCHES_PER_EPOCH = 10000

W2V_EPOCHS = 5              
W2V_LR = 0.001
LAMBDA_RSR = 1.0

print(f"{'='*70}")
print("MODEL CONFIGURATION:")
print(f"{'='*70}")
print(f"  Embedding dim: {EMBEDDING_DIM}")
print(f"  Window size: {WINDOW_SIZE}")
print(f"  Negative samples: {NEG_SAMPLES}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Batches per epoch: {BATCHES_PER_EPOCH}")
print(f"  Samples per epoch: ~{BATCH_SIZE * BATCHES_PER_EPOCH:,}")
print(f"  Epochs: {W2V_EPOCHS}")
print(f"  Learning rate: {W2V_LR}")
print(f"{'='*70}")

MODEL CONFIGURATION:
  Embedding dim: 300
  Window size: 5
  Negative samples: 5
  Batch size: 128
  Batches per epoch: 10000
  Samples per epoch: ~1,280,000
  Epochs: 5
  Learning rate: 0.001


## Step 7: Train VANILLA Word2Vec (Wikipedia Only)

In [None]:
print("="*70)
print("STEP 7: Training VANILLA Word2Vec (Wikipedia Only)")
print("="*70)
print("Using on-the-fly sampling")
print("="*70 + "\n")

# Preprocess sentences ONCE (convert to indices)
print("Preprocessing sentences (one-time)...")
indexed_sentences = preprocess_sentences(sentences, word2idx)
print(f"  Indexed {len(indexed_sentences):,} sentences")

# Negative sampling distribution (numpy for fast sampling)
neg_probs_np = word_probs
neg_probs_torch = torch.tensor(word_probs, device=DEVICE)
valid_idx_tensor = torch.LongTensor(valid_word_indices).to(DEVICE)

# Pre-allocate tensors for speed
pos_labels = torch.ones(BATCH_SIZE, device=DEVICE)
neg_labels = torch.zeros(BATCH_SIZE * NEG_SAMPLES, device=DEVICE)

# Create VANILLA model
vanilla_model = SkipGramWord2Vec(vocab_size, EMBEDDING_DIM).to(DEVICE)
vanilla_optimizer = optim.Adam(vanilla_model.parameters(), lr=W2V_LR)
loss_fn = nn.BCEWithLogitsLoss()

print(f"\nTraining... (~{BATCHES_PER_EPOCH * W2V_EPOCHS:,} iterations total)")
print(f"Expected time: ~{BATCHES_PER_EPOCH * W2V_EPOCHS // 500} minutes\n")

# Training loop
for epoch in range(W2V_EPOCHS):
    total_loss = 0
    
    pbar = tqdm(range(BATCHES_PER_EPOCH), desc=f"Vanilla Epoch {epoch+1}/{W2V_EPOCHS}")
    for batch_idx in pbar:
        # Sample batch on-the-fly (FAST!)
        targets_np, contexts_np = sample_batch(
            indexed_sentences, BATCH_SIZE, WINDOW_SIZE, vocab_size, neg_probs_np
        )
        
        # To GPU
        targets = torch.LongTensor(targets_np).to(DEVICE)
        contexts = torch.LongTensor(contexts_np).to(DEVICE)
        
        # Positive scores
        pos_scores = vanilla_model(targets, contexts)
        
        # Negative samples
        neg_contexts = torch.multinomial(neg_probs_torch, len(targets) * NEG_SAMPLES, replacement=True)
        neg_targets = targets.repeat_interleave(NEG_SAMPLES)
        neg_scores = vanilla_model(neg_targets, neg_contexts)
        
        # Loss
        all_scores = torch.cat([pos_scores, neg_scores])
        all_labels = torch.cat([pos_labels[:len(targets)], neg_labels[:len(targets)*NEG_SAMPLES]])
        loss = loss_fn(all_scores, all_labels)
        
        vanilla_optimizer.zero_grad()
        loss.backward()
        vanilla_optimizer.step()
        
        total_loss += loss.item()
        if batch_idx % 100 == 0:
            pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    print(f"Epoch {epoch+1} | Avg Loss: {total_loss/BATCHES_PER_EPOCH:.4f}")

# Extract vanilla embeddings
X_vanilla = vanilla_model.target_embeddings(valid_idx_tensor).detach().cpu().numpy()
print(f"\n Vanilla training complete! Embeddings: {X_vanilla.shape}")

STEP 7: Training VANILLA Word2Vec (Wikipedia Only)
Using on-the-fly sampling

Preprocessing sentences (one-time)...


Indexing sentences: 100%|██████████| 1688343/1688343 [00:03<00:00, 525042.25it/s]


  Indexed 1,687,117 sentences

Training... (~50,000 iterations total)
Expected time: ~100 minutes



Vanilla Epoch 1/5:  33%|███▎      | 3303/10000 [00:31<01:03, 106.16it/s, loss=0.4513]

## Step 8: Train RSR Word2Vec (Wikipedia + Human Similarity)

In [None]:
print("="*70)
print("STEP 8: Training RSR Word2Vec (Wikipedia + Human Similarity)")
print("="*70)
print("Using random sampling + RSR regularisation")
print("="*70 + "\n")

# Create RSR model (fresh initialisation with random weights)
rsr_model = SkipGramWord2Vec(vocab_size, EMBEDDING_DIM).to(DEVICE)
rsr_optimizer = optim.Adam(rsr_model.parameters(), lr=W2V_LR)

# RSR applied every N batches
RSR_EVERY_N_BATCHES = 50

print(f"Training with RSR every {RSR_EVERY_N_BATCHES} batches...")
print(f"Expected time: ~{BATCHES_PER_EPOCH * W2V_EPOCHS // 400} minutes\n")

# Training loop
for epoch in range(W2V_EPOCHS):
    total_w2v_loss, total_rsr_loss = 0, 0
    rsr_count = 0
    
    pbar = tqdm(range(BATCHES_PER_EPOCH), desc=f"RSR Epoch {epoch+1}/{W2V_EPOCHS}")
    for batch_idx in pbar:
        # Sample batch on-the-fly
        targets_np, contexts_np = sample_batch(
            indexed_sentences, BATCH_SIZE, WINDOW_SIZE, vocab_size, neg_probs_np
        )
        
        targets = torch.LongTensor(targets_np).to(DEVICE)
        contexts = torch.LongTensor(contexts_np).to(DEVICE)
        
        # Skip-gram loss
        pos_scores = rsr_model(targets, contexts)
        neg_contexts = torch.multinomial(neg_probs_torch, len(targets) * NEG_SAMPLES, replacement=True)
        neg_targets = targets.repeat_interleave(NEG_SAMPLES)
        neg_scores = rsr_model(neg_targets, neg_contexts)
        
        all_scores = torch.cat([pos_scores, neg_scores])
        all_labels = torch.cat([pos_labels[:len(targets)], neg_labels[:len(targets)*NEG_SAMPLES]])
        L_w2v = loss_fn(all_scores, all_labels)
        
        # RSR loss (periodically)
        L_rsr = torch.tensor(0.0, device=DEVICE)
        if batch_idx % RSR_EVERY_N_BATCHES == 0:
            things_emb = rsr_model.target_embeddings(valid_idx_tensor)
            things_emb_norm = F.normalize(things_emb, p=2, dim=1)
            sim_matrix = things_emb_norm @ things_emb_norm.T
            L_rsr = F.mse_loss(sim_matrix, behav_sim_target)
            rsr_count += 1
        
        # Combined loss
        L_total = L_w2v + LAMBDA_RSR * L_rsr
        
        rsr_optimizer.zero_grad()
        L_total.backward()
        rsr_optimizer.step()
        
        total_w2v_loss += L_w2v.item()
        total_rsr_loss += L_rsr.item()
        
        if batch_idx % 100 == 0:
            pbar.set_postfix({'w2v': f'{L_w2v.item():.4f}', 'rsr': f'{L_rsr.item():.4f}'})
    
    avg_w2v = total_w2v_loss / BATCHES_PER_EPOCH
    avg_rsr = total_rsr_loss / max(1, rsr_count)
    print(f"Epoch {epoch+1} | W2V: {avg_w2v:.4f} | RSR: {avg_rsr:.4f}")

# Extract RSR embeddings
X_rsr = rsr_model.target_embeddings(valid_idx_tensor).detach().cpu().numpy()
print(f"\n RSR training complete! Embeddings: {X_rsr.shape}")


STEP 8: Training RSR Word2Vec (Wikipedia + Human Similarity)
Using FAST on-the-fly sampling + RSR regularization

Training with RSR every 50 batches...
Expected time: ~125 minutes



RSR Epoch 1/5: 100%|██████████| 10000/10000 [06:58<00:00, 23.92it/s, w2v=0.3580, rsr=0.0050]


Epoch 1 | W2V: 0.3789 | RSR: 0.0079


RSR Epoch 2/5: 100%|██████████| 10000/10000 [05:44<00:00, 29.02it/s, w2v=0.3482, rsr=0.0054]


Epoch 2 | W2V: 0.3526 | RSR: 0.0052


RSR Epoch 3/5: 100%|██████████| 10000/10000 [04:09<00:00, 40.07it/s, w2v=0.3447, rsr=0.0054]


Epoch 3 | W2V: 0.3461 | RSR: 0.0054


RSR Epoch 4/5: 100%|██████████| 10000/10000 [04:09<00:00, 40.06it/s, w2v=0.3444, rsr=0.0054]


Epoch 4 | W2V: 0.3425 | RSR: 0.0054


RSR Epoch 5/5: 100%|██████████| 10000/10000 [04:12<00:00, 39.56it/s, w2v=0.3371, rsr=0.0054]

Epoch 5 | W2V: 0.3401 | RSR: 0.0054

✓ RSR training complete! Embeddings: (1386, 300)





## Step 9: Compare on Downstream Task (THINGS Category Prediction)

In [None]:
print("="*70)
print("STEP 9: Comparing Models on THINGS Category Prediction")
print("="*70)
print("Task: Predict 27 binary category labels from embeddings")
print("Method: Logistic regression with 80/20 train/test split")
print("="*70 + "\n")

def evaluate_embeddings(X, Y, C=1.0, test_size=0.2, random_state=42):
    """Evaluate embeddings on category prediction. Returns mean F1."""
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    f1_scores = []
    for f in range(Y.shape[1]):
        y = Y[:, f]
        if np.all(y == y[0]):
            continue
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=test_size, random_state=random_state, stratify=y
        )
        clf = LogisticRegression(C=C, max_iter=1000)
        clf.fit(X_train, y_train)
        f1_scores.append(f1_score(y_test, clf.predict(X_test)))
    
    return float(np.mean(f1_scores)), float(np.std(f1_scores))

# Evaluate both models
vanilla_f1, vanilla_std = evaluate_embeddings(X_vanilla, Y)
rsr_f1, rsr_std = evaluate_embeddings(X_rsr, Y)

# Results
print("="*70)
print("RESULTS: THINGS Category Prediction (F1 Score)")
print("="*70)
print(f"  VANILLA (Wikipedia only):     F1 = {vanilla_f1:.3f} ± {vanilla_std:.3f}")
print(f"  RSR (Wikipedia + Human Sim):  F1 = {rsr_f1:.3f} ± {rsr_std:.3f}")
print("="*70)

delta = rsr_f1 - vanilla_f1
print(f"\n{'='*70}")
if delta > 0.01:
    print(f"✓ RSR IMPROVED performance by {delta:.3f} F1 points!")
    print(f"  Human similarity judgments help category prediction!")
elif delta > 0:
    print(f"~ Slight improvement: Δ = {delta:.3f} F1")
else:
    print(f"✗ No improvement: Δ = {delta:.3f} F1")
print("="*70)

STEP 9: Comparing Models on THINGS Category Prediction
Task: Predict 27 binary category labels from embeddings
Method: Logistic regression with 80/20 train/test split

RESULTS: THINGS Category Prediction (F1 Score)
  VANILLA (Wikipedia only):     F1 = 0.340 ± 0.260
  RSR (Wikipedia + Human Sim):  F1 = 0.573 ± 0.253

✓ RSR IMPROVED performance by 0.233 F1 points!
  Human similarity judgments help category prediction!


## Bonus: Nearest Neighbor Analysis

In [None]:
from numpy.linalg import norm

def cosine_sim(a, b):
    return float(np.dot(a, b) / (norm(a) * norm(b) + 1e-8))

def nearest_neighbors(word, embeddings_dict, k=5):
    if word not in embeddings_dict:
        return []
    vec = embeddings_dict[word]
    sims = [(w, cosine_sim(vec, v)) for w, v in embeddings_dict.items() if w != word]
    return sorted(sims, key=lambda x: x[1], reverse=True)[:k]

# Create word -> embedding dicts
vanilla_dict = {w: X_vanilla[i] for i, w in enumerate(valid_concepts)}
rsr_dict = {w: X_rsr[i] for i, w in enumerate(valid_concepts)}

# Compare nearest neighbors
test_words = ["cat", "dog", "car", "hammer", "apple"]

print("\n" + "="*70)
print("NEAREST NEIGHBOR COMPARISON")
print("="*70 + "\n")

for word in test_words:
    if word not in vanilla_dict:
        continue
    print(f"=== '{word}' ===")
    print("VANILLA:", [w for w, _ in nearest_neighbors(word, vanilla_dict, 5)])
    print("RSR:    ", [w for w, _ in nearest_neighbors(word, rsr_dict, 5)])
    print()


NEAREST NEIGHBOR COMPARISON

=== 'cat' ===
VANILLA: ['dog', 'badger', 'mouse', 'owl', 'poodle']
RSR:     ['meerkat', 'chipmunk', 'warthog', 'pug', 'mongoose']

=== 'dog' ===
VANILLA: ['cat', 'poodle', 'pig', 'sled', 'hyena']
RSR:     ['poodle', 'meerkat', 'pug', 'alpaca', 'mongoose']

=== 'car' ===
VANILLA: ['truck', 'limousine', 'motorcycle', 'minivan', 'engine']
RSR:     ['minivan', 'jeep', 'limousine', 'sidecar', 'hearse']

=== 'hammer' ===
VANILLA: ['nail', 'scarecrow', 'goblet', 'anvil', 'stirrup']
RSR:     ['pliers', 'chisel', 'screwdriver', 'sledgehammer', 'crowbar']

=== 'apple' ===
VANILLA: ['blackberry', 'rhubarb', 'lemon', 'laptop', 'juice']
RSR:     ['blackberry', 'mango', 'cantaloupe', 'mulberry', 'guacamole']



## Save Models

In [None]:
# Save both models for later use
import os
os.makedirs("results", exist_ok=True)

torch.save({
    'model_state_dict': vanilla_model.state_dict(),
    'vocab_size': vocab_size,
    'embedding_dim': EMBEDDING_DIM,
    'word2idx': word2idx,
    'idx2word': idx2word,
}, "results/vanilla_word2vec.pt")

torch.save({
    'model_state_dict': rsr_model.state_dict(),
    'vocab_size': vocab_size,
    'embedding_dim': EMBEDDING_DIM,
    'word2idx': word2idx,
    'idx2word': idx2word,
}, "results/rsr_word2vec.pt")

print("✓ Models saved to results/")

✓ Models saved to results/
