In [7]:
import nltk
from nltk.corpus import brown
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter
import random
from scipy.stats import spearmanr
import pandas as pd

#############################################
# Data Preparation
#############################################

def get_corpus():

    nltk.download('brown')
    corpus = [list(map(str.lower, sent)) for sent in brown.sents()]
    return corpus

def build_vocab(corpus, min_count=5):
    word_counts = Counter()
    for sent in corpus:
        word_counts.update(sent)

    vocab = [w for w, cnt in word_counts.items() if cnt >= min_count]
    
    vocab = ["<pad>", "<unk>"] + vocab  # Ensure <pad> and <unk> have lowest indices
    
    word2id = {w: i for i, w in enumerate(vocab)}
    id2word = {i: w for w, i in word2id.items()}

    return vocab, word2id, id2word

def find_top_k_similar_words(word, k, embeddings, word2id, id2word):
    if word not in word2id:
        print(f"Word '{word}' not in vocabulary.")
        
        return []

    word_idx = word2id[word]
    word_vector = embeddings[word_idx].unsqueeze(0)  # Shape (1, embedding_dim)
    
    # Compute cosine similarity
    similarities = torch.nn.functional.cosine_similarity(word_vector, embeddings)
    
    # Get top k indices (excluding the word itself)
    top_k_indices = similarities.argsort(descending=True)[1:k+1]
    
    # Map indices back to words
    top_k_words = [id2word[idx.item()] for idx in top_k_indices]
    
    return top_k_words

#############################################
# 1. SVD Implementation
#############################################

import numpy as np
import torch
import scipy.sparse as sp
from scipy.sparse.linalg import svds



#############################################
# 4. Word Similarity Evaluation (WordSim-353)
#############################################

def evaluate_wordsim(embeddings, word2id, wordsim_file='wordsim353.csv'):
    """
    Assumes wordsim_file is a CSV with columns: 'Word 1', 'Word 2', 'Human (mean)'.
    Only evaluates pairs where both words are in word2id.
    """
    df = pd.read_csv(wordsim_file)
    computed_sims = []
    human_scores = []
    
    for _, row in df.iterrows():
        w1 = row['Word 1'].lower()  # ensure lowercase for matching
        w2 = row['Word 2'].lower()
        score = row['Human (Mean)']
        if w1 in word2id and w2 in word2id:
            vec1 = embeddings[word2id[w1]].unsqueeze(0)
            vec2 = embeddings[word2id[w2]].unsqueeze(0)
            sim = F.cosine_similarity(vec1, vec2).item()
            computed_sims.append(sim)
            human_scores.append(score)
    if computed_sims:
        correlation, _ = spearmanr(computed_sims, human_scores)
        print(f"Spearman Correlation: {correlation:.4f}")
    else:
        print("No word pairs found in embeddings for evaluation.")



In [8]:

def train_svd(corpus, vocab, window_size=2, embed_size=100, device=torch.device("cpu")):
    vocab_size = len(vocab)
    word2id = {word: i for i, word in enumerate(vocab)}
    UNK_ID = word2id["<unk>"]
    PAD_ID = word2id["<pad>"]


    # Build sparse co-occurrence matrix using coordinate format (COO)
    data = []
    rows = []
    cols = []

    for sentence in corpus:
        sentence = [word if word in word2id else "<unk>" for word in sentence]
        indices = [word2id[word] for word in sentence]

        # Pad for window size at sentence boundaries
        padded_indices = [PAD_ID] * window_size + indices + [PAD_ID] * window_size

        for i in range(window_size, len(padded_indices) - window_size):
            center = padded_indices[i]
            for j in range(-window_size, window_size + 1):
                if j != 0:  # Skip center word itself
                    rows.append(center)
                    cols.append(padded_indices[i + j])
                    data.append(1)
    from scipy.sparse import coo_matrix
    cooc_matrix = coo_matrix((data, (rows, cols)), shape=(vocab_size, vocab_size))
    
    # Compute truncated SVD on the sparse matrix
    U, s, Vt = svds(cooc_matrix, k=embed_size)
    # svds does not guarantee sorted order; sort singular values/vectors in descending order
    idx = np.argsort(s)[::-1]
    s = s[idx]
    U = U[:, idx]
    
    # Form the word embeddings (scale U by sqrt(singular values))
    embeddings = np.dot(U, np.diag(np.sqrt(s)))
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float).to(device)
    torch.save(embeddings_tensor, 'svd_embeddings.pt')
    print("SVD embeddings saved as svd_embeddings.pt")

In [9]:
# Select device: GPU if available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

corpus = get_corpus()
vocab, word2id, id2word = build_vocab(corpus, min_count=5)



Using device: cpu


[nltk_data] Downloading package brown to
[nltk_data]     /home/cool_mayank/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [10]:
# 1. Train SVD-based embeddings
train_svd(corpus, vocab, window_size=2, embed_size=100, device=device)


svd_embeddings = torch.load('svd_embeddings.pt', map_location=device)
svd_word2id = {w: i for i, w in enumerate(vocab)}
print("\nEvaluating SVD Embeddings:")
evaluate_wordsim(svd_embeddings, svd_word2id, wordsim_file='wordsim353.csv')

SVD embeddings saved as svd_embeddings.pt

Evaluating SVD Embeddings:


  svd_embeddings = torch.load('svd_embeddings.pt', map_location=device)


FileNotFoundError: [Errno 2] No such file or directory: 'wordsim353.csv'

In [10]:
import torch
import numpy as np
from collections import Counter
from tqdm import tqdm

class CBOWDataset(torch.utils.data.Dataset):
    def __init__(self, corpus, word2id, window_size=2, num_negative=5, neg_sample_pool_size=100000):
        self.data = []
        self.word2id = word2id
        self.window_size = window_size
        self.num_negative = num_negative
        self.pad_id = word2id["<pad>"]
        self.unk_id = word2id["<unk>"]
        self.neg_sample_pool_size = neg_sample_pool_size

        # Compute word frequencies for negative sampling
        self.word_freq = Counter(word for sent in corpus for word in sent if word in word2id)
        valid_words = [w for w in self.word_freq.keys() if w in word2id]

        # Build negative sampling probabilities
        total = sum(self.word_freq[w]**0.75 for w in valid_words)
        self.neg_probs = np.array([(self.word_freq[w]**0.75) / total for w in valid_words])
        self.word_list = [word2id[w] for w in valid_words]  # Convert words to IDs

        # Precompute negative samples
        self.negative_samples = np.random.choice(self.word_list, size=self.neg_sample_pool_size, p=self.neg_probs)
        self.neg_index = 0  # Pointer for sampling negatives

        # Create (context, target) pairs
        for sent in corpus:
            sent = [word if word in word2id else "<unk>" for word in sent]
            indices = [word2id[word] for word in sent]

            # Pad at sentence boundaries
            padded_indices = [self.pad_id] * window_size + indices + [self.pad_id] * window_size

            for i in range(window_size, len(padded_indices) - window_size):
                target = padded_indices[i]
                context = [padded_indices[j] for j in range(i - window_size, i + window_size + 1) if j != i]
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]

        # Fetch negative samples from the precomputed pool
        neg_end = self.neg_index + self.num_negative
        if neg_end > len(self.negative_samples):  # If exceeding buffer, reshuffle
            np.random.shuffle(self.negative_samples)
            self.neg_index = 0
            neg_end = self.num_negative
        negatives = self.negative_samples[self.neg_index:neg_end]
        self.neg_index += self.num_negative  # Move index forward

        return (torch.tensor(context, dtype=torch.long), 
                torch.tensor(target, dtype=torch.long), 
                torch.tensor(negatives, dtype=torch.long))



def cbow_collate(batch):
    contexts, targets, negatives = zip(*batch)
    contexts_padded = torch.nn.utils.rnn.pad_sequence(contexts, batch_first=True, padding_value=0)
    lengths = torch.tensor([len(c) for c in contexts])
    targets = torch.stack(targets)
    negatives = torch.stack(negatives)
    return contexts_padded, lengths, targets, negatives

class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, pad_idx):
        super(CBOWModel, self).__init__()
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding_dim = embedding_dim
        self.init_embeddings()
    
    def init_embeddings(self):
        initrange = 0.5 / self.embedding_dim
        self.in_embeddings.weight.data.uniform_(-initrange, initrange)
        self.out_embeddings.weight.data.uniform_(0, 0)
    
    def forward(self, contexts, lengths, targets, negatives):
        embeds = self.in_embeddings(contexts)  # (batch, max_context_length, embed_dim)
        mask = (contexts != 0).unsqueeze(2).float()
        summed = torch.sum(embeds * mask, dim=1)
        lengths = lengths.unsqueeze(1).float()
        context_embeds = summed / lengths
        
        target_embeds = self.out_embeddings(targets)           # (batch, embed_dim)
        neg_embeds = self.out_embeddings(negatives)            # (batch, num_negative, embed_dim)
        
        pos_score = torch.sum(context_embeds * target_embeds, dim=1)
        pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-10)
        
        neg_score = torch.bmm(neg_embeds, context_embeds.unsqueeze(2)).squeeze(2)
        neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_score) + 1e-10), dim=1)
        
        loss = pos_loss + neg_loss
        return loss.mean()


def train_cbow(corpus, word2id, embedding_dim=100, window_size=2, num_negative=5, epochs=5, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = CBOWDataset(corpus, word2id, window_size, num_negative)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=cbow_collate)

    pad_id = word2id["<pad>"]
    model = CBOWModel(len(word2id), embedding_dim, pad_idx=pad_id).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(epochs):
        total_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
        for contexts, lengths, targets, negatives in progress_bar:
            contexts, lengths, targets, negatives = contexts.to(device), lengths.to(device), targets.to(device), negatives.to(device)

            optimizer.zero_grad()
            loss = model(contexts, lengths, targets, negatives)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))

    
    torch.save(model.in_embeddings.state_dict(), 'cbow_embeddings_non_dyn.pt')
    print("CBOW embeddings saved as cbow_embeddings.pt")



# # 2. Train CBOW model embeddings
train_cbow(corpus, word2id, embedding_dim=100, window_size=2, num_negative=5, epochs=10, batch_size=256)
cbow_state = torch.load('cbow_embeddings_non_dyn.pt', map_location=device)

# If cbow_state is an OrderedDict, use it directly
if isinstance(cbow_state, dict):
    cbow_layer = nn.Embedding.from_pretrained(cbow_state['weight']).to(device)
else:
    cbow_layer = nn.Embedding(len(word2id) + 1, 100).to(device)
    cbow_layer.load_state_dict({'weight': cbow_state})

print("\nEvaluating CBOW Embeddings:")
evaluate_wordsim(cbow_layer.weight.data, word2id, wordsim_file='wordsim353.csv')

Epoch 1: 100%|██████████| 4536/4536 [00:55<00:00, 81.48it/s, loss=2.38]
Epoch 2: 100%|██████████| 4536/4536 [00:55<00:00, 81.53it/s, loss=1.99]
Epoch 3: 100%|██████████| 4536/4536 [00:55<00:00, 81.52it/s, loss=1.86]
Epoch 4: 100%|██████████| 4536/4536 [00:55<00:00, 81.30it/s, loss=1.77]
Epoch 5: 100%|██████████| 4536/4536 [00:56<00:00, 80.40it/s, loss=1.69]
Epoch 6: 100%|██████████| 4536/4536 [00:56<00:00, 80.71it/s, loss=1.64]
Epoch 7: 100%|██████████| 4536/4536 [00:56<00:00, 80.66it/s, loss=1.58]
Epoch 8: 100%|██████████| 4536/4536 [00:56<00:00, 80.48it/s, loss=1.54]
Epoch 9: 100%|██████████| 4536/4536 [00:57<00:00, 78.45it/s, loss=1.5] 
Epoch 10: 100%|██████████| 4536/4536 [00:56<00:00, 79.79it/s, loss=1.46]


CBOW embeddings saved as cbow_embeddings.pt

Evaluating CBOW Embeddings:
Spearman Correlation: 0.1653


In [12]:

class SkipGramDataset(torch.utils.data.Dataset):
    def __init__(self, corpus, word2id, window_size=2, num_negative=5, neg_sample_pool_size=100000):
        self.data = []
        self.word2id = word2id
        self.window_size = window_size
        self.num_negative = num_negative
        self.neg_sample_pool_size = neg_sample_pool_size

        UNK_ID = word2id["<unk>"]
        PAD_ID = word2id["<pad>"]

        # Compute word frequencies only for words in vocabulary
        self.word_freq = Counter()
        for sent in corpus:
            for word in sent:
                word_id = word2id.get(word, UNK_ID)  # Replace OOV with <unk>
                if word_id != PAD_ID:  # Ignore <pad>
                    self.word_freq[word] += 1

        # Only keep words that exist in word2id
        valid_words = [w for w in self.word_freq.keys() if w in word2id]

        # Build negative sampling probabilities
        total = sum(self.word_freq[w]**0.75 for w in valid_words)
        self.neg_probs = np.array([(self.word_freq[w]**0.75) / total for w in valid_words])
        self.word_list = [word2id[w] for w in valid_words]  # Convert to IDs for fast lookup

        # Precompute negative samples
        self.negative_samples = np.random.choice(self.word_list, size=self.neg_sample_pool_size, p=self.neg_probs)
        self.neg_index = 0  # Pointer to track position in the pool

        # Create (center, context) pairs
        for sent in corpus:
            sent = [word if word in word2id else "<unk>" for word in sent]  # Replace OOVs
            indices = [word2id[word] for word in sent if word2id[word] != PAD_ID]  # Remove <pad>

            for i, center in enumerate(indices):
                for j in range(max(0, i - window_size), min(len(indices), i + window_size + 1)):
                    if j != i:
                        context = indices[j]
                        self.data.append((center, context))

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        center, context = self.data[idx]

        # Fetch negative samples from the precomputed pool
        neg_end = self.neg_index + self.num_negative
        if neg_end > len(self.negative_samples):  # If exceeding buffer, reshuffle
            np.random.shuffle(self.negative_samples)
            self.neg_index = 0
            neg_end = self.num_negative
        negatives = self.negative_samples[self.neg_index:neg_end]
        self.neg_index += self.num_negative  # Move index forward

        return (torch.tensor(center, dtype=torch.long), 
                torch.tensor(context, dtype=torch.long), 
                torch.tensor(negatives, dtype=torch.long))


class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.init_embeddings()
    
    def init_embeddings(self):
        initrange = 0.5 / self.embedding_dim
        self.in_embeddings.weight.data.uniform_(-initrange, initrange)
        self.out_embeddings.weight.data.uniform_(0, 0)
    
    def forward(self, centers, targets, negatives):
        center_embeds = self.in_embeddings(centers)    # (batch, embed_dim)
        target_embeds = self.out_embeddings(targets)   # (batch, embed_dim)
        neg_embeds = self.out_embeddings(negatives)    # (batch, num_negative, embed_dim)
        
        pos_score = torch.sum(center_embeds * target_embeds, dim=1)
        pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-10)
        
        neg_score = torch.bmm(neg_embeds, center_embeds.unsqueeze(2)).squeeze(2)
        neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_score) + 1e-10), dim=1)
        
        loss = pos_loss + neg_loss
        return loss.mean()
from tqdm import tqdm

def train_skipgram(corpus, word2id, embedding_dim=100, window_size=2, num_negative=2, epochs=5, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = SkipGramDataset(corpus, word2id, window_size, num_negative)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    print(f"Number of batches: {len(dataloader)}, number of samples: {len(dataset)}, corpus length: {len(corpus)}")
    model = SkipGramModel(len(word2id), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(epochs):
        total_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
        
        for centers, targets, negatives in progress_bar:
            centers, targets, negatives = centers.to(device), targets.to(device), negatives.to(device)
            
            optimizer.zero_grad()
            loss = model(centers, targets, negatives)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))
        
        print(f"Skip-Gram Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")
    
    torch.save(model.in_embeddings.state_dict(), 'skipgram_embeddings.pt')
    print("Skip-Gram embeddings saved as skipgram_embeddings.pt")
    
# 3. Train Skip-Gram model embeddings
train_skipgram(corpus, word2id, embedding_dim=100, window_size=2, num_negative=5, epochs=10, batch_size=256)


Number of batches: 16803, number of samples: 4301382, corpus length: 57340


                                                                            

Skip-Gram Epoch 1, Loss: 2.3159


                                                                             

Skip-Gram Epoch 2, Loss: 2.1599


                                                                            

Skip-Gram Epoch 3, Loss: 2.1026


                                                                             

Skip-Gram Epoch 4, Loss: 2.0622


                                                                             

Skip-Gram Epoch 5, Loss: 2.0294


                                                                             

Skip-Gram Epoch 6, Loss: 2.0028


                                                                             

Skip-Gram Epoch 7, Loss: 1.9823


                                                                            

Skip-Gram Epoch 8, Loss: 1.9649


                                                                            

Skip-Gram Epoch 9, Loss: 1.9516


                                                                              

Skip-Gram Epoch 10, Loss: 1.9412
Skip-Gram embeddings saved as skipgram_embeddings.pt


In [8]:
skipgram_state = torch.load('skipgram_embeddings.pt', map_location=device)
skipgram_layer = nn.Embedding.from_pretrained(skipgram_state['weight']).to(device)

print("\nEvaluating Skip-Gram Embeddings:")
evaluate_wordsim(nn.Embedding.from_pretrained(skipgram_state['weight']).weight.data, word2id, wordsim_file='wordsim353.csv')



Evaluating Skip-Gram Embeddings:
Spearman Correlation: 0.2215


## Comparative Analysis of the Three Models

### 1. SkipGram

| Embedding Dimension | Spearman Correlation |
|---------------------|---------------------|
| 64                 | 0.1900              |
| 128                | 0.3417              |
| 256                | 0.3512              |
| 512                | 0.3128              |

**Observations:**  
- SkipGram shows a strong improvement from **64 → 128** dimensions and peaks at **256** dimensions.  
- The slight drop at **512** dims suggests overfitting, optimization issues, or diminishing returns.  
- Overall, SkipGram consistently captures semantic similarity better than the other two methods at most dimensional settings.

---

### 2. CBOW

| Embedding Dimension | Spearman Correlation |
|---------------------|---------------------|
| 64                 | 0.1750              |
| 128                | 0.1872              |
| 256                | 0.2187              |
| 512                | 0.2623              |

**Observations:**  
- CBOW improves steadily with increasing dimensions.  
- While it starts lower than SkipGram, the gap narrows at higher dimensions (**512** dims).  
- CBOW benefits from additional dimensions by gradually capturing more semantic relationships.

---

### 3. SVD (Count-based Method)

| Embedding Dimension | Spearman Correlation |
|---------------------|---------------------|
| 64                 | 0.0349              |
| 128                | 0.0783              |
| 256                | 0.1048              |
| 512                | 0.1458              |

**Observations:**  
- SVD consistently yields the lowest Spearman correlations compared to the predictive models.  
- However, it does improve steadily with larger embedding dimensions.  
- Despite improvements, it remains less effective at capturing nuanced contextual relationships compared to neural approaches.

---

## Overall Trends

### **Impact of Dimensionality**
For all models, increasing the embedding dimension tends to improve performance. However, the magnitude of improvement varies:
- **SkipGram** benefits significantly from **64 → 128** dimensions, peaking at **256**.
- **CBOW** improves steadily and becomes more competitive at **512** dimensions.
- **SVD** shows improvement but still lags behind the predictive models at all dimensions.

### **Model Comparison**
| Model  | Best Performance (Spearman) | Best Dimension |
|--------|-----------------------------|---------------|
| SkipGram | 0.3512 | 256 |
| CBOW | 0.2623 | 512 |
| SVD | 0.1458 | 512 |

- **SkipGram** leads in capturing semantic similarity, especially at moderate dimensions (**128–256**).
- **CBOW** closes the gap at higher dimensions, suggesting it benefits more from increased capacity.
- **SVD**, while improving, remains behind the neural models in effectiveness.

---

## Conclusion
The results indicate that predictive models (**SkipGram** and **CBOW**) outperform the **SVD** approach in capturing word similarity, with **SkipGram** showing the best performance at moderate embedding sizes (**128–256**). **CBOW**, while initially trailing, improves with higher dimensions. **SVD**, despite its improvements, remains less effective in this metric.

Link to all csv_results files and pt files: [text](https://drive.google.com/drive/folders/1bBFfzxD0TTrtN6nfVtOi4GVgVrE2I2vC?usp=sharing)
