## **NLP A1**

### **1.2 Modify the Word2Vec (with & without negative sampling) and GloVe from the lab lecture**

In [17]:
# Load all modules
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
from nltk.corpus import reuters
import nltk
import matplotlib.pyplot as plt

In [18]:
# Download NLTK data (‡∏£‡∏±‡∏ô‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡πÅ‡∏£‡∏Å)
nltk.download('reuters')
nltk.download('punkt')
nltk.download('punkt_tab')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


[nltk_data] Downloading package reuters to C:\Users\Legion 5
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Legion 5
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Legion 5
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [19]:
# 1. LOAD REAL CORPUS (Reuters News)

def load_reuters_corpus(max_sents=10000):
    """Load Reuters news as List[List[str]]"""
    sents = []
    for fileid in reuters.fileids():
        for sent in reuters.sents(fileid):
            tokens = [w.lower() for w in sent if w.isalpha() and len(w) >= 2]
            if len(tokens) >= 3:
                sents.append(tokens)
        if max_sents and len(sents) >= max_sents:
            break
    return sents

corpus = load_reuters_corpus(max_sents=10000)
print(f"Loaded {len(corpus)} sentences")

Loaded 10007 sentences


In [20]:
# 2. VOCAB + WORD2INDEX (‡πÉ‡∏ä‡πâ‡∏£‡πà‡∏ß‡∏° Word2Vec/GloVe)

flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab.append('UNK')
vocab = vocab[:20000]  # ‡∏à‡∏≥‡∏Å‡∏±‡∏î vocab ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÄ‡∏ó‡∏£‡∏ô‡πÄ‡∏£‡πá‡∏ß

word2index = {w: i for i, w in enumerate(vocab)}
index2word = {i: w for w, i in word2index.items()}
voc_size = len(vocab)
print(f"Vocab size: {voc_size}")

Vocab size: 12650


In [21]:
# 3. WORD2VEC FUNCTIONS (Dynamic Window)

def build_skipgrams(corpus, word2index, window_size=2):
    """‡∏™‡∏£‡πâ‡∏≤‡∏á skip-gram pairs ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î ‡∏£‡∏±‡∏ö window_size"""
    pairs = []
    for sent in corpus:
        indices = [word2index.get(w, word2index['UNK']) for w in sent]
        for i, center in enumerate(indices):
            start = max(0, i - window_size)
            end = min(len(indices), i + window_size + 1)
            for j in range(start, end):
                if j != i:
                    pairs.append((center, indices[j]))
    return pairs

def randombatch_with_neg(batch_size, skipgrams, neg_probs, num_negatives=5):
    """‡∏™‡∏∏‡πà‡∏° batch ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö negative sampling"""
    indices = np.random.choice(len(skipgrams), batch_size, replace=False)
    centers, positives, negatives = [], [], []
    for idx in indices:
        c, p = skipgrams[idx]
        centers.append(c)
        positives.append(p)
        negs = np.random.choice(len(neg_probs), size=num_negatives, p=neg_probs)
        negatives.append(negs)
    return (
        torch.from_numpy(np.array(centers, dtype=np.int64)),
        torch.from_numpy(np.array(positives, dtype=np.int64)),
        torch.from_numpy(np.array(negatives, dtype=np.int64))
    )

# Negative sampling distribution
word_counts = np.zeros(voc_size)
for sent in corpus:
    for w in sent:
        idx = word2index.get(w, word2index['UNK'])
        word_counts[idx] += 1
neg_sampling_probs = word_counts ** 0.75 / (word_counts ** 0.75).sum()

In [22]:
# 4. GLOVE FUNCTIONS (Dynamic Window)

def build_cooccurrence(corpus, window_size=2):
    """‡∏™‡∏£‡πâ‡∏≤‡∏á co-occurrence matrix ‡∏£‡∏±‡∏ö window_size"""
    cooc = Counter()
    for sent in corpus:
        for i, w in enumerate(sent):
            start = max(0, i - window_size)
            end = min(len(sent), i + window_size + 1)
            for j in range(start, end):
                if j != i:
                    cooc[(w, sent[j])] += 1
    return cooc

def weighting(wi, wj, Xik, xmax=100, alpha=0.75):
    """GloVe weighting function"""
    x = Xik.get((wi, wj), 1)
    if x < xmax:
        return (x / xmax) ** alpha
    return 1.0

def randombatch_glove(batch_size, cooc, weightingdic):
    """‡∏™‡∏∏‡πà‡∏° batch ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö GloVe"""
    pairs = list(cooc.keys())
    indices = np.random.choice(len(pairs), batch_size, replace=False)
    inputs, targets, coocs, weightings = [], [], [], []
    for idx in indices:
        wi, wj = pairs[idx]
        i, j = word2index.get(wi, 0), word2index.get(wj, 0)
        inputs.append(i)
        targets.append(j)
        coocs.append(np.log(cooc[(wi, wj)]))
        weightings.append(weighting(wi, wj, cooc))
    return (
        np.array(inputs, dtype=np.int64),
        np.array(targets, dtype=np.int64),
        np.array(coocs, dtype=np.float32),
        np.array(weightings, dtype=np.float32)
    )


In [23]:
# 5. MODELS

class SkipgramNegSampling(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.v_embed = nn.Embedding(vocab_size, emb_size)
        self.u_embed = nn.Embedding(vocab_size, emb_size)

    def forward(self, center_words, pos_words, neg_words):
        batch_size = center_words.size(0)
        v = self.v_embed(center_words)
        u_pos = self.u_embed(pos_words)
        u_neg = self.u_embed(neg_words)

        pos_score = torch.sum(v * u_pos, dim=1)
        neg_score = torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)

        pos_loss = F.logsigmoid(pos_score)
        neg_loss = F.logsigmoid(-neg_score).sum(1)
        loss = -(pos_loss + neg_loss).mean()
        return loss

class GloVeModel(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.v_embed = nn.Embedding(vocab_size, emb_size)
        self.u_embed = nn.Embedding(vocab_size, emb_size)
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)

    def forward(self, center_words, target_words, coocs, weighting):
        v = self.v_embed(center_words)
        u = self.u_embed(target_words)
        v_bias = self.v_bias(center_words).squeeze(1)
        u_bias = self.u_bias(target_words).squeeze(1)

        inner_prod = torch.sum(v * u, dim=1)
        loss = weighting * (inner_prod + v_bias + u_bias - coocs) ** 2
        return loss.sum()


In [24]:
# 6. TRAINING (Word2Vec NS)

window_size = 2
skipgrams = build_skipgrams(corpus, word2index, window_size)
print(f"Skipgrams: {len(skipgrams)} pairs")

emb_size = 100
batch_size = 256
num_epochs = 2000

model_ns = SkipgramNegSampling(voc_size, emb_size).to(device)
optimizer_ns = optim.Adam(model_ns.parameters(), lr=0.001)

print("Training Word2Vec (Negative Sampling)...")
for epoch in range(num_epochs):
    centers, positives, negatives = randombatch_with_neg(
        batch_size, skipgrams, neg_sampling_probs, num_negatives=5
    )
    centers = centers.to(device)
    positives = positives.to(device)
    negatives = negatives.to(device)

    optimizer_ns.zero_grad()
    loss = model_ns(centers, positives, negatives)
    loss.backward()
    optimizer_ns.step()

    if (epoch + 1) % 500 == 0:
        print(f"Epoch {epoch+1}, loss = {loss.item():.4f}")

Skipgrams: 868334 pairs
Training Word2Vec (Negative Sampling)...
Epoch 500, loss = 20.1012
Epoch 1000, loss = 19.4003
Epoch 1500, loss = 17.9985
Epoch 2000, loss = 15.3495


In [25]:
# 7. TRAINING (GloVe)

cooc = build_cooccurrence(corpus, window_size)
weightingdic = {(w1, w2): weighting(w1, w2, cooc) for (w1, w2) in cooc}
print(f"Co-occurrence pairs: {len(cooc)}")

emb_size = 100
batch_size = 512 # Increase batch_size to prevent high varience
num_epochs = 2500

model_glove = GloVeModel(voc_size, emb_size).to(device)
optimizer_glove = optim.Adam(model_glove.parameters(), lr=0.005) # Might reduce learning rate to prevent overshoot minimum

print("Training GloVe...")
for epoch in range(num_epochs):
    input_b, target_b, cooc_b, weight_b = randombatch_glove(batch_size, cooc, weightingdic)
    input_b = torch.from_numpy(input_b).to(device)
    target_b = torch.from_numpy(target_b).to(device)
    cooc_b = torch.from_numpy(cooc_b).to(device)
    weight_b = torch.from_numpy(weight_b).to(device)

    optimizer_glove.zero_grad()
    loss = model_glove(input_b.unsqueeze(1), target_b.unsqueeze(1), cooc_b.unsqueeze(1), weight_b.unsqueeze(1))
    loss.backward()
    optimizer_glove.step()

    if (epoch + 1) % 500 == 0:
        print(f"GloVe Epoch {epoch+1}, loss = {loss.item():.4f}")

print("Training completed! Both Word2Vec (NS) and GloVe ready.")

Co-occurrence pairs: 344434
Training GloVe...
GloVe Epoch 500, loss = 7011.0259
GloVe Epoch 1000, loss = 5438.5010
GloVe Epoch 1500, loss = 3306.8403
GloVe Epoch 2000, loss = 3165.7822
GloVe Epoch 2500, loss = 3275.2668
Training completed! Both Word2Vec (NS) and GloVe ready.


### **TASK 2: Model Comparison**

In [26]:
# Training Loss/Time Table

import time
from datetime import datetime

# ‡∏ß‡∏±‡∏î‡πÄ‡∏ß‡∏•‡∏≤‡πÄ‡∏ó‡∏£‡∏ô (‡∏™‡∏°‡∏°‡∏ï‡∏¥‡πÄ‡∏ó‡∏£‡∏ô‡πÉ‡∏´‡∏°‡πà ‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏ä‡πâ‡∏Ñ‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á)
def train_and_time(model_class, train_fn, name, epochs=1000):
    start_time = time.time()
    model = model_class(voc_size, emb_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.005)
    
    for epoch in range(epochs):
        # train_fn() = ‡∏™‡∏∏‡πà‡∏° batch + train step
        loss = train_fn(model, optimizer)
    
    train_time = time.time() - start_time
    final_loss = loss.item()
    return model, final_loss, train_time

# Skip-gram NS (‡∏à‡∏≤‡∏Å Task 1)
def ns_train_step(model, optimizer):
    centers, positives, negatives = randombatch_with_neg(batch_size, skipgrams, neg_sampling_probs)
    centers, positives, negatives = centers.to(device), positives.to(device), negatives.to(device)
    optimizer.zero_grad()
    loss = model(centers, positives, negatives)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    return loss

# GloVe train step (‡∏à‡∏≤‡∏Å Task 1)
def glove_train_step(model, optimizer):
    input_b, target_b, cooc_b, weight_b = randombatch_glove(batch_size, cooc, weightingdic)
    input_b = torch.LongTensor(input_b).unsqueeze(1).to(device)
    target_b = torch.LongTensor(target_b).unsqueeze(1).to(device)
    cooc_b = torch.FloatTensor(cooc_b).unsqueeze(1).to(device)
    weight_b = torch.FloatTensor(weight_b).unsqueeze(1).to(device)
    optimizer.zero_grad()
    loss = model(input_b, target_b, cooc_b, weight_b)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    optimizer.step()
    return loss

# Table 1: Training Loss/Time (window_size=2)
results = {
    'Skip-gram (NS)': train_and_time(SkipgramNegSampling, ns_train_step, 'Skip-gram (NS)'),
    'GloVe': train_and_time(GloVeModel, glove_train_step, 'GloVe')
}

print("## Table 1: Training Results (Window=2)")
print("| Model | Window Size | Training Loss | Training Time (s) |")
print("|-------|-------------|---------------|-------------------|")
for name, (model, loss, time_s) in results.items():
    print(f"| {name} | 2 | {loss:.2f} | {time_s:.1f} |")



## Table 1: Training Results (Window=2)
| Model | Window Size | Training Loss | Training Time (s) |
|-------|-------------|---------------|-------------------|
| Skip-gram (NS) | 2 | 7.95 | 56.6 |
| GloVe | 2 | 4595.22 | 14.4 |


In [27]:
# Intrinsic Eval: Syntactic + Semantic Accuracy (Google Analogy Test Set)

# Download Google analogy dataset
import requests
url = "https://github.com/tomsercu/lstm/raw/master/data/questions-words.txt"
response = requests.get(url)
lines = response.text.splitlines()

def parse_analogies(lines):
    syntactic, semantic = [], []
    current_category = None  # ‡πÄ‡∏Å‡πá‡∏ö category ‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô
    
    for line in lines:
        line = line.strip()
        if line.startswith(':'):
            current_category = line[2:]  # ‡∏≠‡∏±‡∏û‡πÄ‡∏î‡∏ó category
            continue
        if line and current_category:  # ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ‡∏Ñ‡∏≥ + category
            words = line.split()
            if len(words) == 4:
                if 'gram' in current_category:  # syntactic categories
                    syntactic.append(words)
                elif 'cat' in current_category:  # semantic categories
                    semantic.append(words)
    
    return syntactic[:5000], semantic[:5000]

syntactic, semantic = parse_analogies(lines)
print(f"Syntactic: {len(syntactic)} questions")
print(f"Semantic: {len(semantic)} questions")


def get_embedding(model, word):
    """‡∏î‡∏∂‡∏á embedding ‡∏à‡∏≤‡∏Å model"""
    if word not in word2index:
        return None
    idx = word2index[word]
    with torch.no_grad():
        v_emb = model.v_embed(torch.tensor([idx]).to(device))
        if hasattr(model, 'u_embed'):
            u_emb = model.u_embed(torch.tensor([idx]).to(device))
            emb = (v_emb + u_emb) / 2
        else:
            emb = v_emb
        return emb.cpu().numpy().squeeze()

def analogy_accuracy(model, analogies):
    """‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì accuracy: a:b :: c:? ‚Üí argmax_d cos(b-a+c, d)"""
    correct = 0
    total = 0
    for a, b, c, d in analogies:
        emb_a = get_embedding(model, a)
        emb_b = get_embedding(model, b)
        emb_c = get_embedding(model, c)
        emb_d = get_embedding(model, d)
        
        if emb_a is None or emb_b is None or emb_c is None or emb_d is None:
            continue
            
        # Vector: b - a + c
        vec = emb_b - emb_a + emb_c
        
        # ‡∏´‡∏≤ word ‡∏ó‡∏µ‡πà cos sim ‡∏™‡∏π‡∏á‡∏™‡∏∏‡∏î (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏° a,b,c)
        best_word = None
        best_sim = -1
        for word in vocab[:5000]:  # test 5000 ‡∏Ñ‡∏≥‡πÄ‡∏£‡πá‡∏ß
            if word in [a, b, c]:
                continue
            emb_w = get_embedding(model, word)
            if emb_w is not None:
                sim = np.dot(vec, emb_w) / (np.linalg.norm(vec) * np.linalg.norm(emb_w))
                if sim > best_sim:
                    best_sim = sim
                    best_word = word
        
        if best_word == d:
            correct += 1
        total += 1
    
    return correct / total * 100 if total > 0 else 0

# ‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì accuracy
syn_ns = analogy_accuracy(model_ns, syntactic)
sem_ns = analogy_accuracy(model_ns, semantic)
syn_glove = analogy_accuracy(model_glove, syntactic)
sem_glove = analogy_accuracy(model_glove, semantic)

print("\n## Table 2: Intrinsic Evaluation (Google Analogy)")
print("| Model | Window Size | Syntactic Acc (%) | Semantic Acc (%) |")
print("|-------|-------------|-------------------|------------------|")
print(f"| Skip-gram (NS) | 2 | {syn_ns:.2f} | {sem_ns:.2f} |")
print(f"| GloVe | 2 | {syn_glove:.2f} | {sem_glove:.2f} |")



Syntactic: 0 questions
Semantic: 0 questions

## Table 2: Intrinsic Evaluation (Google Analogy)
| Model | Window Size | Syntactic Acc (%) | Semantic Acc (%) |
|-------|-------------|-------------------|------------------|
| Skip-gram (NS) | 2 | 0.00 | 0.00 |
| GloVe | 2 | 0.00 | 0.00 |


In [28]:
import pandas as pd
import numpy as np
from scipy import stats

# üìÅ ‡πÇ‡∏´‡∏•‡∏î WordSim-353 ‡∏à‡∏≤‡∏Å‡πÑ‡∏ü‡∏•‡πå local 
wsim_df = pd.read_csv("../dataset/combined.csv")  # ‡∏£‡∏±‡∏ô‡πÉ‡∏ô folder ‡∏ó‡∏µ‡πà‡∏°‡∏µ‡πÑ‡∏ü‡∏•‡πå
wsim_data = []
for _, row in wsim_df.iterrows():
    w1, w2, score = row['Word 1'].lower(), row['Word 2'].lower(), float(row['Human (mean)'])
    wsim_data.append((w1, w2, score))

print(f"‚úÖ Loaded {len(wsim_data)} WordSim-353 pairs from combined.csv")
print(f"‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á: {wsim_data[:3]}")

‚úÖ Loaded 353 WordSim-353 pairs from combined.csv
‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á: [('love', 'sex', 6.77), ('tiger', 'cat', 7.35), ('tiger', 'tiger', 10.0)]


In [29]:
# Spearman correlation function (‡πÉ‡∏ä‡πâ get_embedding ‡∏à‡∏≤‡∏Å Task 2)
def spearman_correlation(model, word_pairs):
    model_sims = []
    human_sims = []
    for w1, w2, human_sim in word_pairs:
        emb1 = get_embedding(model, w1)
        emb2 = get_embedding(model, w2)
        if emb1 is not None and emb2 is not None:
            sim = np.dot(emb1, emb2)  # ‡πÉ‡∏ä‡πâ dot product ‡∏ï‡∏≤‡∏°‡πÇ‡∏à‡∏ó‡∏¢‡πå
            model_sims.append(sim)
            human_sims.append(human_sim)

    valid_pairs = len(model_sims)
    if valid_pairs < 2:
        return 0.0, valid_pairs
    corr = stats.spearmanr(model_sims, human_sims)[0]
    return corr, valid_pairs


# ‡∏£‡∏±‡∏ô correlation (‡πÉ‡∏ä‡πâ model_ns, model_glove ‡∏à‡∏≤‡∏Å Task 1)
corr_ns, n_ns = spearman_correlation(model_ns, wsim_data)
corr_glove, n_glove = spearman_correlation(model_glove, wsim_data)

print("\n## Table 3: WordSim-353 Spearman (combined.csv)")
print("| Model          | Spearman Corr | Valid Pairs |")
print("|----------------|---------------|-------------|")
print(f"| Skip-gram (NS) | {corr_ns:.3f} | {n_ns} |")
print(f"| GloVe          | {corr_glove:.3f} | {n_glove} |")



## Table 3: WordSim-353 Spearman (combined.csv)
| Model          | Spearman Corr | Valid Pairs |
|----------------|---------------|-------------|
| Skip-gram (NS) | -0.061 | 181 |
| GloVe          | 0.131 | 181 |


### **2.3 Word Similarity Evaluation**

- For the similarity evaluation, the WordSim-353 dataset (combined.csv) was used as the gold-standard similarity resource. For each word pair \((w_1, w_2)\) in the dataset, the similarity predicted by each model was computed as the **dot product** between the corresponding word embeddings. The human similarity scores provided in WordSim-353 were then compared with the model-predicted similarities using the **Spearman rank correlation coefficient** implemented in `scipy.stats.spearmanr`.  

- In this setup, the Skip-gram with negative sampling model achieved a Spearman correlation of approximately **-0.10**, while the GloVe model achieved a correlation of approximately **0.01**, based on **181 word pairs** for which both words were present in the models‚Äô vocabularies. These low correlation values indicate that, with the relatively small training corpus used in this assignment, the learned embeddings do **not yet capture human-like word similarity judgments effectively**.

### **Export Models, Vocabs, Context Embeddings**

In [30]:
# Save Skip-gram (NEG)
MODEL_WORD2VEC = "skipgram_ns.pt"
torch.save(model_ns.state_dict(), MODEL_WORD2VEC)
print("Saved:", MODEL_WORD2VEC)

# Save GloVe
MODEL_GLOVE = "glove_model.pt"
torch.save(model_glove.state_dict(), MODEL_GLOVE)
print("Saved:", MODEL_GLOVE)

Saved: skipgram_ns.pt
Saved: glove_model.pt


In [31]:
# word2index, vocab using pickle 
import pickle

with open("word2index.pkl", "wb") as f:
    pickle.dump(word2index, f)

with open("index2word.pkl", "wb") as f:
    pickle.dump(index2word, f)

In [34]:
# corpus sentenses
# corpus ‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ‡∏Ñ‡∏∑‡∏≠ List[List[str]] ‡∏à‡∏≤‡∏Å load_reuters_corpus()

print(type(corpus), len(corpus))  # ‡πÄ‡∏ä‡πá‡∏Å‡πÄ‡∏•‡πà‡∏ô ‡πÜ

with open("corpus_sentences.pkl", "wb") as f:
    pickle.dump(corpus, f)

print("Saved corpus_sentences.pkl")

<class 'list'> 10007
Saved corpus_sentences.pkl


#### `‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏π‡πâ‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏≠‡πà‡∏≤‡∏ô‡πÅ‡∏ö‡∏ö‡πÑ‡∏°‡πà‡∏à‡∏° syntax ‡πÑ‡∏õ‡∏ù‡∏∂‡∏Å‡∏°‡∏≤‡∏î‡πâ‡∏ß‡∏¢`