# Word2Vec

## 1.1 Word2Vec Implementation
Task:
- CBOW / Skip-gram using gensim
- embedding dimensions 100,300,500
- subset of wikipedia data -> perform preprocessing
- evaluation using the WordSim353 dataset: compute Spearman's correlation coefficient between the embeddings' cosine similarities and the WordSim353 human similarity scores

### Preprocess the dataset

In [1]:
import os
import string
import bz2
import logging
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import spearmanr
import numpy as np
import re
from nltk.corpus import stopwords

# Configure the root logger at INFO level with a "time : level : message"
# format so that gensim's INFO messages are shown while the cell runs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def load_wikipedia_data(file_path):
    """
    Read a bz2-compressed text file and turn it into tokenized sentences.

    The whole file is decompressed into memory, lowercased, split into
    sentences with NLTK and each sentence is split into word tokens. Every
    token has ASCII punctuation (``string.punctuation``) stripped from it;
    tokens that end up empty, that are English stopwords, or that contain a
    digit are dropped.

    The decompressed text is tokenized as-is: this function does not strip
    any markup the file may contain.

    Parameters:
    - file_path: Path to the bz2-compressed, UTF-8 encoded text file.
                 Undecodable bytes are replaced rather than raising.

    Returns:
    - A list with one entry per detected sentence, each entry being the list
      of surviving tokens (possibly empty).
    """
    english_stopwords = set(stopwords.words('english'))
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    contains_digit = re.compile(r'\d').search

    with bz2.open(file_path, 'rt', encoding='utf-8', errors='replace') as handle:
        lowered_text = handle.read().lower()

    def clean_tokens(sentence):
        # Strip punctuation first, then filter on the stripped form.
        stripped = (token.translate(punctuation_stripper) for token in word_tokenize(sentence))
        return [
            token
            for token in stripped
            if token and token not in english_stopwords and not contains_digit(token)
        ]

    return [clean_tokens(sentence) for sentence in sent_tokenize(lowered_text)]


def load_wordsim353(file_path):
    """
    Parse a tab-separated WordSim353 file into word pairs and gold scores.

    Blank lines and any line containing "word1" (case-insensitive, i.e. the
    header) are ignored, as are lines with fewer than three tab-separated
    fields or whose third field is not a number. Columns beyond the third
    are ignored.

    Parameters:
    - file_path: Path to the UTF-8 encoded WordSim353 file.

    Returns:
    - (word_pairs, gold_scores): a list of lowercased (word1, word2) tuples
      and the parallel list of float similarity scores.
    """
    pairs, scores = [], []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            row = raw_line.strip()
            if not row or "word1" in raw_line.lower():
                continue
            fields = row.split('\t')
            if len(fields) < 3:
                continue
            try:
                value = float(fields[2])
            except ValueError:
                # Non-numeric score column: skip the row.
                continue
            pairs.append((fields[0].lower(), fields[1].lower()))
            scores.append(value)
    return pairs, scores

def evaluate_model(model, word_pairs, gold_scores):
    """
    Score a Word2Vec model against human similarity judgements.

    Only pairs whose two words are both present in ``model.wv.key_to_index``
    are used. For those, ``model.wv.similarity`` is compared with the gold
    scores through Spearman's rank correlation.

    Parameters:
    - model: Object exposing ``wv.key_to_index`` and ``wv.similarity``.
    - word_pairs: Iterable of (word1, word2) tuples.
    - gold_scores: Human scores, parallel to ``word_pairs``.

    Returns:
    - The Spearman correlation coefficient, or None when no pair is covered
      by the vocabulary.
    """
    covered = [
        (gold, model.wv.similarity(first, second))
        for (first, second), gold in zip(word_pairs, gold_scores)
        if first in model.wv.key_to_index and second in model.wv.key_to_index
    ]
    if not covered:
        return None

    reference = [gold for gold, _ in covered]
    predicted = [similarity for _, similarity in covered]
    rho, _pvalue = spearmanr(reference, predicted)
    return rho

def main():
    """
    Train CBOW Word2Vec models of several sizes and score them on WordSim353.

    The Wikipedia subset is preprocessed once; a model is then trained for
    each embedding dimension (100, 300, 500), saved to
    ``word2vec_cbow_<dim>d.model`` in the working directory and evaluated.
    Progress and a final summary are printed to stdout.
    """
    # Input files, resolved relative to the current working directory.
    wiki_file = "enwiki-20241201-pages-articles-multistream1.xml-p1p41242.bz2"
    wordsim_file = "wordsim353.txt"

    print("Loading and preprocessing Wikipedia data...")
    sentences = load_wikipedia_data(wiki_file)
    print(f"Loaded {len(sentences)} sentences.")

    word_pairs, gold_scores = load_wordsim353(wordsim_file)
    print(f"Loaded {len(word_pairs)} word pairs from WordSim353.")

    evaluation_results = {}
    for dim in (100, 300, 500):
        print(f"\nTraining Word2Vec model with embedding dimension: {dim}")

        # sg=0 selects CBOW; sg=1 would select Skip-gram.
        hyperparameters = dict(vector_size=dim, window=5, min_count=2, workers=4, sg=0)
        model = Word2Vec(sentences, **hyperparameters)

        model_filename = f"word2vec_cbow_{dim}d.model"
        model.save(model_filename)
        print(f"Model saved as {model_filename}")

        correlation = evaluate_model(model, word_pairs, gold_scores)
        if correlation is None:
            print(f"No valid word pairs found for evaluation with dimension {dim}.")
            continue
        print(f"Spearman's correlation for dimension {dim}: {correlation:.4f}")
        evaluation_results[dim] = correlation

    # Only dimensions that produced a correlation appear in the summary.
    print("\nSummary of Spearman's correlations:")
    for dim in evaluation_results:
        print(f"Dimension {dim}: {evaluation_results[dim]:.4f}")


if __name__ == "__main__":
    main()


Loading and preprocessing Wikipedia data...


2025-03-16 00:19:01,072 : INFO : collecting all words and their counts
2025-03-16 00:19:01,075 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-03-16 00:19:01,101 : INFO : PROGRESS: at sentence #10000, processed 261861 words, keeping 39978 word types
2025-03-16 00:19:01,139 : INFO : PROGRESS: at sentence #20000, processed 548952 words, keeping 66196 word types
2025-03-16 00:19:01,190 : INFO : PROGRESS: at sentence #30000, processed 858880 words, keeping 94114 word types
2025-03-16 00:19:01,233 : INFO : PROGRESS: at sentence #40000, processed 1163286 words, keeping 115624 word types


Loaded 2886887 sentences.
Loaded 353 word pairs from WordSim353.

Training Word2Vec model with embedding dimension: 100


2025-03-16 00:19:01,275 : INFO : PROGRESS: at sentence #50000, processed 1479668 words, keeping 139099 word types
2025-03-16 00:19:01,317 : INFO : PROGRESS: at sentence #60000, processed 1742833 words, keeping 156163 word types
2025-03-16 00:19:01,359 : INFO : PROGRESS: at sentence #70000, processed 2006401 words, keeping 174614 word types
2025-03-16 00:19:01,407 : INFO : PROGRESS: at sentence #80000, processed 2293153 words, keeping 195018 word types
2025-03-16 00:19:01,449 : INFO : PROGRESS: at sentence #90000, processed 2546917 words, keeping 211008 word types
2025-03-16 00:19:01,491 : INFO : PROGRESS: at sentence #100000, processed 2806444 words, keeping 228455 word types
2025-03-16 00:19:01,541 : INFO : PROGRESS: at sentence #110000, processed 3147452 words, keeping 248548 word types
2025-03-16 00:19:01,583 : INFO : PROGRESS: at sentence #120000, processed 3439093 words, keeping 262130 word types
2025-03-16 00:19:01,647 : INFO : PROGRESS: at sentence #130000, processed 3769920 wor

Model saved as word2vec_cbow_100d.model
Spearman's correlation for dimension 100: 0.6663

Training Word2Vec model with embedding dimension: 300


2025-03-16 00:23:10,680 : INFO : PROGRESS: at sentence #40000, processed 1163286 words, keeping 115624 word types
2025-03-16 00:23:10,739 : INFO : PROGRESS: at sentence #50000, processed 1479668 words, keeping 139099 word types
2025-03-16 00:23:10,774 : INFO : PROGRESS: at sentence #60000, processed 1742833 words, keeping 156163 word types
2025-03-16 00:23:10,814 : INFO : PROGRESS: at sentence #70000, processed 2006401 words, keeping 174614 word types
2025-03-16 00:23:10,864 : INFO : PROGRESS: at sentence #80000, processed 2293153 words, keeping 195018 word types
2025-03-16 00:23:10,912 : INFO : PROGRESS: at sentence #90000, processed 2546917 words, keeping 211008 word types
2025-03-16 00:23:10,951 : INFO : PROGRESS: at sentence #100000, processed 2806444 words, keeping 228455 word types
2025-03-16 00:23:11,007 : INFO : PROGRESS: at sentence #110000, processed 3147452 words, keeping 248548 word types
2025-03-16 00:23:11,054 : INFO : PROGRESS: at sentence #120000, processed 3439093 word

Model saved as word2vec_cbow_300d.model
Spearman's correlation for dimension 300: 0.6842

Training Word2Vec model with embedding dimension: 500


2025-03-16 00:30:27,991 : INFO : PROGRESS: at sentence #50000, processed 1479668 words, keeping 139099 word types
2025-03-16 00:30:28,024 : INFO : PROGRESS: at sentence #60000, processed 1742833 words, keeping 156163 word types
2025-03-16 00:30:28,067 : INFO : PROGRESS: at sentence #70000, processed 2006401 words, keeping 174614 word types
2025-03-16 00:30:28,119 : INFO : PROGRESS: at sentence #80000, processed 2293153 words, keeping 195018 word types
2025-03-16 00:30:28,160 : INFO : PROGRESS: at sentence #90000, processed 2546917 words, keeping 211008 word types
2025-03-16 00:30:28,206 : INFO : PROGRESS: at sentence #100000, processed 2806444 words, keeping 228455 word types
2025-03-16 00:30:28,259 : INFO : PROGRESS: at sentence #110000, processed 3147452 words, keeping 248548 word types
2025-03-16 00:30:28,302 : INFO : PROGRESS: at sentence #120000, processed 3439093 words, keeping 262130 word types
2025-03-16 00:30:28,351 : INFO : PROGRESS: at sentence #130000, processed 3769920 wor

Model saved as word2vec_cbow_500d.model
Spearman's correlation for dimension 500: 0.6863

Summary of Spearman's correlations:
Dimension 100: 0.6663
Dimension 300: 0.6842
Dimension 500: 0.6863


## Improvement

In [2]:
## External knowledge

import os
import string
import logging
from gensim.models import Word2Vec
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import spearmanr
import numpy as np
import re

# Configure the root logger at INFO level with a "time : level : message"
# format so that gensim's INFO messages are shown while the cell runs.
# NOTE: logging.basicConfig does nothing when the root logger already has
# handlers, so this call only takes effect if logging has not been configured
# earlier in the session.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def get_synonyms(word):
    """
    Collect the WordNet lemma names found across all synsets of ``word``.

    Lemma names are lowercased and their underscores are turned into spaces.
    The result may contain ``word`` itself and multi-word expressions.

    Parameters:
    - word: The word to look up in WordNet.

    Returns:
    - A set of lemma-name strings (empty when WordNet has no synset).
    """
    return {
        lemma.name().lower().replace('_', ' ')
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }

def retrofit_embeddings(model, alpha=0.5, iterations=10, synonym_fn=None):
    """
    Retrofit the embeddings using synonyms from a lexicon (WordNet by default).

    For each vocabulary word that has at least one in-vocabulary synonym, the
    vector is repeatedly replaced by a blend of its ORIGINAL vector and the
    mean of the current (already retrofitted) vectors of its synonyms:

        new[word] = alpha * original[word] + (1 - alpha) * mean(new[synonyms])

    Words are visited in vocabulary order and updates are applied in place
    within a sweep, so later words see the updated vectors of earlier ones.
    Words without an in-vocabulary synonym keep their vector.

    The synonym list of a word does not depend on the iteration, so it is
    looked up once up front instead of once per word per iteration.

    Parameters:
    - model: The pre-trained Word2Vec model. It is modified in place.
    - alpha: Weight for the original embedding (0 < alpha < 1).
             Lower values pull the embedding closer to its synonyms.
    - iterations: Number of retrofitting sweeps over the vocabulary.
    - synonym_fn: Optional callable mapping a word to an iterable of
                  synonym strings. Defaults to the module-level
                  ``get_synonyms`` (WordNet).

    Returns:
    - The same ``model`` object, with retrofitted vectors.
    """
    if iterations < 1:
        # No sweep requested: the vectors would be written back unchanged.
        return model

    # Resolved lazily so the WordNet helper is only needed when no custom
    # lexicon is supplied.
    lookup = get_synonyms if synonym_fn is None else synonym_fn

    vocabulary = model.wv.index_to_key
    # Working copy; model.wv keeps the original vectors until the very end.
    new_embeddings = {word: model.wv[word].copy() for word in vocabulary}

    # Iteration-invariant neighbour lists: in-vocabulary synonyms, excluding
    # the word itself. Insertion order follows the vocabulary order, which
    # keeps the update order of the sweeps unchanged.
    neighbours = {}
    for word in vocabulary:
        valid_synonyms = [w for w in lookup(word) if w in new_embeddings and w != word]
        if valid_synonyms:
            neighbours[word] = valid_synonyms

    for _ in range(iterations):
        for word, valid_synonyms in neighbours.items():
            syn_vector = np.mean([new_embeddings[w] for w in valid_synonyms], axis=0)
            new_embeddings[word] = alpha * model.wv[word] + (1 - alpha) * syn_vector

    # Only words with neighbours can have changed; skip the rest.
    for word in neighbours:
        model.wv[word] = new_embeddings[word]

    return model

def load_wordsim353(file_path):
    """
    Read WordSim353 word pairs and their human similarity scores.

    The file is expected to be tab-separated with word1, word2 and the score
    in the first three columns. Empty lines, lines containing "word1" in any
    casing (the header), lines with fewer than three columns and lines whose
    score is not numeric are skipped.

    Parameters:
    - file_path: Path to the UTF-8 encoded WordSim353 file.

    Returns:
    - (word_pairs, gold_scores): lowercased (word1, word2) tuples and the
      matching list of float scores.
    """
    word_pairs, gold_scores = [], []

    def is_ignorable(text):
        return text.strip() == "" or "word1" in text.lower()

    with open(file_path, 'r', encoding='utf-8') as source:
        data_lines = [text for text in source if not is_ignorable(text)]

    for text in data_lines:
        columns = text.strip().split('\t')
        if len(columns) < 3:
            continue
        first, second, raw_score = columns[:3]
        try:
            numeric_score = float(raw_score)
        except ValueError:
            # Malformed score: drop the row.
            continue
        word_pairs.append((first.lower(), second.lower()))
        gold_scores.append(numeric_score)

    return word_pairs, gold_scores

def evaluate_model(model, word_pairs, gold_scores):
    """
    Compute Spearman's correlation between model and human similarities.

    A pair contributes only when both of its words are keys of
    ``model.wv.key_to_index``; its model score is ``model.wv.similarity``.

    Parameters:
    - model: Object exposing ``wv.key_to_index`` and ``wv.similarity``.
    - word_pairs: Iterable of (word1, word2) tuples.
    - gold_scores: Human scores, parallel to ``word_pairs``.

    Returns:
    - The Spearman correlation coefficient, or None if no pair could be
      scored.
    """
    def in_vocabulary(token):
        return token in model.wv.key_to_index

    predicted, reference = [], []
    for pair, human_score in zip(word_pairs, gold_scores):
        left, right = pair
        if not (in_vocabulary(left) and in_vocabulary(right)):
            continue
        predicted.append(model.wv.similarity(left, right))
        reference.append(human_score)

    if not predicted:
        return None

    rho, _pvalue = spearmanr(reference, predicted)
    return rho

def main():
    """
    Compare saved CBOW models before and after WordNet retrofitting.

    For each embedding dimension (100, 300, 500) the model stored as
    ``word2vec_cbow_<dim>d.model`` is loaded, evaluated on WordSim353,
    retrofitted, saved as ``word2vec_cbow_<dim>d_retrofitted.model`` and
    evaluated again. Progress and a summary are printed to stdout; in the
    summary a missing correlation is shown as 0.0000.
    """
    wordsim_file = "wordsim353.txt"  # resolved relative to the working directory
    word_pairs, gold_scores = load_wordsim353(wordsim_file)

    embedding_dims = [100, 300, 500]
    base_evaluation, retro_evaluation = {}, {}

    for dim in embedding_dims:
        base_model_filename = f"word2vec_cbow_{dim}d.model"
        retro_model_filename = f"word2vec_cbow_{dim}d_retrofitted.model"

        print(f"\nProcessing {dim}-dimensional model...")
        print(f"Loading base model from {base_model_filename} ...")
        model = Word2Vec.load(base_model_filename)

        # Score the untouched model first; retrofitting receives this same object.
        base_corr = evaluate_model(model, word_pairs, gold_scores)
        if base_corr is None:
            print("No valid word pairs found for evaluation in the base model.")
        else:
            print(f"Spearman's correlation (base): {base_corr:.4f}")
            base_evaluation[dim] = base_corr

        print("Retrofitting the model using WordNet synonyms...")
        retro_model = retrofit_embeddings(model, alpha=0.5, iterations=10)

        retro_model.save(retro_model_filename)
        print(f"Retrofitted model saved as {retro_model_filename}")

        retro_corr = evaluate_model(retro_model, word_pairs, gold_scores)
        if retro_corr is None:
            print("No valid word pairs found for evaluation in the retrofitted model.")
        else:
            print(f"Spearman's correlation (retrofitted): {retro_corr:.4f}")
            retro_evaluation[dim] = retro_corr

    print("\nSummary of Spearman's correlations:")
    for dim in embedding_dims:
        print(f"Dimension {dim}: Base = {base_evaluation.get(dim, 0):.4f} | Retrofitted = {retro_evaluation.get(dim, 0):.4f}")


if __name__ == "__main__":
    main()


2025-03-16 01:05:38,341 : INFO : loading Word2Vec object from word2vec_cbow_100d.model



Processing 100-dimensional model...
Loading base model from word2vec_cbow_100d.model ...


2025-03-16 01:05:38,868 : INFO : loading wv recursively from word2vec_cbow_100d.model.wv.* with mmap=None
2025-03-16 01:05:38,869 : INFO : loading vectors from word2vec_cbow_100d.model.wv.vectors.npy with mmap=None
2025-03-16 01:05:39,062 : INFO : loading syn1neg from word2vec_cbow_100d.model.syn1neg.npy with mmap=None
2025-03-16 01:05:39,169 : INFO : setting ignored attribute cum_table to None
2025-03-16 01:05:43,780 : INFO : Word2Vec lifecycle event {'fname': 'word2vec_cbow_100d.model', 'datetime': '2025-03-16T01:05:43.779960', 'gensim': '4.3.3', 'python': '3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]', 'platform': 'Linux-6.11.0-19-generic-x86_64-with-glibc2.39', 'event': 'loaded'}


Spearman's correlation (base): 0.6663
Retrofitting the model using WordNet synonyms...


KeyboardInterrupt: 

# Pretrained Model Embedding generation
