<a href="https://colab.research.google.com/github/Mananpatel25/nlp-assignments/blob/main/NLP_2_1_AND_2_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install wget
from gensim.models import KeyedVectors
import numpy as np
from scipy.spatial.distance import cosine
from collections import defaultdict
import os
import gzip
import wget

def download_embeddings(max_fasttext_vectors=200000):
    """Fetch and unpack the LexVec and FastText vector files if they are missing.

    Files are created in the current working directory:

    * ``lexvec.txt`` - decompressed from ``lexvec.txt.gz``
    * ``fasttext-cc-reduced.vec`` - the first ``max_fasttext_vectors`` entries
      of ``cc.en.300.vec.gz``, with the header rewritten to match

    An archive is only downloaded when neither it nor its unpacked output is
    already on disk.

    Args:
        max_fasttext_vectors: Upper bound on the number of FastText vectors
            copied into the reduced file (keeps loading lightweight).
    """
    # Local imports so this function does not rely on names beyond the
    # file's existing top-level imports.
    import shutil
    from itertools import islice

    print("Downloading embedding models if not present...")

    lexvec_url = "https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz?dl=1"
    if not os.path.exists('lexvec.txt'):
        if not os.path.exists('lexvec.txt.gz'):
            print("Downloading LexVec...")
            wget.download(lexvec_url, 'lexvec.txt.gz')
        # Stream the decompression in chunks instead of reading the whole
        # archive into memory, and write to a temporary name first so an
        # interrupted run cannot leave a truncated 'lexvec.txt' that later
        # passes the existence check above.
        lexvec_partial = 'lexvec.txt.part'
        with gzip.open('lexvec.txt.gz', 'rb') as f_in, \
                open(lexvec_partial, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(lexvec_partial, 'lexvec.txt')

    # FastText reduced
    fasttext_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"
    if not os.path.exists('fasttext-cc-reduced.vec'):
        if not os.path.exists('cc.en.300.vec.gz'):
            print("Downloading FastText CC reduced...")
            wget.download(fasttext_url, 'cc.en.300.vec.gz')
        fasttext_partial = 'fasttext-cc-reduced.vec.part'
        with gzip.open('cc.en.300.vec.gz', 'rt', encoding='utf-8') as f_in, \
                open(fasttext_partial, 'w', encoding='utf-8') as f_out:
            header = next(f_in)
            vocab_size, dim = map(int, header.split())
            # The header must state the number of vectors actually written;
            # the source may hold fewer than the requested cap.
            kept = min(vocab_size, max_fasttext_vectors)
            f_out.write(f"{kept} {dim}\n")
            f_out.writelines(islice(f_in, kept))
        os.replace(fasttext_partial, 'fasttext-cc-reduced.vec')

def load_embeddings():
    """Return the ``(lexvec, fasttext)`` models, downloading files on demand.

    Both vector files are read in the word2vec text format from the current
    working directory.
    """
    print("Loading embedding models...")

    # Make sure both vector files exist on disk before trying to read them
    download_embeddings()

    # LexVec first, then the reduced FastText file - same order as returned
    vector_files = ('lexvec.txt', 'fasttext-cc-reduced.vec')
    lexvec, fasttext = (
        KeyedVectors.load_word2vec_format(path, binary=False)
        for path in vector_files
    )
    return lexvec, fasttext

def cosine_similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Computed as one minus SciPy's cosine *distance*.
    """
    distance = cosine(v1, v2)
    return 1 - distance

def solve_analogy(a, b, c, vocab, embeddings):
    """Solve the analogy ``a : b :: c : x`` (Mikolov et al. 2013 style).

    The ranking is delegated to ``embeddings.most_similar`` with ``b`` and
    ``c`` as positive terms and ``a`` as the negative term.

    Args:
        a, b, c: The three known words of the analogy.
        vocab: Unused; kept so existing callers keep working.
        embeddings: Object exposing a gensim-style
            ``most_similar(positive=..., negative=..., topn=...)`` method that
            returns ``(word, score)`` pairs and raises ``KeyError`` for
            unknown words.

    Returns:
        ``(word, score)`` for the best candidate that is not one of the input
        words, or ``(None, 0.0)`` when a word is out of vocabulary or no
        candidate qualifies.
    """
    try:
        # Keep the try body minimal: only the lookup can raise KeyError.
        candidates = embeddings.most_similar(positive=[b, c], negative=[a], topn=5)
    except KeyError:
        return None, 0.0

    excluded = {a, b, c}
    for word, score in candidates:
        if word not in excluded:
            return word, score

    # Every candidate was an input word (or there were none): report a miss
    # with a neutral score rather than the score of an unrelated candidate.
    return None, 0.0

def evaluate_analogies(filename, embeddings, embedding_name, *,
                       categories=None, lowercase=False):
    """Score an embedding model on an analogy question file.

    The file is expected to hold section lines of the form ``: name`` followed
    by four-word question lines ``a b c expected``. Lines that do not contain
    exactly four whitespace-separated words are skipped. A question whose
    prediction is ``None`` (for example an out-of-vocabulary word) counts as
    incorrect.

    Args:
        filename: Path to the UTF-8 analogy file.
        embeddings: Model passed on to ``solve_analogy``; its ``key_to_index``
            attribute is read.
        embedding_name: Label used in the printed report.
        categories: Iterable of section names to evaluate. ``None`` selects
            the default eight sections.
        lowercase: When true, lowercase all four words of each question
            before solving and comparing. Intended for models whose
            vocabulary is lowercased.

    Returns:
        Mapping of section name to ``{'correct': int, 'total': int}``.
    """
    default_categories = (
        'capital-world', 'currency', 'city-in-state', 'family',
        'gram1-adjective-to-adverb', 'gram2-opposite',
        'gram3-comparative', 'gram6-nationality-adjective',
    )
    # Built once, outside the per-line loop, with O(1) membership tests.
    selected = frozenset(default_categories if categories is None else categories)

    results = defaultdict(lambda: {'correct': 0, 'total': 0})
    current_category = None

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith(':'):
                current_category = line.strip()[2:]  # Remove ': ' prefix
                continue

            if current_category not in selected:
                continue

            words = line.strip().split()
            if len(words) != 4:
                continue

            if lowercase:
                words = [word.lower() for word in words]

            a, b, c, expected = words
            predicted, _ = solve_analogy(a, b, c, embeddings.key_to_index, embeddings)

            tally = results[current_category]
            if predicted == expected:
                tally['correct'] += 1
            tally['total'] += 1

    # Calculate accuracies
    print(f"\nResults for {embedding_name}:")
    total_correct = 0
    total_questions = 0

    for category, tally in results.items():
        correct = tally['correct']
        total = tally['total']
        accuracy = (correct / total * 100) if total > 0 else 0
        print(f"{category}: {accuracy:.2f}% ({correct}/{total})")
        total_correct += correct
        total_questions += total

    overall_accuracy = (total_correct / total_questions * 100) if total_questions > 0 else 0
    print(f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_questions})")
    return results

def custom_analogy_test(embeddings, embedding_name):
    """Run six hand-written analogies and report how many are solved.

    Prints one line per question plus a summary, and returns the accuracy as
    a percentage.
    """
    # Type 1: Emotional States (testing conceptual understanding)
    emotion_cases = [
        ['happy', 'smile', 'angry', 'frown'],
        ['joy', 'laugh', 'sorrow', 'cry'],
        ['love', 'hug', 'hate', 'fight'],
    ]
    # Type 2: Natural Phenomena (testing scientific relationships)
    nature_cases = [
        ['cloud', 'rain', 'volcano', 'lava'],
        ['wind', 'blow', 'water', 'flow'],
        ['seed', 'plant', 'egg', 'bird'],
    ]
    cases = emotion_cases + nature_cases

    print(f"\nCustom Analogy Results for {embedding_name}:")

    hits = 0
    for a, b, c, expected in cases:
        predicted, _score = solve_analogy(a, b, c, embeddings.key_to_index, embeddings)
        matched = predicted == expected
        if matched:
            hits += 1
        print(f"{a}:{b} :: {c}:{expected} -> Predicted: {predicted} (Correct: {matched})")

    question_count = len(cases)
    accuracy = hits / question_count * 100
    print(f"Custom analogy accuracy: {accuracy:.2f}% ({hits}/{question_count})")
    return accuracy

def main():
    """Load both models, then run the standard and the custom evaluations."""
    lexvec, fasttext = load_embeddings()

    # (model, report label) pairs, evaluated in this order in both phases
    models = (
        (lexvec, 'LexVec'),
        (fasttext, 'FastText CC reduced'),
    )

    # Standard benchmark questions
    print("\nEvaluating standard analogies...")
    for model, label in models:
        evaluate_analogies('word-test.v1.txt', model, label)

    # Hand-written questions
    print("\nEvaluating custom analogies...")
    for model, label in models:
        custom_analogy_test(model, label)


if __name__ == "__main__":
    main()

Loading embedding models...
Downloading embedding models if not present...
Downloading LexVec...

Evaluating standard analogies...

Results for LexVec:
capital-world: 0.00% (0/4524)
currency: 0.00% (0/866)
city-in-state: 0.00% (0/2467)
family: 87.75% (444/506)
gram1-adjective-to-adverb: 24.90% (247/992)
gram2-opposite: 36.58% (297/812)
gram3-comparative: 87.31% (1163/1332)
gram6-nationality-adjective: 0.00% (0/1599)
Overall accuracy: 16.42% (2151/13098)

Results for FastText CC reduced:
capital-world: 95.03% (4299/4524)
currency: 15.94% (138/866)
city-in-state: 82.53% (2036/2467)
family: 91.11% (461/506)
gram1-adjective-to-adverb: 34.07% (338/992)
gram2-opposite: 47.17% (383/812)
gram3-comparative: 92.87% (1237/1332)
gram6-nationality-adjective: 91.31% (1460/1599)
Overall accuracy: 79.03% (10352/13098)

Evaluating custom analogies...

Custom Analogy Results for LexVec:
happy:smile :: angry:frown -> Predicted: smirk (Correct: False)
joy:laugh :: sorrow:cry -> Predicted: chuckle (Correct