In [42]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
NVIDIA GeForce RTX 4060 Laptop GPU


In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import brown
from collections import Counter
import numpy as np
import random


In [44]:
# Download the Brown corpus if not already downloaded
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /home/mohak/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [45]:
# Preprocessing
sentences = brown.sents()[:]  # Use only the first 1000 sentences for quick testing
corpus = [word.lower() for sentence in sentences for word in sentence]
vocab = set(corpus)
vocab_size = len(vocab)

In [46]:
# Create word to index and index to word mappings
word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

In [47]:
# Generate training data (CBOW model)
def generate_cbow_data(corpus, window_size=2):
    data = []
    for idx, word in enumerate(corpus):
        if idx < window_size or idx >= len(corpus) - window_size:
            continue
        context = [word_to_index[corpus[i]] for i in range(idx - window_size, idx + window_size + 1) if i != idx]
        target = word_to_index[word]
        data.append((context, target))
    return data

window_size = 2
training_data = generate_cbow_data(corpus, window_size)

In [48]:
# Define the Word2Vec model
class Word2VecCBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecCBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        embedded = self.embeddings(context).mean(dim=1)
        output = self.output(embedded)
        return output

In [49]:
# Hyperparameters
embedding_dim = 100
epochs = 10
batch_size = 256
learning_rate = 0.001

In [50]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [51]:
# Create model, loss function, and optimizer
model = Word2VecCBOW(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [52]:
# Prepare batches for training
def get_batches(training_data, batch_size):
    random.shuffle(training_data)
    for i in range(0, len(training_data), batch_size):
        batch = training_data[i:i + batch_size]
        contexts, targets = zip(*batch)
        yield torch.tensor(contexts, dtype=torch.long), torch.tensor(targets, dtype=torch.long)


In [53]:
# Training loop
for epoch in range(epochs):
    total_loss = 0
    for contexts, targets in get_batches(training_data, batch_size):
        contexts, targets = contexts.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(contexts)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 31765.8421
Epoch 2, Loss: 27800.9517
Epoch 3, Loss: 26427.7473
Epoch 4, Loss: 25440.8100
Epoch 5, Loss: 24649.0425
Epoch 6, Loss: 23978.3590
Epoch 7, Loss: 23394.0538
Epoch 8, Loss: 22873.5845
Epoch 9, Loss: 22402.0419
Epoch 10, Loss: 21972.7009


In [54]:
# Save the word embeddings
word_embeddings = model.embeddings.weight.detach().cpu().numpy()

In [55]:
# # Example usage: find similar words
# def find_similar_words(word, word_embeddings, top_n=5):
#     if word not in word_to_index:
#         return []
#     idx = word_to_index[word]
#     word_vec = word_embeddings[idx]
#     similarities = np.dot(word_embeddings, word_vec) / (
#         np.linalg.norm(word_embeddings, axis=1) * np.linalg.norm(word_vec))
#     similar_indices = similarities.argsort()[-top_n - 1:-1][::-1]
#     return [index_to_word[i] for i in similar_indices if i != idx]

# print(find_similar_words("government", word_embeddings))


In [None]:

def get_similarity_score(word1, word2, word_embeddings, word_to_index):
    """
    Calculates the cosine similarity score between two words using word embeddings.

    Args:
        word1 (str): The first word.
        word2 (str): The second word.
        word_embeddings (np.ndarray): The word embeddings matrix.
        word_to_index (dict): A dictionary mapping words to their indices in the embeddings matrix.

    Returns:
        float: The cosine similarity score between the two words.
    """
    if word1 not in word_to_index or word2 not in word_to_index:
        return 0.0  # Return 0 if either word is not in the vocabulary

    word1_index = word_to_index[word1]
    word2_index = word_to_index[word2]

    word1_vec = word_embeddings[word1_index]
    word2_vec = word_embeddings[word2_index]

    #Cosine Similarity of two embeddings
    similarity = np.dot(word1_vec, word2_vec) / (np.linalg.norm(word1_vec) * np.linalg.norm(word2_vec))

    return similarity*10 # since Simlex Dataset has scores scaled across 0 to 10

# Example usage:
word1 = "king"
word2 = "queen"
similarity_score = get_similarity_score(word1, word2, word_embeddings, word_to_index)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score:.4f}")

Similarity between 'king' and 'queen': 9.3065


In [67]:
# Function to load the test dataset from a tab-separated text file
import pandas as pd
from scipy.stats import pearsonr, spearmanr

def load_txt_dataset(file_path):
    columns = ['word1', 'word2', 'POS', 'SimLex999', 'conc(w1)', 'conc(w2)',
               'concQ', 'Assoc(USF)', 'SimAssoc333', 'SD(SimLex)']
    return pd.read_csv(file_path, delimiter="\t", names=columns, skiprows=1)

# Evaluation function
def evaluate_model_with_txt(file_path, word_embeddings, word_to_index):
    test_data = load_txt_dataset(file_path)

    # Extract word pairs and human similarity scores
    word_pairs = test_data[['word1', 'word2']]
    human_scores = test_data['SimLex999']

    # Compute model-predicted similarity scores
    predicted_scores = []
    for _, row in word_pairs.iterrows():
        similarity = get_similarity_score(row['word1'], row['word2'], word_embeddings, word_to_index)
        predicted_scores.append(similarity)

    # Filter out missing words (None scores)
    valid_indices = [i for i, score in enumerate(predicted_scores) if score is not None]
    human_scores = human_scores.iloc[valid_indices]
    predicted_scores = np.array([predicted_scores[i] for i in valid_indices])
    # print(predicted_scores)
    # print(human_scores)

    # Calculate Pearson and Spearman correlations
    pearson_corr, _ = pearsonr(human_scores, predicted_scores)
    spearman_corr, _ = spearmanr(human_scores, predicted_scores)

    return {
        "Pearson Correlation": pearson_corr,
        "Spearman Correlation": spearman_corr
    }

test_dataset_path = "SimLex-999.txt"

evaluation_results = evaluate_model_with_txt(test_dataset_path, word_embeddings, word_to_index)

print("Evaluation Results:")
print(f"Pearson Correlation: {evaluation_results['Pearson Correlation']:.4f}")
print(f"Spearman Correlation: {evaluation_results['Spearman Correlation']:.4f}")


Evaluation Results:
Pearson Correlation: 0.0752
Spearman Correlation: 0.0802
