<a href="https://colab.research.google.com/github/MohakSomani/PreCog-Task-NLP---Words-Sentence-Phrase-Similarity/blob/main/task_a_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# import torch
# print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0))


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import brown
from collections import Counter
import numpy as np
import random


In [4]:
# Fetch the Brown corpus data files so that the `brown` reader imported in the
# imports cell has something to read when `brown.sents()` is called below.
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [5]:
# Preprocessing
# NOTE: `[:]` takes EVERY sentence of the Brown corpus, not a subset. Slice with
# e.g. `[:1000]` instead if a quick smoke test on fewer sentences is wanted.
sentences = brown.sents()[:]  # full corpus, no sentence limit applied
corpus = [word.lower() for sentence in sentences for word in sentence]  # flat, lower-cased token stream
vocab = set(corpus)  # unique lower-cased tokens
vocab_size = len(vocab)  # number of distinct tokens; sizes the embedding and output layers

In [6]:
# Create word to index and index to word mappings.
# `vocab` is a set, and the iteration order of a set of strings changes between
# interpreter sessions (string hash randomisation). Enumerating it in sorted
# order gives every word the same index - and therefore the same embedding
# row - on every run, so indices stay valid across kernel restarts.
word_to_index = {word: idx for idx, word in enumerate(sorted(vocab))}
index_to_word = {idx: word for word, idx in word_to_index.items()}

In [7]:
# Generate training data (CBOW model)
def generate_cbow_data(corpus, window_size=2, vocab_index=None):
    """Build (context, target) index pairs for CBOW training.

    Every token that has ``window_size`` neighbours on both sides becomes a
    target; its context is the ``2 * window_size`` surrounding tokens in corpus
    order (left neighbours first, then right neighbours). Tokens too close to
    either end of the corpus are skipped.

    Args:
        corpus (list[str]): Token sequence, in reading order.
        window_size (int): Number of context tokens taken on each side.
        vocab_index (dict | None): Mapping from token to integer index. When
            omitted, the notebook-level ``word_to_index`` mapping is used.

    Returns:
        list[tuple[list[int], int]]: One ``(context_indices, target_index)``
        pair per usable position; empty when the corpus is shorter than
        ``2 * window_size + 1`` tokens.

    Raises:
        ValueError: If ``window_size`` is negative.
        KeyError: If a token is missing from the mapping.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")
    if vocab_index is None:
        vocab_index = word_to_index

    # Too short to hold even one full window: nothing to emit (and nothing to
    # look up, so unknown tokens in such a corpus do not raise).
    if len(corpus) < 2 * window_size + 1:
        return []

    # Translate each token once instead of once per window it appears in.
    encoded = [vocab_index[token] for token in corpus]

    data = []
    for center in range(window_size, len(encoded) - window_size):
        left = encoded[center - window_size:center]
        right = encoded[center + 1:center + window_size + 1]
        data.append((left + right, encoded[center]))
    return data

# Two context tokens on each side of the target, i.e. four context words per example.
window_size = 2
training_data = generate_cbow_data(corpus, window_size)  # list of (context_indices, target_index) pairs

In [8]:
# Continuous bag-of-words (CBOW) Word2Vec network
class Word2VecCBOW(nn.Module):
    """CBOW model: average the context embeddings, then score every vocabulary word."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size (int): Number of rows in the embedding table and number
                of scores produced by the output layer.
            embedding_dim (int): Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.output = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, context):
        """Return unnormalised vocabulary scores for each context.

        Args:
            context (torch.Tensor): Integer word indices. The embeddings are
                averaged over dimension 1, which is the context-position axis
                for the (batch, context) tensors yielded by ``get_batches``.

        Returns:
            torch.Tensor: Output of the linear layer applied to the averaged
            context embedding (raw scores, no softmax applied).
        """
        context_vectors = self.embeddings(context)
        pooled = torch.mean(context_vectors, dim=1)
        return self.output(pooled)

In [9]:
# Hyperparameters
embedding_dim = 50    # width of each word vector
epochs = 10           # full passes over the training pairs
batch_size = 64       # training pairs per optimiser step
learning_rate = 0.01  # step size handed to the optimiser

# Seed every random number generator the notebook relies on (batch shuffling
# uses `random`, weight initialisation uses `torch`) so that a fresh
# "Restart & Run All" starts from the same random state each time.
# NOTE(review): some GPU kernels can still introduce small run-to-run
# differences even with fixed seeds.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [10]:
# Check if GPU is available
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
# The model and every training batch are moved to this device later on.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [11]:
# Create model, loss function, and optimizer
model = Word2VecCBOW(vocab_size, embedding_dim).to(device)  # parameters are placed on `device`
criterion = nn.CrossEntropyLoss()  # compares the model's raw vocabulary scores with target indices
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # optimises every parameter of the model

In [12]:
# Prepare batches for training
def get_batches(training_data, batch_size):
    """Yield shuffled mini-batches of training pairs as ``torch.long`` tensors.

    The pairs are shuffled on a shallow copy, so the caller's list keeps its
    original order and can be reused or inspected after iteration.

    Args:
        training_data (list[tuple[list[int], int]]): ``(context_indices,
            target_index)`` pairs; all contexts must have the same length so
            they can be stacked into one tensor.
        batch_size (int): Maximum number of pairs per batch; the final batch
            may be smaller.

    Yields:
        tuple[torch.Tensor, torch.Tensor]: ``(contexts, targets)`` where
        ``contexts`` holds one row per pair and ``targets`` one entry per pair.
        Nothing is yielded for empty ``training_data``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Copy before shuffling: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(training_data)
    random.shuffle(shuffled)

    for start in range(0, len(shuffled), batch_size):
        contexts, targets = zip(*shuffled[start:start + batch_size])
        yield torch.tensor(contexts, dtype=torch.long), torch.tensor(targets, dtype=torch.long)


In [13]:
# Training loop
for epoch in range(epochs):
    total_loss = 0  # running sum of the per-batch loss values for this epoch
    for contexts, targets in get_batches(training_data, batch_size):
        # get_batches builds its tensors without a device argument, so move them to the model's device.
        contexts, targets = contexts.to(device), targets.to(device)
        optimizer.zero_grad()  # clear gradients accumulated by the previous step
        outputs = model(contexts)  # raw vocabulary scores for each example in the batch
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()  # .item() turns the loss tensor into a plain Python number
    # NOTE: the value printed is the SUM of the batch losses, not an average, so
    # its magnitude grows with the number of batches per epoch.
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 121068.8386
Epoch 2, Loss: 109024.3246
Epoch 3, Loss: 104458.1656
Epoch 4, Loss: 101682.8668
Epoch 5, Loss: 99866.2830
Epoch 6, Loss: 98606.1743
Epoch 7, Loss: 97735.5712
Epoch 8, Loss: 97029.2780
Epoch 9, Loss: 96555.7973
Epoch 10, Loss: 96208.1430


In [14]:
# Extract the trained embedding matrix as a NumPy array on the CPU, detached
# from autograd. Row i is the vector of the word whose index is i.
# Nothing is written to disk here; the array only lives in kernel memory.
word_embeddings = model.embeddings.weight.detach().cpu().numpy()

In [15]:
# # Example usage: find similar words
# def find_similar_words(word, word_embeddings, top_n=5):
#     if word not in word_to_index:
#         return []
#     idx = word_to_index[word]
#     word_vec = word_embeddings[idx]
#     similarities = np.dot(word_embeddings, word_vec) / (
#         np.linalg.norm(word_embeddings, axis=1) * np.linalg.norm(word_vec))
#     similar_indices = similarities.argsort()[-top_n - 1:-1][::-1]
#     return [index_to_word[i] for i in similar_indices if i != idx]

# print(find_similar_words("government", word_embeddings))


In [18]:

def get_similarity_score(word1, word2, word_embeddings, word_to_index):
    """
    Calculates the cosine similarity score between two words using word embeddings.

    Lookups are exact key matches against ``word_to_index``, so the caller must
    pass words in the same form (for example the same casing) as the mapping.

    Args:
        word1 (str): The first word.
        word2 (str): The second word.
        word_embeddings (np.ndarray): The word embeddings matrix, one row per index.
        word_to_index (dict): A dictionary mapping words to their row indices in
            the embeddings matrix.

    Returns:
        float: The cosine similarity between the two word vectors. Returns 0.0
        when either word is missing from ``word_to_index`` or when either
        vector has zero length (cosine similarity is undefined there).
    """
    if word1 not in word_to_index or word2 not in word_to_index:
        return 0.0  # Return 0 if either word is not in the vocabulary

    word1_vec = word_embeddings[word_to_index[word1]]
    word2_vec = word_embeddings[word_to_index[word2]]

    norm_product = np.linalg.norm(word1_vec) * np.linalg.norm(word2_vec)
    if norm_product == 0:
        # An all-zero vector would otherwise cause a division by zero and yield NaN.
        return 0.0

    # Convert the NumPy scalar to a built-in float, as the docstring promises.
    return float(np.dot(word1_vec, word2_vec) / norm_product)

# Example usage:
# The words are given in lower case because the vocabulary was built from the
# lower-cased corpus; a word that is not in the vocabulary scores 0.0.
word1 = "king"
word2 = "queen"
similarity_score = get_similarity_score(word1, word2, word_embeddings, word_to_index)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score:.4f}")

Similarity between 'king' and 'queen': 0.3180
