In [1]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.get_device_name(0))  # Should print NVIDIA RTX 4060


False


  return torch._C._cuda_getDeviceCount() > 0


RuntimeError: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 803: system has unsupported display driver / cuda driver combination

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import brown
from collections import Counter
import numpy as np
import random


In [18]:
# Download the Brown corpus if not already downloaded
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /home/mohak/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [19]:
# Preprocessing
sentences = brown.sents()[:]  # Use only the first 1000 sentences for quick testing
corpus = [word.lower() for sentence in sentences for word in sentence]
vocab = set(corpus)
vocab_size = len(vocab)

In [20]:
# Create word to index and index to word mappings
word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

In [21]:
# Generate training data (CBOW model)
def generate_cbow_data(corpus, window_size=2):
    data = []
    for idx, word in enumerate(corpus):
        if idx < window_size or idx >= len(corpus) - window_size:
            continue
        context = [word_to_index[corpus[i]] for i in range(idx - window_size, idx + window_size + 1) if i != idx]
        target = word_to_index[word]
        data.append((context, target))
    return data

window_size = 2
training_data = generate_cbow_data(corpus, window_size)

In [22]:
# Define the Word2Vec model
class Word2VecCBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecCBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        embedded = self.embeddings(context).mean(dim=1)
        output = self.output(embedded)
        return output

In [23]:
# Hyperparameters
embedding_dim = 50  # Reduced for faster training
epochs = 2         # Fewer epochs for quick testing
batch_size = 64    # Smaller batch size for quicker iterations
learning_rate = 0.01

In [24]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [25]:
# Create model, loss function, and optimizer
model = Word2VecCBOW(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [26]:
# Prepare batches for training
def get_batches(training_data, batch_size):
    random.shuffle(training_data)
    for i in range(0, len(training_data), batch_size):
        batch = training_data[i:i + batch_size]
        contexts, targets = zip(*batch)
        yield torch.tensor(contexts, dtype=torch.long), torch.tensor(targets, dtype=torch.long)


In [27]:
# Training loop
for epoch in range(epochs):
    total_loss = 0
    for contexts, targets in get_batches(training_data, batch_size):
        contexts, targets = contexts.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(contexts)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 121084.1597
Epoch 2, Loss: 108997.8549


In [28]:
# Save the word embeddings
word_embeddings = model.embeddings.weight.detach().cpu().numpy()

In [29]:
# Example usage: find similar words
def find_similar_words(word, word_embeddings, top_n=5):
    if word not in word_to_index:
        return []
    idx = word_to_index[word]
    word_vec = word_embeddings[idx]
    similarities = np.dot(word_embeddings, word_vec) / (
        np.linalg.norm(word_embeddings, axis=1) * np.linalg.norm(word_vec))
    similar_indices = similarities.argsort()[-top_n - 1:-1][::-1]
    return [index_to_word[i] for i in similar_indices if i != idx]

print(find_similar_words("government", word_embeddings))


['cabinetmakers', 'misrepresentations', 'directors', 'courts', 'pessimism']
