In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

In [2]:
# Sample corpus: three short, lowercase sentences without punctuation.
# Every later step in this notebook tokenizes them with a plain whitespace split.
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "the fox is quick and the dog is lazy",
    "the dog and the fox are friends"
]

In [5]:
# Tokenize corpus
# Collect the distinct whitespace-separated tokens, then sort them so that the
# word -> index mapping is the same on every run. Iterating a bare set of
# strings follows hash order, which can differ between interpreter sessions,
# and that would silently reshuffle every index the models are trained on.
words = sorted(set(" ".join(corpus).split()))
word2idx = {word: i for i, word in enumerate(words)}  # token -> integer id
idx2word = {i: word for word, i in word2idx.items()}  # integer id -> token
vocab_size = len(words)

In [8]:
# Display the vocabulary size, i.e. the number of distinct tokens found in the corpus.
vocab_size

12

In [9]:
# Hyperparameters
EMBEDDING_DIM = 10  # length of each learned word vector
CONTEXT_SIZE = 2    # number of words taken on each side of the centre word
EPOCHS = 100        # full passes over the training pairs
LR = 0.01           # learning rate handed to the Adam optimizers

# Seed every random number generator this notebook imports so that model
# weight initialisation is repeatable from one kernel restart to the next.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [10]:
def generate_cbow_data(corpus, context_size=2):
    """Build (context, target) training pairs for a CBOW model.

    Each sentence is split on whitespace. Every token that has at least
    ``context_size`` neighbours on both sides becomes a target; its context is
    the ``context_size`` tokens before it followed by the ``context_size``
    tokens after it. Tokens nearer to either end of a sentence are never used
    as targets.

    Args:
        corpus: iterable of sentence strings.
        context_size: number of neighbours taken on each side of the target.

    Returns:
        A list of ``(context_tokens, target_token)`` tuples in corpus order,
        where ``context_tokens`` is a list of strings.
    """
    pairs = []
    for sentence in corpus:
        tokens = sentence.split()
        last_center = len(tokens) - context_size
        for center in range(context_size, last_center):
            before = tokens[center - context_size:center]
            after = tokens[center + 1:center + context_size + 1]
            pairs.append((before + after, tokens[center]))
    return pairs

In [11]:
# Build the CBOW (context words, target word) pairs from the sample corpus.
# The bare name on the last line makes the notebook display the resulting list.
cbow_data = generate_cbow_data(corpus, CONTEXT_SIZE)
cbow_data

[(['the', 'quick', 'fox', 'jumps'], 'brown'),
 (['quick', 'brown', 'jumps', 'over'], 'fox'),
 (['brown', 'fox', 'over', 'the'], 'jumps'),
 (['fox', 'jumps', 'the', 'lazy'], 'over'),
 (['jumps', 'over', 'lazy', 'dog'], 'the'),
 (['the', 'fox', 'quick', 'and'], 'is'),
 (['fox', 'is', 'and', 'the'], 'quick'),
 (['is', 'quick', 'the', 'dog'], 'and'),
 (['quick', 'and', 'dog', 'is'], 'the'),
 (['and', 'the', 'is', 'lazy'], 'dog'),
 (['the', 'dog', 'the', 'fox'], 'and'),
 (['dog', 'and', 'fox', 'are'], 'the'),
 (['and', 'the', 'are', 'friends'], 'fox')]

In [12]:
def generate_skipgram_data(corpus, context_size=2):
    """Build (target, context) training pairs for a Skip-gram model.

    Each sentence is split on whitespace. Every token that has at least
    ``context_size`` neighbours on both sides is used as a target and is
    paired, one pair per neighbour, with the ``context_size`` tokens before it
    and then the ``context_size`` tokens after it. Tokens nearer to either end
    of a sentence are never used as targets.

    Args:
        corpus: iterable of sentence strings.
        context_size: number of neighbours taken on each side of the target.

    Returns:
        A list of ``(target_token, context_token)`` string tuples in corpus
        order.
    """
    pairs = []
    for sentence in corpus:
        tokens = sentence.split()
        for center in range(context_size, len(tokens) - context_size):
            center_word = tokens[center]
            neighbours = (
                tokens[center - context_size:center]
                + tokens[center + 1:center + context_size + 1]
            )
            pairs.extend((center_word, neighbour) for neighbour in neighbours)
    return pairs

In [14]:
# Build the Skip-gram (target word, context word) pairs from the sample corpus.
# The bare name on the last line makes the notebook display the resulting list.
skipgram_data = generate_skipgram_data(corpus, CONTEXT_SIZE)
skipgram_data

[('brown', 'the'),
 ('brown', 'quick'),
 ('brown', 'fox'),
 ('brown', 'jumps'),
 ('fox', 'quick'),
 ('fox', 'brown'),
 ('fox', 'jumps'),
 ('fox', 'over'),
 ('jumps', 'brown'),
 ('jumps', 'fox'),
 ('jumps', 'over'),
 ('jumps', 'the'),
 ('over', 'fox'),
 ('over', 'jumps'),
 ('over', 'the'),
 ('over', 'lazy'),
 ('the', 'jumps'),
 ('the', 'over'),
 ('the', 'lazy'),
 ('the', 'dog'),
 ('is', 'the'),
 ('is', 'fox'),
 ('is', 'quick'),
 ('is', 'and'),
 ('quick', 'fox'),
 ('quick', 'is'),
 ('quick', 'and'),
 ('quick', 'the'),
 ('and', 'is'),
 ('and', 'quick'),
 ('and', 'the'),
 ('and', 'dog'),
 ('the', 'quick'),
 ('the', 'and'),
 ('the', 'dog'),
 ('the', 'is'),
 ('dog', 'and'),
 ('dog', 'the'),
 ('dog', 'is'),
 ('dog', 'lazy'),
 ('and', 'the'),
 ('and', 'dog'),
 ('and', 'the'),
 ('and', 'fox'),
 ('the', 'dog'),
 ('the', 'and'),
 ('the', 'fox'),
 ('the', 'are'),
 ('fox', 'and'),
 ('fox', 'the'),
 ('fox', 'are'),
 ('fox', 'friends')]

In [15]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Looks up an embedding for every context word, averages those embeddings,
    and maps the average through a linear layer to one score per vocabulary
    entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of rows in the embedding table and number of
                output scores.
            embedding_dim: length of each embedding vector.
        """
        super().__init__()
        # One learnable vector per vocabulary index.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Projects an embedding-sized vector to vocab_size scores.
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_words):
        """Return output scores for a tensor of context-word indices.

        The embeddings are averaged over dimension 1, so the context positions
        must sit on that axis (the training loop passes one example with a
        leading batch dimension of 1).
        """
        averaged = self.embeddings(context_words).mean(dim=1)
        return self.linear(averaged)

In [16]:
# Instantiate the CBOW model together with the loss and optimizer used to train it.
cbow_model = CBOW(vocab_size, EMBEDDING_DIM)
# Cross-entropy is applied to the raw scores returned by the model and a target word index.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cbow_model.parameters(), lr=LR)

In [17]:
# Train the CBOW model with one optimizer step per (context, target) example.
#
# The index tensors are identical in every epoch, so they are built once here
# instead of being re-created for every example on every pass over the data.
cbow_examples = [
    (
        # Nested list gives the context indices a leading batch dimension of 1.
        torch.tensor([[word2idx[word] for word in context]]),
        torch.tensor([word2idx[target]]),
    )
    for context, target in cbow_data
]

for epoch in range(EPOCHS):
    total_loss = 0  # sum of the per-example losses seen in this epoch
    for context_indices, target_index in cbow_examples:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        output = cbow_model(context_indices)
        loss = criterion(output, target_index)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report progress every tenth epoch only, to keep the output short.
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

Epoch 0, Loss: 32.8300
Epoch 10, Loss: 14.1051
Epoch 20, Loss: 5.3889
Epoch 30, Loss: 2.1940
Epoch 40, Loss: 1.0703
Epoch 50, Loss: 0.6160
Epoch 60, Loss: 0.3970
Epoch 70, Loss: 0.2762
Epoch 80, Loss: 0.2027
Epoch 90, Loss: 0.1547


In [18]:
class SkipGram(nn.Module):
    """Skip-gram model.

    Looks up the embedding of the given word index and maps it through a
    linear layer to one score per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of rows in the embedding table and number of
                output scores.
            embedding_dim: length of each embedding vector.
        """
        super().__init__()
        # One learnable vector per vocabulary index.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Projects an embedding-sized vector to vocab_size scores.
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word):
        """Return output scores for a tensor of word indices."""
        return self.linear(self.embeddings(target_word))

In [19]:
# Instantiate the Skip-gram model with its own loss and optimizer.
# `criterion` and `optimizer` are rebound here, so from this cell onward they
# refer to the Skip-gram objects created below.
skipgram_model = SkipGram(vocab_size, EMBEDDING_DIM)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(skipgram_model.parameters(), lr=LR)

# The index tensors are identical in every epoch, so they are built once here
# instead of being re-created for every pair on every pass over the data.
skipgram_examples = [
    (torch.tensor([word2idx[target]]), torch.tensor([word2idx[context]]))
    for target, context in skipgram_data
]

# Train with one optimizer step per (target, context) pair.
for epoch in range(EPOCHS):
    total_loss = 0  # sum of the per-pair losses seen in this epoch
    for target_index, context_index in skipgram_examples:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        output = skipgram_model(target_index)
        loss = criterion(output, context_index)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report progress every tenth epoch only, to keep the output short.
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

Epoch 0, Loss: 137.0931
Epoch 10, Loss: 96.5822
Epoch 20, Loss: 93.0257
Epoch 30, Loss: 92.0243
Epoch 40, Loss: 91.5886
Epoch 50, Loss: 91.3385
Epoch 60, Loss: 91.1720
Epoch 70, Loss: 91.0509
Epoch 80, Loss: 90.9574
Epoch 90, Loss: 90.8822


In [20]:
import torch.nn.functional as F

def get_word_embedding(model, word):
    """Return the learned embedding of ``word`` as a NumPy array.

    The word is translated to its index through the module-level ``word2idx``
    mapping and looked up in ``model.embeddings``. The result is detached from
    the autograd graph and has its size-1 dimensions squeezed away.
    """
    lookup = torch.tensor([word2idx[word]])
    vector = model.embeddings(lookup).detach()
    return vector.numpy().squeeze()  # drop the leading batch dimension

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two 1-D vectors.

    Args:
        vec1: first vector; a NumPy array, a sequence of numbers, or a tensor.
        vec2: second vector, of the same length as ``vec1``.

    Returns:
        The cosine similarity as a Python float.
    """
    def _as_float_tensor(vec):
        # as_tensor reuses an existing tensor or array instead of copying it,
        # which also avoids the copy-construct warning that torch.tensor()
        # emits when it is handed a tensor.
        tensor = torch.as_tensor(vec)
        # Integer (or bool) inputs are promoted to float so that vectors of
        # whole numbers are accepted as well.
        return tensor if tensor.is_floating_point() else tensor.float()

    first = _as_float_tensor(vec1)
    second = _as_float_tensor(vec2)
    # F.cosine_similarity reduces over dim=1 by default, so give each vector a
    # leading batch dimension of 1.
    similarity = F.cosine_similarity(first.unsqueeze(0), second.unsqueeze(0))
    return similarity.item()

In [21]:
def most_similar(model, target_word, top_n=5):
    """Rank the other vocabulary words by cosine similarity to ``target_word``.

    Every word in the module-level ``words`` collection except ``target_word``
    itself is scored against the target's embedding taken from ``model``.

    Returns:
        A list of at most ``top_n`` ``(word, similarity)`` tuples, highest
        similarity first.
    """
    anchor = get_word_embedding(model, target_word)
    scored = [
        (candidate, cosine_similarity(anchor, get_word_embedding(model, candidate)))
        for candidate in words
        if candidate != target_word  # a word is not its own neighbour
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [22]:
# Test with CBOW model
# most_similar returns up to top_n=5 (word, cosine similarity) pairs, best match first.
print("Most similar words to 'fox' using CBOW:")
print(most_similar(cbow_model, "fox"))

# Test with Skip-gram model
print("\nMost similar words to 'fox' using Skip-gram:")
print(most_similar(skipgram_model, "fox"))

Most similar words to 'fox' using CBOW:
[('quick', 0.23087164759635925), ('the', 0.04092662036418915), ('jumps', 0.01653880998492241), ('are', -0.11102814227342606), ('dog', -0.13629382848739624)]

Most similar words to 'fox' using Skip-gram:
[('is', 0.3401443660259247), ('lazy', 0.2606309652328491), ('brown', -0.05256767198443413), ('over', -0.05700305849313736), ('jumps', -0.08019979298114777)]
