# Embeddings Demo using PyTorch

## Word2Vec

### Import Necessary Libraries

In [105]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Prepare Corpus and **Vocabulary**

In [106]:
# Step 1: Prepare a simple corpus
corpus = "He is the king . The king is royal. She is the royal queen . He is a prince, and she is a princess ."

# Step 2: Tokenize and build vocabulary
tokens = corpus.lower().split()  # Tokenize and lower-case words
vocab = set(tokens)  # Create a set of unique words
word2idx = {word: idx for idx, word in enumerate(vocab)}  # Create word to index mapping
idx2word = {idx: word for word, idx in word2idx.items()}  # Create index to word mapping
vocab_size = len(vocab)  # Calculate the size of vocabulary

In [107]:
print("Tokens: ", tokens)
print("Vocabulary: ", vocab)
print("Word to index mapping: ", word2idx)
print("Index to word mapping: ", idx2word)
print("Vocabulary size: ", vocab_size)

Tokens:  ['he', 'is', 'the', 'king', '.', 'the', 'king', 'is', 'royal.', 'she', 'is', 'the', 'royal', 'queen', '.', 'he', 'is', 'a', 'prince,', 'and', 'she', 'is', 'a', 'princess', '.']
Vocabulary:  {'king', 'and', 'a', 'he', 'is', 'royal.', 'she', 'royal', '.', 'princess', 'queen', 'prince,', 'the'}
Word to index mapping:  {'king': 0, 'and': 1, 'a': 2, 'he': 3, 'is': 4, 'royal.': 5, 'she': 6, 'royal': 7, '.': 8, 'princess': 9, 'queen': 10, 'prince,': 11, 'the': 12}
Index to word mapping:  {0: 'king', 1: 'and', 2: 'a', 3: 'he', 4: 'is', 5: 'royal.', 6: 'she', 7: 'royal', 8: '.', 9: 'princess', 10: 'queen', 11: 'prince,', 12: 'the'}
Vocabulary size:  13


### Create Dataset

In [108]:
context_window = 2  # Define the size of context window
data = []  # Initialize empty list to hold data

# Loop through each token and extract its context and target word
for i in range(context_window, len(tokens) - context_window):
    context = [tokens[i - t] for t in range(context_window, 0, -1)] + [tokens[i + t] for t in range(1, context_window + 1)]
    target = tokens[i]
    data.append((context, target))  # Append the context and target word as a tuple to the data list

### Define Skip-gram Model

In [109]:
# Step 4: Define the Skip-gram model architecture
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(SkipGram, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_dim)
        self.out_embed = nn.Embedding(vocab_size, embed_dim)
        
        
    def forward(self, target, context):
        in_embeds = self.in_embed(target)
        out_embeds = self.out_embed(context)
        scores = torch.matmul(out_embeds, in_embeds.t())
        return scores.squeeze()

### Initialize Model and Train

In [110]:
# Step 5: Initialize the model, loss, and optimizer
embed_dim = 50
model = SkipGram(vocab_size, embed_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [111]:
print(model)

SkipGram(
  (in_embed): Embedding(13, 50)
  (out_embed): Embedding(13, 50)
)


In [112]:
# Step 6: Training loop
epochs = 10
for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        print(context, target)
        context_idx = torch.tensor([word2idx[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor([word2idx[target]], dtype=torch.long)
        
        print("Max context_idx:", torch.max(context_idx).item())  # Debug line
        print("Min context_idx:", torch.min(context_idx).item())  # Debug line

        print("context_idx shape:", context_idx.shape)  # Debug line
        print(context_idx)  # Debug line
        optimizer.zero_grad()
        scores = model(target_idx, context_idx)
        # change scores to be a float tensor

        print("Scores", scores)
        print("Scores shape:", scores.shape)  # Debug line
        print("Score dtype: ", scores.dtype)  # Debug line

        print("context_idx dtype: ", context_idx.dtype)  # Debug line

        context_idx = context_idx.float()
        print("context_idx: ", context_idx)
        loss = criterion(scores, context_idx)
        print(loss)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        print("Loss: ", loss.item()) # Debug line
        
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(data):.4f}')

['he', 'is', 'king', '.'] the
Max context_idx: 8
Min context_idx: 0
context_idx shape: torch.Size([4])
tensor([3, 4, 0, 8])
Scores tensor([  6.0168,  -6.7091,   3.7314, -10.5706], grad_fn=<SqueezeBackward0>)
Scores shape: torch.Size([4])
Score dtype:  torch.float32
context_idx dtype:  torch.int64
context_idx:  tensor([3., 4., 0., 8.])
tensor(185.0557, grad_fn=<DivBackward1>)
Loss:  185.0556640625
['is', 'the', '.', 'the'] king
Max context_idx: 12
Min context_idx: 4
context_idx shape: torch.Size([4])
tensor([ 4, 12,  8, 12])
Scores tensor([-7.5164, -4.4246, 10.0371, -4.4246], grad_fn=<SqueezeBackward0>)
Scores shape: torch.Size([4])
Score dtype:  torch.float32
context_idx dtype:  torch.int64
context_idx:  tensor([ 4., 12.,  8., 12.])
tensor(417.2960, grad_fn=<DivBackward1>)
Loss:  417.29595947265625
['the', 'king', 'the', 'king'] .
Max context_idx: 12
Min context_idx: 0
context_idx shape: torch.Size([4])
tensor([12,  0, 12,  0])
Scores tensor([12.4147, -1.2226, 12.4147, -1.2226], grad_f

### Evaluate Model

In [113]:
# Step 7: Evaluation
def most_similar(word, word2idx, idx2word, embedding_matrix, topk=5):
    word_embedding = embedding_matrix[word2idx[word]]
    similarities = cosine_similarity([word_embedding], embedding_matrix)[0]
    print([tup for tup in zip(idx2word.values(), similarities)])
    similar_words = [(idx2word[idx], similarities[idx]) for idx in np.argsort(similarities, axis=-1)[-topk-1:-1][::-1]]
    return similar_words

In [114]:
embedding_matrix = model.in_embed.weight.data.numpy()
# print(embedding_matrix) # Debug line
most_similar_words = most_similar('king', word2idx, idx2word, embedding_matrix, topk=5)
print(most_similar_words)

[('king', 1.0), ('and', 0.0076950155), ('a', 0.2688496), ('he', -0.07239942), ('is', 0.09491261), ('royal.', 0.033431225), ('she', -0.116904765), ('royal', -0.22678074), ('.', 0.23785788), ('princess', -0.067026526), ('queen', 0.06646336), ('prince,', -0.104524344), ('the', 0.082939275)]
[('a', 0.2688496), ('.', 0.23785788), ('is', 0.09491261), ('the', 0.082939275), ('queen', 0.06646336)]
