# WK14

## NLP: Word2Vec

### One-Hot Encoding + Classification = Embedding

- Train a network on a task that forces it to encode information about the relationships in your data
- In the end we throw away the last layers and just use the encoded information for other purposes

It's like a more complex and rich form of finding covariances, where we end up with an abstract representation of the data. With text, we can train a network to predict words in a sequence, and then use the embeddings to perform search, similarity comparisons, generation, completion, visualizations, and other tasks.

#### Code:
- https://medium.com/@patrykmwieczorek/mastering-nlp-with-pytorch-word2vec-60a54030c720

#### Explanation:
- https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [None]:
# Fetch the course helper module that the next cell imports from (WK14_utils).
!wget -q https://github.com/DM-GY-9103-2024F-H/WK14/raw/main/WK14_utils.py

# Download the Dickinson text into ./data/text (-P sets the target directory).
!wget -q -P ./data/text https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/datasets/text/dickinson.txt
# Stream the rappers archive to stdout and unpack it in the working directory.
# NOTE(review): presumably it extracts to ./data/text/rappers.csv, which a later cell reads - confirm.
!wget -qO- https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/datasets/text/rappers.tar.gz | tar xz

In [None]:
import pandas as pd
import torch

from torch import nn, Tensor
from torch.utils.data import DataLoader

from WK14_utils import TextUtils, TextSequenceDataset

### SkipGram

This network creates embeddings by learning to predict the context words that appear near a given center (target) word.

In [None]:
class SkipGramDataset(TextSequenceDataset):
  """Dataset of (center word, context word) index pairs for skip-gram training.

  Every non-stopword token becomes a center word and is paired with each
  non-stopword token inside its context window. Pairs are stored as two
  parallel long tensors, ``X`` (centers) and ``Y`` (contexts).

  NOTE(review): ``self.words``, ``self.wtoi``, ``self.window``,
  ``self.symmetric_context`` and ``self.device`` are presumably set by
  ``TextSequenceDataset.__init__`` - confirm against WK14_utils.
  """

  def __init__(self, text, max_words=200_000, window=2, symmetric_context=True):
    super().__init__(text, max_words, window, symmetric_context)
    self.X, self.Y = self.create_dataset(self.words, self.wtoi, self.window, self.symmetric_context)
    assert len(self.X) == len(self.Y)

  def create_dataset(self, words, wtoi, window, symmetric_context):
    """Build the parallel center/context index tensors.

    Args:
      words: sequence of string tokens, in corpus order.
      wtoi: mapping from lower-cased token to integer index.
      window: how many positions away from the center a context word may be.
      symmetric_context: if True look ``window`` tokens to both sides of the
        center; if False only look at the ``window`` tokens that follow it.

    Returns:
      Tuple ``(xs, ys)`` of equal-length long tensors on ``self.device``.

    Raises:
      KeyError: if a kept token is missing from ``wtoi``.
    """
    # A set gives O(1) membership tests inside the nested loop below.
    stopwords = set(TextUtils.stopwords + ["=", ":", ",", "(", ")", "{", "}", "[", "]"])
    last_idx = len(words) - 1
    xs, ys = [], []

    for i, word in enumerate(words):
      # Compare the same lower-cased form that is used for the vocabulary
      # lookup, so capitalised stopwords are filtered out as well.
      center_token = word.lower()
      if center_token in stopwords:
        continue
      center_word = wtoi[center_token]

      # Clamp the window to the corpus bounds instead of testing every j.
      first = max(i - window if symmetric_context else i + 1, 0)
      last = min(i + window, last_idx)
      for j in range(first, last + 1):
        if j == i:
          continue
        context_token = words[j].lower()
        if context_token in stopwords:
          continue
        xs.append(center_word)
        ys.append(wtoi[context_token])

    # Build integer tensors directly rather than converting via float32.
    centers = torch.tensor(xs, dtype=torch.long).to(self.device)
    contexts = torch.tensor(ys, dtype=torch.long).to(self.device)
    return centers, contexts

  def __getitem__(self, idx):
    """Return one ``(center, context)`` pair, or a list of pairs for a slice."""
    if isinstance(idx, slice):
      return list(zip(self.X[idx], self.Y[idx]))
    return (self.X[idx], self.Y[idx])

  def __len__(self):
    """Return the number of (center, context) pairs."""
    return len(self.X)

In [None]:
# Read the Dickinson corpus as a list with one string per "\n"-separated line.
# An explicit encoding keeps decoding independent of the platform default.
with open("./data/text/dickinson.txt", "r", encoding="utf-8") as f:
  dickinson_text = f.read().split("\n")

In [None]:
# Load the rappers table and keep the values of its "lyric" column.
lyrics_df = pd.read_csv(filepath_or_buffer="./data/text/rappers.csv")
rapper_text = lyrics_df.loc[:, "lyric"].values

In [None]:
# Build forward-only (symmetric_context=False) skip-gram pairs with a window of 3
# from the Dickinson lines, then serve them in shuffled batches of 4096 pairs.
dataset = SkipGramDataset(
  dickinson_text,
  max_words=500_000,
  window=3,
  symmetric_context=False,
)
train_dl = DataLoader(dataset=dataset, shuffle=True, batch_size=4096)

In [None]:
class SkipGram(nn.Module):
  """Skip-gram model that scores every vocabulary word as context for a center word.

  Two embedding tables of shape (vocab_size, embed_dim) are kept: one looked
  up for center words and one whose rows are scored as context words.
  """

  # Metric names accepted by get_N_closest; "sine" is the original key for
  # cosine similarity and "cosine" is accepted as an alias.
  _COSINE_METRICS = ("sine", "cosine")
  _DISTANCE_METRICS = ("lnorm",)

  def __init__(self, vocab_size, embed_dim=128):
    super().__init__()
    self.center_embeds = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
    self.context_embeds = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

  def forward(self, x):
    """Return unnormalized context scores, one per vocabulary word, for center indices ``x``."""
    center_word = self.center_embeds(x)
    # Dot product of each center embedding with every context embedding.
    scores = torch.matmul(center_word, self.context_embeds.weight.t())
    return scores

  def get_embedding(self, x):
    """Return the center-word embedding(s) for index tensor ``x``."""
    return self.center_embeds(x)

  def get_N_closest(self, x_emb, N=5, metric="lnorm"):
    """Find the N center embeddings closest to ``x_emb``.

    Args:
      x_emb: query embedding of shape (embed_dim,) or (1, embed_dim), or a
        batch of queries of shape (B, embed_dim).
      N: number of neighbours to return per query.
      metric: "sine" / "cosine" for cosine similarity (largest first) or
        "lnorm" for the p=2 distance from ``torch.cdist`` (smallest first).

    Returns:
      The ``torch.topk`` result ``(values, indices)``; shape (N,) for a
      single query and (B, N) for a batch. If ``x_emb`` is itself a row of
      the center table, that row is included among the candidates.

    Raises:
      ValueError: if ``metric`` is not one of the supported names.
    """
    if metric not in self._COSINE_METRICS + self._DISTANCE_METRICS:
      raise ValueError(
        f"unknown metric {metric!r}; expected one of "
        f"{self._COSINE_METRICS + self._DISTANCE_METRICS}"
      )

    # Treat a bare vector as a batch of one so both metrics accept it.
    if x_emb.dim() == 1:
      x_emb = x_emb.unsqueeze(0)
    all_emb = self.center_embeds.weight

    if metric in self._COSINE_METRICS:
      # Insert a broadcast axis so every query is compared with every row,
      # rather than relying on (1, D) vs (V, D) broadcasting.
      cos_sim = nn.CosineSimilarity(dim=-1)
      similarities = cos_sim(x_emb.unsqueeze(-2), all_emb).squeeze()
      largest = True
    else:
      similarities = torch.cdist(x_emb, all_emb).squeeze()
      largest = False
    return torch.topk(similarities, k=N, largest=largest)

  def get_N_closest_idx(self, x_idx, N=5, metric="lnorm"):
    """Return the vocabulary indices of the N words closest to word index ``x_idx``."""
    # get word embedding
    x_emb = self.get_embedding(x_idx)

    # use embedding distances to return top-N similar word_idxs
    _, indices = self.get_N_closest(x_emb, N=N, metric=metric)
    return indices

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  mdevice = "cuda"
else:
  mdevice = "cpu"

# One embedding row per vocabulary entry, 64 dimensions each.
model = SkipGram(vocab_size=len(dataset.wtoi), embed_dim=64)
model = model.to(mdevice)
optim = torch.optim.Adam(params=model.parameters(), lr=5e-3)
loss_fn = nn.CrossEntropyLoss()

# Sanity check: pull a single batch and print the input and output shapes.
first_batch = next(iter(train_dl))
ctr, ctx = first_batch
print(ctr.shape, ctx.shape)

ctx_pred = model(ctr)
print(ctx_pred.shape)

In [None]:
# Train the skip-gram model, reporting the mean per-batch loss of every
# REPORT_EVERY-th epoch. Averaging over the epoch avoids reporting only the
# last (shuffled, possibly partial) mini-batch.
NUM_EPOCHS = 32
REPORT_EVERY = 4

for e in range(NUM_EPOCHS):
  model.train()
  epoch_loss = 0.0
  num_batches = 0
  for center, context in train_dl:
    optim.zero_grad()
    context_pred = model(center)
    loss = loss_fn(context_pred, context)
    loss.backward()
    optim.step()
    epoch_loss += loss.item()
    num_batches += 1

  if (e + 1) % REPORT_EVERY == 0:
    # max(..., 1) keeps the report well-defined if the loader yielded no batches.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f"Epoch: {e} loss: {mean_loss:.4f}")

In [None]:
# Encode the query word, then look up its 5 nearest neighbours under each metric.
query = dataset.encode_word("wild", return_tensors=True)

top5s = model.get_N_closest_idx(query, metric="sine", N=5)
top5l = model.get_N_closest_idx(query, metric="lnorm", N=5)

# Print the decoded neighbours: the "sine" result first, then the "lnorm" result.
for neighbours in (top5s, top5l):
  print(dataset.decode(neighbours))

In [None]:
# Embed a short sequence of words and take the step between consecutive embeddings.
# NOTE: no torch.no_grad() is active here; wrap these lines in it if gradients are not needed.
q = dataset.encode(['wild', 'swoon', 'medicine', 'austere', 'stays'])
q_e = model.get_embedding(q)
# Row k of q_diff is (embedding of word k+1) - (embedding of word k).
q_diff = q_e[1:] - q_e[:-1]

## Possible Next Steps

- Implement a translator between two text corpora
  - Get a phrase from one text: [W0, W1, W2, etc]
  - Get list of embeddings and their relative distances/directions
  - Find embedding of W0 in second dataset
  - Follow the directions from first dataset, while moving around the second dataset