# WK14

## RNNs

### Classification + Classification + Classification + ...

- The output of the network at one step becomes an input for the next prediction
- This allows us to handle input sequences of different lengths

#### Code:
- https://machinelearningmastery.com/lstm-for-time-series-prediction-in-pytorch/

#### Explanation:
- https://machinelearningmastery.com/an-introduction-to-recurrent-neural-networks-and-the-math-that-powers-them/
- https://machinelearningmastery.com/models-sequence-prediction-recurrent-neural-networks/
- https://medium.com/@prudhviraju.srivatsavaya/lstm-vs-gru-c1209b8ecb5a
- https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
- https://www.kaggle.com/code/dota2player/next-word-prediction-with-lstm-pytorch

In [None]:
# Fetch the course helper module (the notebook imports TextSequenceDataset from it).
!wget -q https://github.com/DM-GY-9103-2024F-H/WK14/raw/main/WK14_utils.py

# Download the Dickinson corpus into ./data/text.
!wget -q -P ./data/text https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/datasets/text/dickinson.txt
# Stream the rappers archive straight into tar instead of saving it first.
# A later cell reads ./data/text/rappers.csv, so the archive presumably
# unpacks to that path — TODO confirm.
!wget -qO- https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/datasets/text/rappers.tar.gz | tar xz

In [None]:
import pandas as pd
import torch

from random import choice

from torch import nn, Tensor
from torch.utils.data import DataLoader

from WK14_utils import TextSequenceDataset

In [None]:
class NGramDataset(TextSequenceDataset):
  """Sliding-window dataset of (context, next word) pairs.

  Item `idx` pairs the `window` consecutive encoded words starting at `idx`
  with the encoded word that immediately follows them.

  Relies on the base class for `self.words`, `self.window` and `encode()`;
  TextSequenceDataset lives in WK14_utils and is not shown here, so their
  exact types are assumed — TODO confirm.
  """

  def __init__(self, text, max_words=200_000, window=2):
    super().__init__(text, max_words, window, symmetric_context=False)
    # Encode the whole corpus once so item lookup is plain indexing/slicing.
    self.words_t = self.encode(self.words)

  def __len__(self):
    # One sample per position that still has a following word to predict.
    return len(self.words) - self.window

  def __getitem__(self, idx):
    # `stop` is both the exclusive end of the context and the target index.
    stop = idx + self.window
    return self.words_t[idx:stop], self.words_t[stop]

In [None]:
# Read the Dickinson corpus as a list of lines. The encoding is pinned to
# UTF-8 so decoding does not depend on the platform's default locale.
with open("./data/text/dickinson.txt", "r", encoding="utf-8") as f:
  dickinson_text = f.read().split("\n")

In [None]:
# Load the rap-lyrics table and pull out its "lyric" column as an array,
# an alternative corpus to the Dickinson text.
lyrics_df = pd.read_csv("./data/text/rappers.csv")
rapper_text = lyrics_df.loc[:, "lyric"].values

In [None]:
# Next-word training pairs from the Dickinson corpus: each sample is a
# 5-word context plus the word that follows it.
dataset = NGramDataset(
  text=dickinson_text,
  max_words=500_000,
  window=5,
)
# Serve the samples in shuffled batches of 4096.
train_dl = DataLoader(dataset=dataset, batch_size=4096, shuffle=True)

In [None]:
class NextWordGRU(nn.Module):
  """Next-word classifier: embedding -> multi-layer GRU -> linear head.

  Args:
    vocab_size: number of distinct token ids; also the number of output logits.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the GRU hidden state.
    num_layers: number of stacked GRU layers.
  """

  def __init__(self, vocab_size, embedding_dim=64, hidden_dim=256, num_layers=2):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x, hidden=None):
    """Score every vocabulary word as the continuation of each sequence.

    Args:
      x: integer token ids of shape (batch, seq_len).
      hidden: optional initial GRU state of shape
        (num_layers, batch, hidden_dim). `None` (the default) lets the GRU
        start from an all-zero state.

    Returns:
      A `(logits, hidden)` tuple: `logits` has shape (batch, vocab_size) and
      `hidden` is the final GRU state of shape (num_layers, batch, hidden_dim).
    """
    x = self.embedding(x)
    out, hidden = self.gru(x, hidden)
    # Only the last time step is used to predict the next word.
    out = self.fc(out[:, -1, :])
    return out, hidden

In [None]:
# Train on the GPU when one is available.
mdevice = "cuda" if torch.cuda.is_available() else "cpu"

model = NextWordGRU(vocab_size=len(dataset.wtoi), embedding_dim=64, hidden_dim=256, num_layers=2).to(mdevice)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Sanity check: push one batch through the untrained model and inspect shapes.
# Named sample_* so the builtin `input` is not shadowed.
sample_context, sample_target = next(iter(train_dl))
print(sample_context.shape, sample_target.shape)

# The model lives on `mdevice`, so the batch has to be moved there as well;
# otherwise the forward pass fails with a device mismatch on CUDA.
output, hidden = model(sample_context.to(mdevice), None)
print(output.shape, hidden.shape)

In [None]:
# Train for 32 epochs, reporting progress every 4th epoch.
for e in range(32):
  model.train()
  for context, target in train_dl:
    # Batches must be on the same device as the model (matters on CUDA).
    context, target = context.to(mdevice), target.to(mdevice)
    optim.zero_grad()
    # Each batch holds independent windows, so the GRU starts from a fresh
    # (zero) state every time and the returned state is discarded.
    output, _ = model(context, None)
    loss = loss_fn(output, target)
    loss.backward()
    optim.step()

  if e % 4 == 3:
    # `loss` is the loss of the last batch of this epoch, not an epoch average.
    print(f"Epoch: {e} loss: {loss.item():.4f}")

In [None]:
# Ask the trained model for the most likely words to follow "you".
# unsqueeze(0) adds the batch dimension; the query must sit on the same
# device as the model.
query = dataset.encode(["you"]).unsqueeze(0).to(mdevice)

model.eval()
with torch.no_grad():
  output, _ = model(query, None)
  # Drop the batch dimension and bring the logits back to the CPU so the
  # dataset's decode helpers receive CPU tensors.
  output = output.squeeze().cpu()
  top1 = output.argmax()
  top5 = output.argsort(descending=True)[:5]
  print(dataset.decode_word(top1))
  print(dataset.decode(top5))

In [None]:
# Generate 10 more words, one at a time, feeding the growing phrase back in.
# NOTE(review): the single-query cell uses lowercase "you"; confirm that
# encode() normalizes case, otherwise "You" may not be in the vocabulary.
phrase = ["You"]
# At each step, pick at random among this many top-scoring words.
num_candidates = 2

model.eval()
with torch.no_grad():
  for _ in range(10):
    # The whole phrase is re-encoded every step, so the GRU starts from a
    # fresh state (None) and the returned hidden state is not carried over.
    query = dataset.encode(phrase).unsqueeze(0).to(mdevice)
    output, _hidden = model(query, None)
    # Back to the CPU so the dataset's decode helper receives CPU tensors.
    output = output.squeeze().cpu()
    candidates = dataset.decode(output.argsort(descending=True)[:num_candidates])
    phrase.append(choice(candidates))

print(phrase)

## Possible Next Steps

- Implement a translator between two text corpora
  - Get a phrase from one text: [W0, W1, W2, etc]
  - Get list of embeddings and their relative distances/directions
  - Find embedding of W0 in second dataset
  - Follow the directions from first dataset, while moving around the second dataset