In [28]:
# !pip install torchtext

In [44]:
# importing necessary libraries
import numpy as np
import pandas as pd
import sys,  torch, torch.nn as nn
from torch.utils.data import DataLoader,Dataset, TensorDataset
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.nn.utils.rnn import pad_sequence
import tqdm
from collections import Counter

In [45]:
import pandas as pd
# Path to the poems CSV, relative to the notebook's working directory.
csv_path = "dataset/poems-100.csv"

In [35]:
# tokenize:
# Read the "text" column and drop rows with no text first: astype(str) would
# otherwise turn every missing value into the literal token "nan".
poem_texts = pd.read_csv(csv_path)["text"].dropna().astype(str)
text = "\n".join(poem_texts).lower()
# Plain whitespace split, so punctuation stays attached to its neighbouring word.
tokens = text.split()
# Sorted so that each word receives the same id on every run.
vocab = sorted(set(tokens))
w2i = {w: i for i, w in enumerate(vocab)}  # word -> id
i2w = dict(enumerate(vocab))               # id -> word
# The whole corpus as one flat list of word ids.
encoded = [w2i[w] for w in tokens]

In [36]:
# sequence
# Slide a window over the encoded corpus: every run of `seq_len` consecutive
# word ids is one input, and the id right after the window is its target.
seq_len = 6
window_starts = range(len(encoded) - seq_len)
X = [encoded[start:start + seq_len] for start in window_starts]
Y = [encoded[start + seq_len] for start in window_starts]

In [37]:
# dataloader

# X and Y are rebound from lists to tensors here, so this cell sees tensors when
# it is re-run. as_tensor accepts both without the copy-construct warning that
# torch.tensor(tensor) raises, and the explicit dtype keeps the ids integer
# (which nn.Embedding requires) even if the lists are empty.
X, Y = torch.as_tensor(X, dtype=torch.long), torch.as_tensor(Y, dtype=torch.long)
loader = DataLoader(TensorDataset(X, Y), batch_size=32, shuffle=True)

In [38]:
# Model:


class RNN(nn.Module):
    """Next-word predictor: embedding -> single-layer RNN -> linear scorer.

    Args:
        vocab: Number of distinct word ids; used as the embedding table size
            and as the number of output scores.
        emb_dim: Size of each word embedding (default 64).
        hidden_dim: Size of the RNN hidden state (default 128).
    """

    def __init__(self, vocab, emb_dim=64, hidden_dim=128):
        super().__init__()
        self.emb = nn.Embedding(vocab, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab)

    def forward(self, x):
        """Score every vocabulary word as the continuation of each sequence.

        Args:
            x: Integer tensor of word ids, batch dimension first
                (the RNN is built with batch_first=True).

        Returns:
            Unnormalised scores with one row per sequence and one column per
            vocabulary entry.
        """
        embedded = self.emb(x)
        outputs, _ = self.rnn(embedded)
        # Only the output at the last time step is used for the prediction.
        return self.fc(outputs[:, -1])

In [39]:
# Seed PyTorch's global RNG before building the model so that weight
# initialisation, and later draws from the same RNG, repeat on a fresh
# Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

model = RNN(len(vocab))
opt = torch.optim.Adam(model.parameters(), lr=0.003)
loss_fn = nn.CrossEntropyLoss()

In [43]:
# training
n_epochs = 10
n_batches = len(loader)
for epoch in tqdm.tqdm(range(n_epochs)):
    # Make sure the model is in training mode even if an earlier cell
    # (e.g. one that sampled text) left it in eval mode.
    model.train()
    total = 0
    for xb, yb in loader:
        opt.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        opt.step()
        total += loss.item()
    # Mean batch loss over the epoch.
    print("epoch", epoch, "loss", total / n_batches)

 10%|█         | 1/10 [00:06<00:55,  6.13s/it]

epoch 0 loss 6.577138866212358


 20%|██        | 2/10 [00:12<00:49,  6.18s/it]

epoch 1 loss 5.461843350880513


 30%|███       | 3/10 [00:18<00:43,  6.21s/it]

epoch 2 loss 4.224829150665006


 40%|████      | 4/10 [00:24<00:37,  6.25s/it]

epoch 3 loss 3.1482478250661456


 50%|█████     | 5/10 [00:31<00:31,  6.28s/it]

epoch 4 loss 2.3660343245423627


 60%|██████    | 6/10 [00:37<00:24,  6.19s/it]

epoch 5 loss 1.8267150272989952


 70%|███████   | 7/10 [00:43<00:18,  6.15s/it]

epoch 6 loss 1.433049115218598


 80%|████████  | 8/10 [00:49<00:12,  6.13s/it]

epoch 7 loss 1.1666541586101131


 90%|█████████ | 9/10 [00:55<00:06,  6.21s/it]

epoch 8 loss 0.9708469511620594


100%|██████████| 10/10 [01:02<00:00,  6.26s/it]

epoch 9 loss 0.8266851588563191





In [None]:
#  poem generation:
def generate(seed="love is",words=40):
    """Sample a continuation of `seed` from the trained model, one word at a time.

    Args:
        seed: Starting text; it is lower-cased and split on whitespace. Words
            that are not in the training vocabulary are skipped with a warning
            instead of being replaced by an unrelated vocabulary word.
        words: Number of words to sample and append.

    Returns:
        The in-vocabulary seed words followed by the sampled words, joined by
        single spaces.

    Raises:
        ValueError: If the seed contains no word from the training vocabulary.
    """
    import warnings

    seed_words = seed.lower().split()
    unknown = [w for w in seed_words if w not in w2i]
    if unknown:
        warnings.warn(f"ignoring seed words missing from the vocabulary: {unknown}")
    idxs = [w2i[w] for w in seed_words if w in w2i]
    if not idxs:
        raise ValueError("seed must contain at least one word from the training vocabulary")

    # Remember the current mode so that sampling does not leave the model in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(words):
                # Condition on at most the last `seq_len` ids, as in training.
                x = torch.tensor([idxs[-seq_len:]], dtype=torch.long)
                p = torch.softmax(model(x), dim=-1)
                # Sample from the distribution rather than taking the argmax.
                idxs.append(torch.multinomial(p, 1).item())
    finally:
        model.train(was_training)

    return " ".join(i2w[i] for i in idxs)

# Print a heading, then one poem sampled with generate()'s default arguments.
print("\nGenerated poem:\n")
generated_poem = generate()
print(generated_poem)


Extra (possible next steps):
- Save the trained model: both the weights alone and the full model.
- Train on a larger dataset.
- Reduce the size of the model.
- Show how the data passes through the model's layers.
- Compare the poems generated at different stages of training.