# Recurrent Neural Networks

## Generating Text from a Book

In [5]:
!curl -O https://www.gutenberg.org/files/1268/1268-0.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1143k  100 1143k    0     0   829k      0  0:00:03  0:00:01  0:00:02  367k    0  0:00:01  0:00:01 --:--:--  828k


In [7]:
import numpy as np

with open('1268-0.txt', 'r', encoding='utf-8') as file:
    text = file.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')
text = text[start_indx:end_indx]
chars_set = set(text)

chars_sorted = sorted(chars_set)
char2int = {ch: i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)

print(text[:100], '== Encoding ==>', text_encoded[:100])

THE MYSTERIOUS ISLAND ***




THE MYSTERIOUS ISLAND

by Jules Verne

1874




PART 1--DROPPED FROM T == Encoding ==> [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1 37 47 40 29 42 32  1 10 10
 10  0  0  0  0  0 48 36 33  1 41 53 47 48 33 46 37 43 49 47  1 37 47 40
 29 42 32  0  0 56 79  1 38 75 66 59 73  1 50 59 72 68 59  0  0 16 23 22
 19  0  0  0  0  0 44 29 46 48  1 16 12 12 32 46 43 44 44 33 32  1 34 46
 43 41  1 48]


In [8]:
import torch
from torch.utils.data import TensorDataset, DataLoader

seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded)-chunk_size+1)]

In [9]:
from torch.utils.data import Dataset
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        return self.text_chunks[idx][:-1].long(), self.text_chunks[idx][1:].long()
    
seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [10]:
batch_size = 64
torch.manual_seed(42)
seq_dataloader = DataLoader(seq_dataset, batch_size, shuffle=True, drop_last=True)

In [11]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(
            vocab_size, # amount of characters
            embed_dim # size of the embedding vector
            )
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(
            embed_dim, # size of the input embedding vector
            rnn_hidden_size, # size of the hidden state
            batch_first=True
            )
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell

In [12]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [13]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 1000
for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dataloader))
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss += loss.item()/seq_length
    if (epoch+1) % 100 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}')

Epoch 100/1000, Loss: 70.9976
Epoch 200/1000, Loss: 61.7498
Epoch 300/1000, Loss: 58.8974
Epoch 400/1000, Loss: 61.2124
Epoch 500/1000, Loss: 54.6316
Epoch 600/1000, Loss: 55.2262
Epoch 700/1000, Loss: 54.7025
Epoch 800/1000, Loss: 55.0149
Epoch 900/1000, Loss: 55.3729
Epoch 1000/1000, Loss: 53.3664


In [17]:
from torch.distributions import Categorical

from time import sleep

def sample(model, starting_str, 
           len_generated_text=500, 
           scale_factor=1.0):

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 
    
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell) 
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        yield char_array[last_char]
        sleep(0.05)
    return generated_str

In [19]:
torch.manual_seed(1)
model.to('cpu')

stream = sample(model, starting_str='The island was', len_generated_text=500, scale_factor=0.7)

for c in stream:
    print(c, end='')

 ad shumen soepus fire it actuded, my ffirm good aurmousd, was down, but helted prupercoad as
fifteer is neim: direction of the mind
from kind thovemenable artances, and agreets a goved in an’thets, then
gentimes which imaen, Ayrton. “Deverew them kind, grautilieves with critef him! It was if they unfaund
Brot’s chowVard. 3
tide one’s
o’cquarus.
The trangars, if a horizon’, levelves qearcoll, and
fallen articlevellow at
that days in wild, but when it attempts eStrain
swagers!

It is
Washed! Cyru