In [12]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.distributions.categorical import Categorical

from textdataset import TextDataset
from rnn import RNN

In [2]:
# Read the raw e-text from disk.
with open('../Texts/shelley.txt', 'r', encoding="utf8") as fp:
    text = fp.read()

# Keep only the span that starts at the first occurrence of the opening marker
# and stops right before the trailing Project Gutenberg notice.
# str.find() returns -1 when a marker is absent, which would silently yield a
# wrong slice, so a missing marker is reported explicitly instead.
start_marker = '_To Mrs. Saville'
end_marker = 'End of the Project Gutenberg'
start_idx = text.find(start_marker)
end_idx = text.find(end_marker)
if start_idx == -1:
    raise ValueError(f'start marker {start_marker!r} not found in the text')
if end_idx == -1:
    raise ValueError(f'end marker {end_marker!r} not found in the text')

text = text[start_idx:end_idx]
# Set of distinct characters; it becomes the vocabulary in the next cell.
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 437417
Unique Characters: 91


In [3]:
# Vocabulary: every distinct character gets an integer id equal to its rank
# in sorted order.
chars_sorted = sorted(char_set)
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))
# Array form of the vocabulary, so ids can be mapped back to characters by
# array indexing.
char_array = np.array(chars_sorted)

# The whole text as one int32 id per character.
text_encoded = np.array(
    list(map(char2int.__getitem__, text)),
    dtype=np.int32)

print('Text encoded shape:', text_encoded.shape)

# Round-trip demo: encode the first 15 characters, decode the next 6 ids.
encode_demo = slice(0, 15)
decode_demo = slice(15, 21)
print(text[encode_demo], '== Encoding ==>', text_encoded[encode_demo])
decoded = ''.join(char_array[text_encoded[decode_demo]])
print(text_encoded[decode_demo], '== Reverse ==>', decoded)

# id -> character for the first five encoded positions.
for code in text_encoded[:5]:
    print(f'{code} -> {char_array[code]}')

Text encoded shape: (437417,)
_To Mrs. Savill == Encoding ==> [52 44 67  1 37 70 71 10  1 43 53 74 61 64 64]
[57  8  1 29 66 59] == Reverse ==> e, Eng
52 -> _
44 -> T
67 -> o
1 ->  
37 -> M


In [4]:
# Every training example is a window of `seq_length` + 1 consecutive character
# ids. Windows are taken at every offset (stride 1), so neighbours overlap.
seq_length = 40
chunk_size = seq_length + 1
num_chunks = len(text_encoded) - chunk_size + 1
text_chunks = [text_encoded[start:start + chunk_size]
               for start in range(num_chunks)]

## inspection: first window split into its leading `seq_length` ids and the last id
for chunk in text_chunks[:1]:
    input_seq, target = chunk[:seq_length], chunk[seq_length]
    print(input_seq, ' -> ', target)
    decoded_input = repr(''.join(char_array[input_seq]))
    decoded_target = repr(''.join(char_array[target]))
    print(decoded_input, ' -> ', decoded_target)

# Stack the windows into a single tensor and hand it to the project's
# TextDataset, which yields (input, target) pairs when iterated.
chunk_tensor = torch.tensor(np.array(text_chunks))
seq_dataset = TextDataset(chunk_tensor)

# Print the first two (input, target) pairs decoded back to characters.
for idx, (seq, target) in enumerate(seq_dataset):
    print(' Input (x): ', repr(''.join(char_array[seq])))
    print('Target (y): ', repr(''.join(char_array[target])))
    print()
    if idx >= 1:
        break

[52 44 67  1 37 70 71 10  1 43 53 74 61 64 64 57  8  1 29 66 59 64 53 66
 56 10 52  0  0  0 43 72 10  1 40 57 72 57 70 71]  ->  54
'_To Mrs. Saville, England._\n\n\nSt. Peters'  ->  'b'
 Input (x):  '_To Mrs. Saville, England._\n\n\nSt. Peters'
Target (y):  'To Mrs. Saville, England._\n\n\nSt. Petersb'

 Input (x):  'To Mrs. Saville, England._\n\n\nSt. Petersb'
Target (y):  'o Mrs. Saville, England._\n\n\nSt. Petersbu'



In [5]:
# Device that the training loop moves each input/target batch to.
device = 'cpu'

In [6]:
# Mini-batch loader over the sequence dataset. torch's global RNG is seeded
# right before the loader is built; `drop_last` discards a final short batch
# so every batch has exactly `batch_size` rows.
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(
    seq_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

### Creating the model

In [7]:
# Hyper-parameters: one vocabulary entry per distinct character.
vocab_size = char_array.shape[0]
embed_dim = 256
rnn_hidden_size = 512

# Seed torch's RNG immediately before constructing the model
# (presumably so the initial weights are reproducible - RNN is defined in rnn.py).
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)

# Loss and optimizer: cross-entropy over the predicted logits, Adam on all
# model parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

### Training the model

In [8]:
num_epochs = 10_000
torch.manual_seed(1)

# Keep the model on the same device the batches are moved to, and make sure it
# is in training mode: the text-generation cell calls model.eval(), so
# re-running this cell afterwards would otherwise train in eval mode.
model.to(device)
model.train()

for epoch in range(num_epochs):
    # Fresh recurrent state for every step, placed on the training device.
    hidden, cell = model.init_hidden(batch_size)
    hidden, cell = hidden.to(device), cell.to(device)

    # Each "epoch" here is ONE optimisation step on ONE batch: a new iterator
    # over the shuffled loader is created every time and only its first batch
    # is consumed, i.e. a random batch is drawn per step rather than sweeping
    # the whole dataset.
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()

    # Feed the sequence one character position at a time, carrying the
    # recurrent state forward and summing the per-position losses.
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()

    # Report the mean loss per character position; kept in a separate name so
    # `loss` is not rebound from a tensor to a float.
    avg_loss = loss.item() / seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {avg_loss:.4f}')

Epoch 0 loss: 4.5124
Epoch 500 loss: 1.6026
Epoch 1000 loss: 1.4095
Epoch 1500 loss: 1.3036
Epoch 2000 loss: 1.2517
Epoch 2500 loss: 1.2674
Epoch 3000 loss: 1.1549
Epoch 3500 loss: 1.1479
Epoch 4000 loss: 1.0934
Epoch 4500 loss: 1.0594
Epoch 5000 loss: 1.0699
Epoch 5500 loss: 0.9967
Epoch 6000 loss: 0.9868
Epoch 6500 loss: 0.9610
Epoch 7000 loss: 0.9275
Epoch 7500 loss: 0.9387
Epoch 8000 loss: 0.8852
Epoch 8500 loss: 0.8915
Epoch 9000 loss: 0.8911
Epoch 9500 loss: 0.8253


### Saving the model for later use

In [9]:
# Serialise the whole model object to disk. This pickles the module itself
# rather than only its state_dict, so loading the file later needs the RNN
# class to be importable.
path = 'shelley_generator.pt'
torch.save(obj=model, f=path)

### Generating new text

In [18]:
def sample(model, starting_str, 
           len_generated_text=500, 
           scale_factor=2.0):
    """Generate text character by character, continuing `starting_str`.

    The prompt is encoded with the notebook-level `char2int` mapping and all
    but its last character are fed through the model to warm up the recurrent
    state. Then `len_generated_text` characters are sampled one at a time,
    each sampled character becoming the next input.

    Args:
        model: object exposing `eval()`, `init_hidden(batch_size)` returning
            `(hidden, cell)`, and a call `model(char_ids, hidden, cell)`
            returning `(logits, hidden, cell)`. Inputs and state are kept on
            the CPU here, so the model is expected to be on the CPU as well.
        starting_str: non-empty prompt; every character must be a key of
            `char2int`.
        len_generated_text: number of characters to sample after the prompt.
        scale_factor: multiplier applied to the logits before sampling.
            Values above 1 sharpen the distribution (more predictable text),
            values below 1 flatten it (more random text).

    Returns:
        `starting_str` followed by the sampled characters.

    Raises:
        ValueError: if `starting_str` is empty.
        KeyError: if `starting_str` contains a character missing from
            `char2int`.

    Note:
        The model is switched to eval mode and left that way.
    """
    if not starting_str:
        # Without at least one character there is nothing to seed the first
        # sampling step with.
        raise ValueError('starting_str must contain at least one character')

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    # Collect pieces in a list and join once instead of growing a string.
    generated_chars = [starting_str]

    model.eval()
    # Generation never back-propagates; disabling autograd stops the graph
    # from growing across every step through the carried hidden state.
    with torch.no_grad():
        hidden, cell = model.init_hidden(1)
        hidden = hidden.to('cpu')
        cell = cell.to('cpu')

        # Warm-up: run the prompt (except its last character) through the
        # model only to update the recurrent state.
        for c in range(len(starting_str) - 1):
            _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

        last_char = encoded_input[:, -1]
        for _ in range(len_generated_text):
            logits, hidden, cell = model(last_char.view(1), hidden, cell)
            logits = torch.squeeze(logits, 0)
            scaled_logits = logits * scale_factor
            # Draw the next character id from the scaled categorical
            # distribution rather than taking the arg-max.
            last_char = Categorical(logits=scaled_logits).sample()
            generated_chars.append(str(char_array[last_char]))

    return ''.join(generated_chars)

# Seed the RNG so the sampled continuation is reproducible, make sure the
# model is on the CPU (sample() keeps its inputs and state there), then
# generate from the prompt and show the result.
torch.manual_seed(1)
model.to('cpu')
generated_text = sample(model, starting_str='Victor')
print(generated_text)

Victor,
but when I conjectured that the sun was of a calmer and let me for ever on the stars or the view of the black earth. Yet he threw human beings. Alas! I returned hopeless and the sea roared in a part of the
ice had been accustomed, but you may have power to return to my labours. I took the instrument of my conversation,
which by the greatest places of man. These every hand only one wandered in the world with a subject for me. Yet he had passed three entirely occapions of the neighbourhood of th
