# LSTM neural network trained on the book "Cadaver Exquisito" by Agustina Bazterrica

In [5]:
!pip3 install torch
!pip3 install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [6]:
import torch 
import numpy as np
import torch.nn as nn
import fitz
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
torch.manual_seed(6)

<torch._C.Generator at 0x12039e9b0>

In [34]:
tokenized_text = []
f = fitz.open("Cadaver_exquisito_Agustina_Maria_Bazterrica.pdf")
for page in f:
    tokenized_text += list(page.get_text())
print(tokenized_text[:20])
print(len(tokenized_text))

max = tokenized_text[0]
for i in tokenized_text:
    if len(i) > len(max):
        max = i
print(max)


['L', 'a', ' ', 's', 'ú', 'b', 'i', 't', 'a', ' ', 'a', 'p', 'a', 'r', 'i', 'c', 'i', 'ó', 'n', ' ']
264207
L


In [8]:
unique_tokens = sorted(list(set(tokenized_text)))
vocab_size = len(unique_tokens)
print(unique_tokens)
print(vocab_size)

['\n', ' ', '!', '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '«', '»', '¿', 'Á', 'É', 'Í', 'Ú', 'á', 'é', 'í', 'ñ', 'ó', 'ú', 'ü', '—', '…', 'ﬁ', 'ﬂ']
92


In [9]:
char_to_id = {char: id for id, char in enumerate(unique_tokens)}
print(char_to_id)

{'\n': 0, ' ': 1, '!': 2, '(': 3, ')': 4, ',': 5, '-': 6, '.': 7, '/': 8, '0': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, '9': 18, ':': 19, ';': 20, '?': 21, 'A': 22, 'B': 23, 'C': 24, 'D': 25, 'E': 26, 'F': 27, 'G': 28, 'H': 29, 'I': 30, 'J': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 39, 'S': 40, 'T': 41, 'U': 42, 'V': 43, 'W': 44, 'Y': 45, 'Z': 46, 'a': 47, 'b': 48, 'c': 49, 'd': 50, 'e': 51, 'f': 52, 'g': 53, 'h': 54, 'i': 55, 'j': 56, 'k': 57, 'l': 58, 'm': 59, 'n': 60, 'o': 61, 'p': 62, 'q': 63, 'r': 64, 's': 65, 't': 66, 'u': 67, 'v': 68, 'w': 69, 'x': 70, 'y': 71, 'z': 72, '¡': 73, '«': 74, '»': 75, '¿': 76, 'Á': 77, 'É': 78, 'Í': 79, 'Ú': 80, 'á': 81, 'é': 82, 'í': 83, 'ñ': 84, 'ó': 85, 'ú': 86, 'ü': 87, '—': 88, '…': 89, 'ﬁ': 90, 'ﬂ': 91}


In [10]:
id_to_char = {id: char for char, id in char_to_id.items()}
print(id_to_char)

{0: '\n', 1: ' ', 2: '!', 3: '(', 4: ')', 5: ',', 6: '-', 7: '.', 8: '/', 9: '0', 10: '1', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8', 18: '9', 19: ':', 20: ';', 21: '?', 22: 'A', 23: 'B', 24: 'C', 25: 'D', 26: 'E', 27: 'F', 28: 'G', 29: 'H', 30: 'I', 31: 'J', 32: 'K', 33: 'L', 34: 'M', 35: 'N', 36: 'O', 37: 'P', 38: 'Q', 39: 'R', 40: 'S', 41: 'T', 42: 'U', 43: 'V', 44: 'W', 45: 'Y', 46: 'Z', 47: 'a', 48: 'b', 49: 'c', 50: 'd', 51: 'e', 52: 'f', 53: 'g', 54: 'h', 55: 'i', 56: 'j', 57: 'k', 58: 'l', 59: 'm', 60: 'n', 61: 'o', 62: 'p', 63: 'q', 64: 'r', 65: 's', 66: 't', 67: 'u', 68: 'v', 69: 'w', 70: 'x', 71: 'y', 72: 'z', 73: '¡', 74: '«', 75: '»', 76: '¿', 77: 'Á', 78: 'É', 79: 'Í', 80: 'Ú', 81: 'á', 82: 'é', 83: 'í', 84: 'ñ', 85: 'ó', 86: 'ú', 87: 'ü', 88: '—', 89: '…', 90: 'ﬁ', 91: 'ﬂ'}


In [11]:
tokenized_id_text = [char_to_id[i] for i in tokenized_text]
print(tokenized_id_text)

[33, 47, 1, 65, 86, 48, 55, 66, 47, 1, 47, 62, 47, 64, 55, 49, 55, 85, 60, 1, 50, 51, 1, 67, 60, 1, 68, 55, 64, 67, 65, 1, 58, 51, 66, 47, 58, 1, 63, 67, 51, 1, 47, 66, 47, 49, 47, 1, 47, 1, 58, 61, 65, 0, 47, 60, 55, 59, 47, 58, 51, 65, 1, 59, 61, 50, 55, 90, 49, 47, 1, 50, 51, 1, 59, 47, 60, 51, 64, 47, 1, 55, 64, 64, 51, 68, 51, 64, 65, 55, 48, 58, 51, 1, 51, 58, 1, 59, 67, 60, 50, 61, 19, 1, 50, 51, 65, 50, 51, 0, 58, 47, 65, 1, 90, 51, 64, 47, 65, 1, 54, 47, 65, 66, 47, 1, 58, 47, 65, 1, 59, 47, 65, 49, 61, 66, 47, 65, 1, 50, 51, 48, 51, 60, 1, 65, 51, 64, 1, 65, 55, 65, 66, 51, 59, 81, 66, 55, 49, 47, 59, 51, 60, 66, 51, 0, 65, 47, 49, 64, 55, 90, 49, 47, 50, 47, 65, 5, 1, 71, 1, 65, 67, 1, 49, 47, 64, 60, 51, 1, 71, 47, 1, 60, 61, 1, 62, 67, 51, 50, 51, 1, 65, 51, 64, 1, 49, 61, 60, 65, 67, 59, 55, 50, 47, 7, 1, 33, 61, 65, 0, 53, 61, 48, 55, 51, 64, 60, 61, 65, 1, 51, 60, 52, 64, 51, 60, 66, 47, 60, 1, 58, 47, 1, 65, 55, 66, 67, 47, 49, 55, 85, 60, 1, 49, 61, 60, 1, 67, 60, 47,

In [12]:
class BookDataset(Dataset):
    def __init__(self, tokenized_text, seq_length):
        self.tokenized_text = tokenized_text
        self.seq_length = seq_length

    def __len__(self):
        return len(self.tokenized_text) - self.seq_length
    
    def __getitem__(self, index):
        features = torch.tensor(self.tokenized_text[index:index + self.seq_length])
        labels = torch.tensor(self.tokenized_text[index + 1 : index + self.seq_length + 1])
        return features, labels
    
seq_length = 20
batch_size = 50

dataset = BookDataset(tokenized_id_text, seq_length)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle = True)

In [13]:
class CharacterLSTM(nn.Module):
    def __init__(self):
        super(CharacterLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=64)
        self.lstm = nn.LSTM(input_size=64, hidden_size=128, batch_first = True)
        self.linear = nn.Linear(128, vocab_size)

    def forward(self, x, states):
        x = self.embedding(x)
        out, states = self.lstm(x, states)
        out = self.linear(out)
        out = out.reshape(-1, out.size(2))
        return out, states
    
    def init_state(self, batch_size):
        hidden = torch.zeros(1, batch_size, 128)
        cell = torch.zeros(1, batch_size, 128)
        return hidden, cell
    
book_lstm = CharacterLSTM()

In [14]:
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(book_lstm.parameters(), lr = 0.01)

In [None]:
num_epochs = 10
for epoch in range(num_epochs):
    for features, labels in data_loader:
        optimizer.zero_grad()
        states = book_lstm.init_state(features.size(0))
        outputs, states = book_lstm(features, states)
        CEloss = loss(outputs, labels.view(-1))
        CEloss.backward()
        optimizer.step()

    print(f"Epoch: {epoch}/{num_epochs}, CE loss: {CEloss}")

Epoch: 0/30, CE loss: 1.6120966672897339
Epoch: 1/30, CE loss: 1.7034798860549927
Epoch: 2/30, CE loss: 1.6268138885498047
Epoch: 3/30, CE loss: 1.6464006900787354
Epoch: 4/30, CE loss: 1.666426181793213
Epoch: 5/30, CE loss: 1.6641098260879517
Epoch: 6/30, CE loss: 1.627328872680664
Epoch: 7/30, CE loss: 1.7594518661499023
Epoch: 8/30, CE loss: 1.6419677734375
Epoch: 9/30, CE loss: 1.6911346912384033
Epoch: 10/30, CE loss: 1.6986829042434692
Epoch: 11/30, CE loss: 1.6639600992202759
Epoch: 12/30, CE loss: 1.7090566158294678
Epoch: 13/30, CE loss: 1.7033449411392212
Epoch: 14/30, CE loss: 1.7034043073654175
Epoch: 15/30, CE loss: 1.683829426765442
Epoch: 16/30, CE loss: 1.6371747255325317
Epoch: 17/30, CE loss: 1.6812124252319336
Epoch: 18/30, CE loss: 1.7302536964416504
Epoch: 19/30, CE loss: 1.7205075025558472
Epoch: 20/30, CE loss: 1.6947753429412842
Epoch: 21/30, CE loss: 1.7100064754486084
Epoch: 22/30, CE loss: 1.7860043048858643
Epoch: 23/30, CE loss: 1.6208912134170532


KeyboardInterrupt: 

In [28]:
initial_prompt = "La gran sombra de el"
print(len(initial_prompt))

20


In [29]:
tokenized_id_prompt = torch.tensor([[char_to_id[ch] for ch in initial_prompt]])

def sample_with_temperature(logits, temperature=1.0):
    logits = logits / temperature
    probabilities = torch.softmax(logits, dim=-1)
    return torch.multinomial(probabilities, num_samples=1).item()

book_lstm.eval()
num_generated_chars = 300
with torch.no_grad():
    states = book_lstm.init_state(1)
    ## YOUR SOLUTION HERE ##
    for _ in range(num_generated_chars):
        output, states = book_lstm(tokenized_id_prompt, states)
        predicted_id = sample_with_temperature(output[-1], temperature=0.8)
        predicted_char = id_to_char[predicted_id]
        initial_prompt += predicted_char
        tokenized_id_prompt = torch.tensor([[predicted_id]])

In [30]:
print(initial_prompt)

La gran sombra de el caparda Grigoríﬁco nadánista de solo manos.
A él recuados, primero, con las fueron de la acon el con un etiene que
mando se por el cria con gatando contesta lo con un sacriﬁcado, morras. Es puede la hermina de
golpe de te ventara.
—No se hembra carne de la otra años en entes. Le mejor
Una despalabr
