# LSTM neural network trained on the book "Cadaver Exquisito" by Agustina Bazterrica

In [1]:
!pip3 install torch
!pip3 install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import torch 
import numpy as np
import torch.nn as nn
import fitz
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
torch.manual_seed(6)

<torch._C.Generator at 0x10c42e8d0>

In [3]:
tokenized_text = []
f = fitz.open("Cadaver_exquisito_Agustina_Maria_Bazterrica.pdf")
for page in f:
    tokenized_text += list(page.get_text())
tokenized_text = tokenized_text[:20000]
print(tokenized_text[:20])
print(len(tokenized_text))


['L', 'a', ' ', 's', 'ú', 'b', 'i', 't', 'a', ' ', 'a', 'p', 'a', 'r', 'i', 'c', 'i', 'ó', 'n', ' ']
20000


In [4]:
unique_tokens = sorted(list(set(tokenized_text)))
vocab_size = len(unique_tokens)
print(unique_tokens)
print(vocab_size)

['\n', ' ', ',', '.', '0', '1', '2', '3', '6', '7', '8', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '«', '»', '¿', 'Á', 'É', 'á', 'é', 'í', 'ñ', 'ó', 'ú', '—', '…', 'ﬁ', 'ﬂ']
77


In [5]:
char_to_id = {char: id for id, char in enumerate(unique_tokens)}
print(char_to_id)

{'\n': 0, ' ': 1, ',': 2, '.': 3, '0': 4, '1': 5, '2': 6, '3': 7, '6': 8, '7': 9, '8': 10, ':': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'Y': 35, 'Z': 36, 'a': 37, 'b': 38, 'c': 39, 'd': 40, 'e': 41, 'f': 42, 'g': 43, 'h': 44, 'i': 45, 'j': 46, 'k': 47, 'l': 48, 'm': 49, 'n': 50, 'o': 51, 'p': 52, 'q': 53, 'r': 54, 's': 55, 't': 56, 'u': 57, 'v': 58, 'x': 59, 'y': 60, 'z': 61, '«': 62, '»': 63, '¿': 64, 'Á': 65, 'É': 66, 'á': 67, 'é': 68, 'í': 69, 'ñ': 70, 'ó': 71, 'ú': 72, '—': 73, '…': 74, 'ﬁ': 75, 'ﬂ': 76}


In [6]:
id_to_char = {id: char for char, id in char_to_id.items()}
print(id_to_char)

{0: '\n', 1: ' ', 2: ',', 3: '.', 4: '0', 5: '1', 6: '2', 7: '3', 8: '6', 9: '7', 10: '8', 11: ':', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'Y', 36: 'Z', 37: 'a', 38: 'b', 39: 'c', 40: 'd', 41: 'e', 42: 'f', 43: 'g', 44: 'h', 45: 'i', 46: 'j', 47: 'k', 48: 'l', 49: 'm', 50: 'n', 51: 'o', 52: 'p', 53: 'q', 54: 'r', 55: 's', 56: 't', 57: 'u', 58: 'v', 59: 'x', 60: 'y', 61: 'z', 62: '«', 63: '»', 64: '¿', 65: 'Á', 66: 'É', 67: 'á', 68: 'é', 69: 'í', 70: 'ñ', 71: 'ó', 72: 'ú', 73: '—', 74: '…', 75: 'ﬁ', 76: 'ﬂ'}


In [7]:
tokenized_id_text = [char_to_id[i] for i in tokenized_text]
print(tokenized_id_text)

[24, 37, 1, 55, 72, 38, 45, 56, 37, 1, 37, 52, 37, 54, 45, 39, 45, 71, 50, 1, 40, 41, 1, 57, 50, 1, 58, 45, 54, 57, 55, 1, 48, 41, 56, 37, 48, 1, 53, 57, 41, 1, 37, 56, 37, 39, 37, 1, 37, 1, 48, 51, 55, 0, 37, 50, 45, 49, 37, 48, 41, 55, 1, 49, 51, 40, 45, 75, 39, 37, 1, 40, 41, 1, 49, 37, 50, 41, 54, 37, 1, 45, 54, 54, 41, 58, 41, 54, 55, 45, 38, 48, 41, 1, 41, 48, 1, 49, 57, 50, 40, 51, 11, 1, 40, 41, 55, 40, 41, 0, 48, 37, 55, 1, 75, 41, 54, 37, 55, 1, 44, 37, 55, 56, 37, 1, 48, 37, 55, 1, 49, 37, 55, 39, 51, 56, 37, 55, 1, 40, 41, 38, 41, 50, 1, 55, 41, 54, 1, 55, 45, 55, 56, 41, 49, 67, 56, 45, 39, 37, 49, 41, 50, 56, 41, 0, 55, 37, 39, 54, 45, 75, 39, 37, 40, 37, 55, 2, 1, 60, 1, 55, 57, 1, 39, 37, 54, 50, 41, 1, 60, 37, 1, 50, 51, 1, 52, 57, 41, 40, 41, 1, 55, 41, 54, 1, 39, 51, 50, 55, 57, 49, 45, 40, 37, 3, 1, 24, 51, 55, 0, 43, 51, 38, 45, 41, 54, 50, 51, 55, 1, 41, 50, 42, 54, 41, 50, 56, 37, 50, 1, 48, 37, 1, 55, 45, 56, 57, 37, 39, 45, 71, 50, 1, 39, 51, 50, 1, 57, 50, 37,

In [8]:
class BookDataset(Dataset):
    def __init__(self, tokenized_text, seq_length):
        self.tokenized_text = tokenized_text
        self.seq_length = seq_length

    def __len__(self):
        return len(self.tokenized_text) - self.seq_length
    
    def __getitem__(self, index):
        features = torch.tensor(self.tokenized_text[index:index + self.seq_length])
        labels = torch.tensor(self.tokenized_text[index + 1 : index + self.seq_length + 1])
        return features, labels
    
seq_length = 20
batch_size = 50

dataset = BookDataset(tokenized_id_text, seq_length)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle = True)

In [9]:
class CharacterLSTM(nn.Module):
    def __init__(self):
        super(CharacterLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=64)
        self.lstm = nn.LSTM(input_size=64, hidden_size=128, batch_first = True)
        self.linear = nn.Linear(128, vocab_size)

    def forward(self, x, states):
        x = self.embedding(x)
        out, states = self.lstm(x, states)
        out = self.linear(out)
        out = out.reshape(-1, out.size(2))
        return out, states
    
    def init_state(self, batch_size):
        hidden = torch.zeros(1, batch_size, 128)
        cell = torch.zeros(1, batch_size, 128)
        return hidden, cell
    
book_lstm = CharacterLSTM()

In [10]:
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(book_lstm.parameters(), lr = 0.0005)

In [11]:
num_epochs = 30
for epoch in range(num_epochs):
    for features, labels in data_loader:
        optimizer.zero_grad()
        states = book_lstm.init_state(features.size(0))
        outputs, states = book_lstm(features, states)
        CEloss = loss(outputs, labels.view(-1))
        CEloss.backward()
        optimizer.step()

    if (epoch + 1) % 1 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], CELoss: {CEloss.item():.4f}')

Epoch [1/30], CELoss: 2.2728
Epoch [2/30], CELoss: 2.1101
Epoch [3/30], CELoss: 1.8421
Epoch [4/30], CELoss: 1.8919
Epoch [5/30], CELoss: 1.6957
Epoch [6/30], CELoss: 1.5223
Epoch [7/30], CELoss: 1.5372
Epoch [8/30], CELoss: 1.5085
Epoch [9/30], CELoss: 1.3888
Epoch [10/30], CELoss: 1.3007
Epoch [11/30], CELoss: 1.2957
Epoch [12/30], CELoss: 1.2419
Epoch [13/30], CELoss: 1.1747
Epoch [14/30], CELoss: 1.1298
Epoch [15/30], CELoss: 1.0872
Epoch [16/30], CELoss: 1.0341
Epoch [17/30], CELoss: 1.0575
Epoch [18/30], CELoss: 1.0051
Epoch [19/30], CELoss: 0.9406
Epoch [20/30], CELoss: 0.9435
Epoch [21/30], CELoss: 0.9300
Epoch [22/30], CELoss: 0.8576
Epoch [23/30], CELoss: 0.8814
Epoch [24/30], CELoss: 0.8829
Epoch [25/30], CELoss: 0.8214
Epoch [26/30], CELoss: 0.7860
Epoch [27/30], CELoss: 0.8313
Epoch [28/30], CELoss: 0.8283
Epoch [29/30], CELoss: 0.7722
Epoch [30/30], CELoss: 0.7676


In [21]:
initial_prompt = "La súbita aparición "
print(len(initial_prompt))

20


In [22]:
tokenized_id_prompt = torch.tensor([[char_to_id[ch] for ch in initial_prompt]])

def sample_with_temperature(logits, temperature=1.0):
    logits = logits / temperature
    probabilities = torch.softmax(logits, dim=-1)
    return torch.multinomial(probabilities, num_samples=1).item()

book_lstm.eval()
num_generated_chars = 300
with torch.no_grad():
    states = book_lstm.init_state(1)
    ## YOUR SOLUTION HERE ##
    for i in range(num_generated_chars):
        output, states = book_lstm(tokenized_id_prompt, states)
        
        predicted_id = torch.argmax(output[-1, :], dim=-1).item()

        predicted_char = id_to_char[predicted_id]
        initial_prompt += predicted_char
        tokenized_id_prompt = torch.tensor([[predicted_id]])

In [23]:
print(initial_prompt)

La súbita aparición de la piel, pero no puede descuando están los muertos que producto, que tiene que padre lo recuerdos están acostumbrados al descarne perdido eses
enconcierto y a la curtiembre y siente alimento está desprevenido. Se
despierta con una palabra que no puede descuartizados y asostumpleados le comen la s


In [19]:
tokenized_id_prompt = torch.tensor([[char_to_id[ch] for ch in initial_prompt]])

def sample_with_temperature(logits, temperature=1.0):
    logits = logits / temperature
    probabilities = torch.softmax(logits, dim=-1)
    return torch.multinomial(probabilities, num_samples=1).item()

book_lstm.eval()
num_generated_chars = 300
with torch.no_grad():
    states = book_lstm.init_state(1)
    ## YOUR SOLUTION HERE ##
    for i in range(num_generated_chars):
        output, states = book_lstm(tokenized_id_prompt, states)
        
        if(i%4):
            predicted_id = sample_with_temperature(output[-1], temperature=0.8)
        else:
            predicted_id = torch.argmax(output[-1, :], dim=-1).item()

        predicted_char = id_to_char[predicted_id]
        initial_prompt += predicted_char
        tokenized_id_prompt = torch.tensor([[predicted_id]])

In [20]:
print(initial_prompt)

La súbita aparición de la pobrebro que no se puede discursos y,
que las proteínas vegetal ainesta de la número de conque, en el recorrimar con pelo. Después ya se sechesaria la sice: «Yo le dos una después de la GGB el
mundo en el criadero Guerrero Iraola,
pero se minaron de
valz del discorque son comer y su cuando los
