In [1]:
# Mount Google Drive inside the Colab filesystem; the corpus is copied from it in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the corpus from the mounted Drive to /content/data.txt (later opened as "data.txt";
# presumably the kernel's working directory is /content -- confirm).
!cp -av '/content/drive/MyDrive/inteligencia2/segundoParcial/cuentos.txt' '/content/data.txt'

'/content/drive/MyDrive/inteligencia2/segundoParcial/cuentos.txt' -> '/content/data.txt'


In [3]:
# Read the whole corpus into memory. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open for the lifetime of the kernel.
with open("data.txt", "r", encoding='utf-8') as f:
    text = f.read()

# Preview the first 300 characters together with the total character count.
text[:300], len(text)

('El desayuno de Laura\nA las ocho de la mañana la mamá de Laura ya se ha tomado su café con tostadas. Es hora de despertar a su hija o se hará tarde. Casi a oscuras, se acerca a la pequeña cama de madera y busca su carita bajo el edredón para darle un beso de buenos días.\nLaura se despereza, se pone s',
 80375)

In [17]:
import string

# NOTE(review): reduced vocabulary -- digits, a subset of unaccented letters, space and
# newline. Letters such as 'k', 'w', 'x', 'z', all punctuation and all accented characters
# are absent. The following cell reassigns `all_characters`, so this value only takes
# effect if this cell happens to be the last of the two to run.
all_characters = '0123456789abcdefghijlmnopqrstuvyABCDEFGHIJLMNOPRSTUVY \n'
all_characters

'0123456789abcdefghijlmnopqrstuvyABCDEFGHIJLMNOPRSTUVY \n'

In [18]:
import string

# Vocabulary: string.printable followed by the Spanish-specific characters (ñ, accented
# vowels, ¿ and ¡). A character's position in this string is the id the Tokenizer gives it;
# characters that do not appear here are skipped when text is encoded.
all_characters = string.printable + "ñÑáÁéÉíÍóÓúÚ¿¡"
all_characters

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0cñÑáÁéÉíÍóÓúÚ¿¡'

In [19]:
import string

class Tokenizer():
  """Character-level tokenizer: a character's id is its position in a fixed vocabulary.

  Attributes:
    all_characters: the vocabulary (a sequence of single characters).
    n_characters: number of entries in the vocabulary.

  The vocabulary is captured at construction time; the internal lookup table is
  not refreshed if `all_characters` is reassigned afterwards.
  """

  def __init__(self, characters=None):
    """Create a tokenizer.

    Args:
      characters: vocabulary to use. When omitted, the notebook-level
        `all_characters` variable is used, as before.
    """
    self.all_characters = all_characters if characters is None else characters
    self.n_characters = len(self.all_characters)
    # Character -> id table for O(1) encoding. setdefault keeps the FIRST position
    # of a repeated character, which is what str.index would return.
    self._char_to_ix = {}
    for ix, ch in enumerate(self.all_characters):
      self._char_to_ix.setdefault(ch, ix)

  def text_to_seq(self, string):
    """Encode `string` as a list of vocabulary ids.

    Characters that are not in the vocabulary are skipped, so the result may be
    shorter than the input. Only the "unknown character" case is skipped; any
    other error propagates instead of being silently swallowed.
    """
    lookup = self._char_to_ix
    return [lookup[ch] for ch in string if ch in lookup]

  def seq_to_text(self, seq):
    """Decode an iterable of vocabulary ids back into a string.

    Raises:
      IndexError: if an id lies outside the vocabulary.
    """
    return ''.join(self.all_characters[ix] for ix in seq)

# Build the tokenizer shared by the rest of the notebook and show its vocabulary size.
tokenizer = Tokenizer()
tokenizer.n_characters

114

In [20]:
# Sanity check: encode a short word into vocabulary ids.
tokenizer.text_to_seq('fin')

[15, 18, 23]

In [21]:
# Sanity check: decoding those ids gives the word back.
tokenizer.seq_to_text([15, 18, 23])

'fin'

In [22]:
# Encode the whole corpus. Characters missing from the vocabulary are skipped, so
# text_encoded can be shorter than text.
text_encoded = tokenizer.text_to_seq(text)

In [23]:
# Keep the first 80% of the encoded corpus for training and hold out the remaining tail
# for validation (integer arithmetic, no shuffling).
train_size = (80 * len(text_encoded)) // 100
train, test = text_encoded[:train_size], text_encoded[train_size:]

len(train), len(test)

(64167, 16042)

In [24]:
import random

def windows(text, window_size = 100):
    """Return every contiguous slice of `text` holding `window_size + 1` items.

    Slices start at consecutive positions (stride 1). Each slice carries
    `window_size` input items followed by the item that comes right after them.
    A `text` no longer than `window_size` produces an empty list.
    """
    n_windows = len(text) - window_size
    return [text[first:first + window_size + 1] for first in range(n_windows)]

# Sliding windows over the full encoded corpus; in the cells shown here they are only
# used to preview a few samples (training uses per-split windows instead).
text_encoded_windows = windows(text_encoded)

In [25]:
# Preview the first three windows, separated by blank lines; each one is the previous
# window shifted forward by a single character.
print("\n\n".join(tokenizer.seq_to_text(text_encoded_windows[k]) for k in range(3)))

El desayuno de Laura
A las ocho de la mañana la mamá de Laura ya se ha tomado su café con tostadas. E

l desayuno de Laura
A las ocho de la mañana la mamá de Laura ya se ha tomado su café con tostadas. Es

 desayuno de Laura
A las ocho de la mañana la mamá de Laura ya se ha tomado su café con tostadas. Es 


In [26]:
import torch

class CharRNNDataset(torch.utils.data.Dataset):
  """Dataset over pre-computed windows of encoded characters.

  With `train=True` (the default) an item is an `(inputs, target)` pair: every
  element of the window except the last, and the last element on its own.
  With `train=False` an item is the whole window as a single tensor.
  """

  def __init__(self, text_encoded_windows, train=True):
    self.text = text_encoded_windows
    self.train = train

  def __len__(self):
    return len(self.text)

  def __getitem__(self, ix):
    window = self.text[ix]
    if not self.train:
      return torch.tensor(window)
    # Split the window into the context and the element to be predicted.
    context, target = window[:-1], window[-1]
    return torch.tensor(context), torch.tensor(target)

In [27]:
# Window each split separately, so no window mixes training and validation characters.
train_text_encoded_windows = windows(train)
test_text_encoded_windows = windows(test)

# Both datasets use the default train=True, i.e. they yield (input window, next character) pairs.
dataset = {
    'train': CharRNNDataset(train_text_encoded_windows),
    'val': CharRNNDataset(test_text_encoded_windows)
}

# Only the training loader is shuffled; the validation loader keeps corpus order and
# uses a larger batch size.
dataloader = {
    'train': torch.utils.data.DataLoader(dataset['train'], batch_size=512, shuffle=True, pin_memory=True),
    'val': torch.utils.data.DataLoader(dataset['val'], batch_size=2048, shuffle=False, pin_memory=True),
}

len(dataset['train']), len(dataset['val'])

(64067, 15942)

In [29]:
# Inspect the first training sample. The input window is bound to `sample_input` rather
# than `input`, so the built-in input() is not shadowed for the rest of the kernel session.
sample_input, output = dataset['train'][0]
tokenizer.seq_to_text(sample_input)

'El desayuno de Laura\nA las ocho de la mañana la mamá de Laura ya se ha tomado su café con tostadas. '

In [30]:
# Decode the target of that sample: the single character that follows the input window.
tokenizer.seq_to_text([output])

'E'

In [31]:
class CharRNN(torch.nn.Module):
  """Next-character model: embedding -> multi-layer LSTM -> linear layer.

  Args:
    input_size: vocabulary size; also the number of output scores.
    embedding_size: width of the character embeddings.
    hidden_size: LSTM hidden state width.
    num_layers: number of stacked LSTM layers.
    dropout: dropout passed to the LSTM.
  """

  def __init__(self, input_size, embedding_size=128, hidden_size=256, num_layers=3, dropout=0.2):
    super().__init__()
    self.encoder = torch.nn.Embedding(input_size, embedding_size)
    self.rnn = torch.nn.LSTM(
      input_size=embedding_size,
      hidden_size=hidden_size,
      num_layers=num_layers,
      dropout=dropout,
      batch_first=True,
    )
    self.fc = torch.nn.Linear(hidden_size, input_size)

  def forward(self, x):
    """Map a (batch, sequence) tensor of character ids to (batch, input_size) scores.

    Only the LSTM output at the final time step is passed to the linear layer.
    """
    embedded = self.encoder(x)
    per_step, _ = self.rnn(embedded)
    last_step = per_step[:, -1, :]
    return self.fc(last_step)

In [32]:
# Smoke test: run an untrained model on a random batch (64 sequences of 50 character ids)
# and check that it produces one score per vocabulary entry for each sequence.
model = CharRNN(input_size=tokenizer.n_characters)
outputs = model(torch.randint(0, tokenizer.n_characters, (64, 50)))
outputs.shape

torch.Size([64, 114])

In [33]:
from tqdm import tqdm
import numpy as np

# Use the GPU when CUDA is available, otherwise fall back to the CPU; fit() and predict() read this global.
device = "cuda" if torch.cuda.is_available() else "cpu"

def fit(model, dataloader, epochs=10):
    """Train `model` with Adam (lr=1e-3) and cross-entropy, validating after each epoch.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        dataloader: mapping with 'train' and 'val' entries, each yielding (X, y) batches.
        epochs: number of passes over the training loader.

    The model is moved to the notebook-level `device`. Running mean losses are shown
    on tqdm progress bars and a summary line is printed per epoch. Returns None.
    """
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(1, epochs+1):
        # Training pass: one optimizer step per batch.
        model.train()
        train_loss = []
        progress = tqdm(dataloader['train'])
        for inputs, targets in progress:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            train_loss.append(batch_loss.item())
            progress.set_description(f"loss {np.mean(train_loss):.5f}")

        # Validation pass: eval mode, no gradient tracking.
        progress = tqdm(dataloader['val'])
        val_loss = []
        model.eval()
        with torch.no_grad():
            for inputs, targets in progress:
                inputs = inputs.to(device)
                targets = targets.to(device)
                batch_loss = criterion(model(inputs), targets)
                val_loss.append(batch_loss.item())
                progress.set_description(f"val_loss {np.mean(val_loss):.5f}")

        print(f"Epoch {epoch}/{epochs} loss {np.mean(train_loss):.5f} val_loss {np.mean(val_loss):.5f}")

def predict(model, X):
    """Run `model` on a single sequence and return its raw output.

    `X` is converted to a tensor, moved to the notebook-level `device` and given a
    leading batch dimension of size 1. The model is put in eval mode and gradients
    are not tracked.
    """
    model.eval()
    with torch.no_grad():
        single_batch = torch.tensor(X).to(device).unsqueeze(0)
        return model(single_batch)

In [34]:
# Start from a freshly initialised model (replacing the instance created earlier) and
# train it for 30 epochs.
model = CharRNN(input_size=tokenizer.n_characters)
fit(model, dataloader, epochs=30)

loss 3.12809: 100%|██████████| 126/126 [00:18<00:00,  6.98it/s]
val_loss 3.14572: 100%|██████████| 8/8 [00:01<00:00,  5.28it/s]


Epoch 1/30 loss 3.12809 val_loss 3.14572


loss 2.78741: 100%|██████████| 126/126 [00:18<00:00,  6.86it/s]
val_loss 2.56107: 100%|██████████| 8/8 [00:01<00:00,  5.22it/s]


Epoch 2/30 loss 2.78741 val_loss 2.56107


loss 2.36739: 100%|██████████| 126/126 [00:19<00:00,  6.59it/s]
val_loss 2.40942: 100%|██████████| 8/8 [00:01<00:00,  5.07it/s]


Epoch 3/30 loss 2.36739 val_loss 2.40942


loss 2.22052: 100%|██████████| 126/126 [00:19<00:00,  6.39it/s]
val_loss 2.26100: 100%|██████████| 8/8 [00:01<00:00,  4.93it/s]


Epoch 4/30 loss 2.22052 val_loss 2.26100


loss 2.11505: 100%|██████████| 126/126 [00:20<00:00,  6.25it/s]
val_loss 2.17516: 100%|██████████| 8/8 [00:01<00:00,  4.40it/s]


Epoch 5/30 loss 2.11505 val_loss 2.17516


loss 2.03642: 100%|██████████| 126/126 [00:20<00:00,  6.15it/s]
val_loss 2.11420: 100%|██████████| 8/8 [00:01<00:00,  4.41it/s]


Epoch 6/30 loss 2.03642 val_loss 2.11420


loss 1.97136: 100%|██████████| 126/126 [00:20<00:00,  6.28it/s]
val_loss 2.06971: 100%|██████████| 8/8 [00:01<00:00,  4.92it/s]


Epoch 7/30 loss 1.97136 val_loss 2.06971


loss 1.91414: 100%|██████████| 126/126 [00:20<00:00,  6.30it/s]
val_loss 2.02717: 100%|██████████| 8/8 [00:01<00:00,  4.44it/s]


Epoch 8/30 loss 1.91414 val_loss 2.02717


loss 1.86647: 100%|██████████| 126/126 [00:20<00:00,  6.25it/s]
val_loss 1.99216: 100%|██████████| 8/8 [00:01<00:00,  4.84it/s]


Epoch 9/30 loss 1.86647 val_loss 1.99216


loss 1.82188: 100%|██████████| 126/126 [00:20<00:00,  6.20it/s]
val_loss 1.96462: 100%|██████████| 8/8 [00:01<00:00,  4.86it/s]


Epoch 10/30 loss 1.82188 val_loss 1.96462


loss 1.77419: 100%|██████████| 126/126 [00:20<00:00,  6.27it/s]
val_loss 1.95673: 100%|██████████| 8/8 [00:01<00:00,  4.48it/s]


Epoch 11/30 loss 1.77419 val_loss 1.95673


loss 1.73782: 100%|██████████| 126/126 [00:20<00:00,  6.28it/s]
val_loss 1.93052: 100%|██████████| 8/8 [00:01<00:00,  4.86it/s]


Epoch 12/30 loss 1.73782 val_loss 1.93052


loss 1.69950: 100%|██████████| 126/126 [00:20<00:00,  6.20it/s]
val_loss 1.90215: 100%|██████████| 8/8 [00:01<00:00,  4.89it/s]


Epoch 13/30 loss 1.69950 val_loss 1.90215


loss 1.66300: 100%|██████████| 126/126 [00:20<00:00,  6.22it/s]
val_loss 1.88532: 100%|██████████| 8/8 [00:01<00:00,  4.87it/s]


Epoch 14/30 loss 1.66300 val_loss 1.88532


loss 1.63294: 100%|██████████| 126/126 [00:20<00:00,  6.20it/s]
val_loss 1.87641: 100%|██████████| 8/8 [00:01<00:00,  4.86it/s]


Epoch 15/30 loss 1.63294 val_loss 1.87641


loss 1.59798: 100%|██████████| 126/126 [00:20<00:00,  6.26it/s]
val_loss 1.85702: 100%|██████████| 8/8 [00:01<00:00,  4.85it/s]


Epoch 16/30 loss 1.59798 val_loss 1.85702


loss 1.56585: 100%|██████████| 126/126 [00:20<00:00,  6.25it/s]
val_loss 1.84840: 100%|██████████| 8/8 [00:01<00:00,  4.19it/s]


Epoch 17/30 loss 1.56585 val_loss 1.84840


loss 1.53594: 100%|██████████| 126/126 [00:20<00:00,  6.24it/s]
val_loss 1.84239: 100%|██████████| 8/8 [00:01<00:00,  4.84it/s]


Epoch 18/30 loss 1.53594 val_loss 1.84239


loss 1.50861: 100%|██████████| 126/126 [00:20<00:00,  6.20it/s]
val_loss 1.84223: 100%|██████████| 8/8 [00:01<00:00,  4.88it/s]


Epoch 19/30 loss 1.50861 val_loss 1.84223


loss 1.48450: 100%|██████████| 126/126 [00:20<00:00,  6.27it/s]
val_loss 1.82842: 100%|██████████| 8/8 [00:01<00:00,  4.40it/s]


Epoch 20/30 loss 1.48450 val_loss 1.82842


loss 1.45961: 100%|██████████| 126/126 [00:20<00:00,  6.27it/s]
val_loss 1.83265: 100%|██████████| 8/8 [00:01<00:00,  4.40it/s]


Epoch 21/30 loss 1.45961 val_loss 1.83265


loss 1.43626: 100%|██████████| 126/126 [00:20<00:00,  6.28it/s]
val_loss 1.82130: 100%|██████████| 8/8 [00:01<00:00,  4.85it/s]


Epoch 22/30 loss 1.43626 val_loss 1.82130


loss 1.41187: 100%|██████████| 126/126 [00:20<00:00,  6.22it/s]
val_loss 1.81071: 100%|██████████| 8/8 [00:01<00:00,  4.86it/s]


Epoch 23/30 loss 1.41187 val_loss 1.81071


loss 1.38968: 100%|██████████| 126/126 [00:20<00:00,  6.27it/s]
val_loss 1.82780: 100%|██████████| 8/8 [00:01<00:00,  4.39it/s]


Epoch 24/30 loss 1.38968 val_loss 1.82780


loss 1.36753: 100%|██████████| 126/126 [00:20<00:00,  6.28it/s]
val_loss 1.81669: 100%|██████████| 8/8 [00:01<00:00,  4.89it/s]


Epoch 25/30 loss 1.36753 val_loss 1.81669


loss 1.34916: 100%|██████████| 126/126 [00:20<00:00,  6.22it/s]
val_loss 1.81267: 100%|██████████| 8/8 [00:01<00:00,  4.83it/s]


Epoch 26/30 loss 1.34916 val_loss 1.81267


loss 1.32751: 100%|██████████| 126/126 [00:20<00:00,  6.29it/s]
val_loss 1.81420: 100%|██████████| 8/8 [00:01<00:00,  4.88it/s]


Epoch 27/30 loss 1.32751 val_loss 1.81420


loss 1.30651: 100%|██████████| 126/126 [00:20<00:00,  6.22it/s]
val_loss 1.81741: 100%|██████████| 8/8 [00:01<00:00,  4.79it/s]


Epoch 28/30 loss 1.30651 val_loss 1.81741


loss 1.28990: 100%|██████████| 126/126 [00:20<00:00,  6.16it/s]
val_loss 1.82111: 100%|██████████| 8/8 [00:01<00:00,  4.85it/s]


Epoch 29/30 loss 1.28990 val_loss 1.82111


loss 1.26859: 100%|██████████| 126/126 [00:20<00:00,  6.30it/s]
val_loss 1.81839: 100%|██████████| 8/8 [00:01<00:00,  4.41it/s]

Epoch 30/30 loss 1.26859 val_loss 1.81839





In [35]:
# Greedy single-step prediction: encode a seed phrase and take the highest-scoring next character.
X_new = "eraze una vez"
X_new_encoded = tokenizer.text_to_seq(X_new)
y_pred = predict(model, X_new_encoded)
# predict returns one row of scores for the single input sequence; argmax over axis 1
# picks the index of the best-scoring character in that row.
y_pred = torch.argmax(y_pred, axis=1)[0].item()
tokenizer.seq_to_text([y_pred])

' '

In [36]:
# Generate 1000 more characters one at a time, feeding the last 100 characters of the
# text so far back into the model (100 is the default window size used by windows()).
# NOTE: X_new is extended in place, so re-running this cell continues from the text
# already generated rather than from the original seed.
temp=1  # sampling temperature: the scores are divided by it before sampling
for i in range(1000):
  X_new_encoded = tokenizer.text_to_seq(X_new[-100:])
  y_pred = predict(model, X_new_encoded)
  # softmax(scores / temp) is the normalised form of exp(scores / temp), i.e. the same
  # sampling distribution, but it is computed stably: a small `temp` can no longer
  # overflow exp() to inf, which torch.multinomial rejects.
  y_pred = torch.softmax(y_pred.view(-1) / temp, dim=0)
  top_i = torch.multinomial(y_pred, 1)[0]
  predicted_char = tokenizer.all_characters[top_i]
  X_new += predicted_char

print(X_new)

eraze una vez candero del puercoza de tu mentido días y la preparablen opes.
Dendre de sus casa duerde quíullaso
Si oz, gusta y tanto que se todas la madres que sea que llegar y se puso desearmir y ciullás todo y recipionado, al obre llamado, llego con fuerte! Cuando el cuento al Avilno Cerito, profundo que se muñve. Si paba se llono había citaban tan conersente, un papa tracias que los pojor una noble al de que nos lo expertente entanla a la respondió en la grampaga Naparita a béczar le había quería comparten tan ayuda.
Yo este píbnto si puedes cultidad de ocalor, hermosa día tiempo, prajumtar a su solorín conajo y Pero no era una partipa, peraos y sabes el tosques a la lleto para fueras y dos muy muchos porque nos alomajos en su papa de transfordad de su salva, pero ban su paboque, y pues Mabrando.
Madora se mucho cosas porque es caquitor fuel y pompor callé: Si cabajado es muy buen, para cieron muy muy conperterana, los zapatillas en la jufaba peraomrocha, se nuego es una calambrida