In [None]:
import numpy as np
from nltk import ngrams
from collections import Counter, defaultdict

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
CORPORA_DIR = "drive/MyDrive/Quijote.txt"

corpus_sent = []
with open(CORPORA_DIR, 'r') as file:
    for line in file:
        corpus_sent.append(line)

In [None]:
corpus_sent[:10]

['\ufeffThe Project Gutenberg eBook of Don Quijote\n',
 '    \n',
 'This ebook is for the use of anyone anywhere in the United States and\n',
 'most other parts of the world at no cost and with almost no restrictions\n',
 'whatsoever. You may copy it, give it away or re-use it under the terms\n',
 'of the Project Gutenberg License included with this ebook or online\n',
 'at www.gutenberg.org. If you are not located in the United States,\n',
 'you will have to check the laws of the country where you are located\n',
 'before using this eBook.\n',
 '\n']

#Preprocesamiento
Queremos quitar la mayor parte de líneas en blanco y el header que está en inglés

In [None]:
def clean_corpus(sent_corpus : list[str]):
  '''
  Quitamos los espacios en blanco y líneas que solo es \n
  '''
  clean = []
  for sent in sent_corpus:
    if ((sent != '\n') & (sent != '  \n')):
      clean.append(sent)

  return clean

In [None]:
corpus = clean_corpus(corpus_sent)
corpus[:16]

['\ufeffThe Project Gutenberg eBook of Don Quijote\n',
 '    \n',
 'This ebook is for the use of anyone anywhere in the United States and\n',
 'most other parts of the world at no cost and with almost no restrictions\n',
 'whatsoever. You may copy it, give it away or re-use it under the terms\n',
 'of the Project Gutenberg License included with this ebook or online\n',
 'at www.gutenberg.org. If you are not located in the United States,\n',
 'you will have to check the laws of the country where you are located\n',
 'before using this eBook.\n',
 'Title: Don Quijote\n',
 'Author: Miguel de Cervantes Saavedra\n',
 'Release date: December 1, 1999 [eBook #2000]\n',
 '                Most recently updated: January 17, 2021\n',
 'Language: Spanish\n',
 'Credits: an anonymous Project Gutenberg volunteer and Joaquin Cuenca Abela\n',
 '*** START OF THE PROJECT GUTENBERG EBOOK DON QUIJOTE ***\n']

In [None]:
indexes = list(range(16))
quijote_sents = np.delete(corpus, indexes)
quijote_sents[:10]
##Quitamos el header del documento para trabajar sólo con el texto del quijote

array(['El ingenioso hidalgo don Quijote de la Mancha\n',
       'por Miguel de Cervantes Saavedra\n',
       'El ingenioso hidalgo don Quijote de la Mancha\n', 'Tasa\n',
       'Testimonio de las erratas\n', 'El Rey\n', 'Al Duque de Béjar\n',
       'Prólogo\n', 'Al libro de don Quijote de la Mancha\n',
       'Que trata de la condición y ejercicio del famoso\n'], dtype='<U79')

In [None]:
def sentences_to_words(sent_corpus : list[str]) -> list[str]:
  new_corpus = []
  for sent in sent_corpus:
    new_corpus.append([word.strip() for word in sent.split(' ')])

  return new_corpus

In [None]:
quijote_corpus = sentences_to_words(quijote_sents)

In [None]:
quijote_corpus[:10]

[['El', 'ingenioso', 'hidalgo', 'don', 'Quijote', 'de', 'la', 'Mancha'],
 ['por', 'Miguel', 'de', 'Cervantes', 'Saavedra'],
 ['El', 'ingenioso', 'hidalgo', 'don', 'Quijote', 'de', 'la', 'Mancha'],
 ['Tasa'],
 ['Testimonio', 'de', 'las', 'erratas'],
 ['El', 'Rey'],
 ['Al', 'Duque', 'de', 'Béjar'],
 ['Prólogo'],
 ['Al', 'libro', 'de', 'don', 'Quijote', 'de', 'la', 'Mancha'],
 ['Que', 'trata', 'de', 'la', 'condición', 'y', 'ejercicio', 'del', 'famoso']]

Además, tenemos que agregar tokens EOS, BS y normalizar a minúsculas

In [None]:
import re
def preprocess_corpus(corpus: list[str]) -> list[str]:
    """Función de preprocesamiento

    Agrega tokens de inicio y fin, normaliza todo a minusculas
    """
    preprocessed_corpus = []
    for sent in corpus:
        result = [word.lower() for word in sent]
        # Al final de la oración
        result.append("<EOS>")
        result.insert(0, "<BOS>")
        preprocessed_corpus.append(result)
    return preprocessed_corpus

In [None]:
def get_words_freqs(corpus: list[list[str]]):
    words_freqs = {}
    for sentence in corpus:
        for word in sentence:
            words_freqs[word] = words_freqs.get(word, 0) + 1
    return words_freqs

In [None]:
UNK_LABEL = "<UNK>"
def get_words_indexes(words_freqs: dict) -> dict:
    result = {}
    for idx, word in enumerate(words_freqs.keys()):
        # Happax legomena happends
        if words_freqs[word] == 1:
            # Temp index for unknowns
            result[UNK_LABEL] = len(words_freqs)
        else:
            result[word] = idx

    return {word: idx for idx, word in enumerate(result.keys())}, {idx: word for idx, word in enumerate(result.keys())}

In [None]:
quijote_processed = preprocess_corpus(quijote_corpus)

In [None]:
quijote_processed[:10]

[['<BOS>',
  'el',
  'ingenioso',
  'hidalgo',
  'don',
  'quijote',
  'de',
  'la',
  'mancha',
  '<EOS>'],
 ['<BOS>', 'por', 'miguel', 'de', 'cervantes', 'saavedra', '<EOS>'],
 ['<BOS>',
  'el',
  'ingenioso',
  'hidalgo',
  'don',
  'quijote',
  'de',
  'la',
  'mancha',
  '<EOS>'],
 ['<BOS>', 'tasa', '<EOS>'],
 ['<BOS>', 'testimonio', 'de', 'las', 'erratas', '<EOS>'],
 ['<BOS>', 'el', 'rey', '<EOS>'],
 ['<BOS>', 'al', 'duque', 'de', 'béjar', '<EOS>'],
 ['<BOS>', 'prólogo', '<EOS>'],
 ['<BOS>',
  'al',
  'libro',
  'de',
  'don',
  'quijote',
  'de',
  'la',
  'mancha',
  '<EOS>'],
 ['<BOS>',
  'que',
  'trata',
  'de',
  'la',
  'condición',
  'y',
  'ejercicio',
  'del',
  'famoso',
  '<EOS>']]

In [None]:
len(quijote_processed)

32229

In [None]:
words_freqs = get_words_freqs(quijote_processed)

In [None]:
len(words_freqs)

38106

In [None]:
words_freqs["el"]

8254

In [None]:
count = 0
for word, freq in words_freqs.items():
    if freq == 1 and count <= 10:
        print(word, freq)
        count += 1

saavedra 1
papel; 1
conste, 1
deciembre 1
andrada. 1
correcto, 1
fee. 1
diciembre 1
1604 1
pedistes 1
previlegio 1


In [None]:
words_indexes, index_to_word = get_words_indexes(words_freqs)

In [None]:
words_indexes["el"]

1

In [None]:
index_to_word[1]

'el'

In [None]:
len(words_indexes)

16036

In [None]:
len(index_to_word)

16036

In [None]:
def get_word_id(words_indexes: dict, word: str) -> int:
    unk_word_id = words_indexes[UNK_LABEL]
    return words_indexes.get(word, unk_word_id)

#Obtenemos trigramas

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import time

In [None]:
def get_train_test_data(corpus: list[list[str]], words_indexes: dict, n: int) -> tuple[list, list]:
    x_train = []
    y_train = []
    for sent in corpus:
        n_grams = ngrams(sent, n)
        for w1, w2, w3 in n_grams:
            x_train.append([get_word_id(words_indexes, w1), get_word_id(words_indexes, w2)])
            y_train.append([get_word_id(words_indexes, w3)])
    return x_train, y_train

In [None]:
# Setup de parametros
EMBEDDING_DIM = 200
CONTEXT_SIZE = 2
BATCH_SIZE = 256
H = 100
torch.manual_seed(19)
# Tamaño del Vocabulario
V = len(words_indexes)

In [None]:
x_train, y_train = get_train_test_data(quijote_processed, words_indexes, n=3)

In [None]:
train_set = np.concatenate((x_train, y_train), axis=1)
# partimos los datos de entrada en batches
train_loader = DataLoader(train_set, batch_size = BATCH_SIZE)

In [None]:
# Trigram Neural Network Model
class TrigramModel(nn.Module):
    """Clase padre: https://pytorch.org/docs/stable/generated/torch.nn.Module.html"""

    def __init__(self, vocab_size, embedding_dim, context_size, h):
        super(TrigramModel, self).__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, h)
        self.linear2 = nn.Linear(h, vocab_size)

    def forward(self, inputs):
        # x': concatenation of x1 and x2 embeddings   -->
        #self.embeddings regresa un vector por cada uno de los índices que se les pase como entrada. view() les cambia el tamaño para concatenarlos
        embeds = self.embeddings(inputs).view((-1,self.context_size * self.embedding_dim))
        # h: tanh(W_1.x' + b)  -->
        out = torch.tanh(self.linear1(embeds))
        # W_2.h                 -->
        out = self.linear2(out)
        # log_softmax(W_2.h)      -->
        # dim=1 para que opere sobre renglones, pues al usar batchs tenemos varios vectores de salida
        log_probs = F.log_softmax(out, dim=1)

        return log_probs

##Entrenamiento

In [None]:
# 1. Pérdida. Negative log-likelihood loss
loss_function = nn.NLLLoss()

#Otras opciones de función de pérdida (tendrían que usar softmax sin log):
#nn.CrossEntropyLoss()


# 2. Instanciar el modelo
model = TrigramModel(V, EMBEDDING_DIM, CONTEXT_SIZE, H)

# 3. Optimización. ADAM optimizer
optimizer = optim.Adam(model.parameters(), lr = 2e-3)

#Otras opciones de optimizador:
#optimizer = optim.SGD(model.parameters(), lr=0.1)


# ------------------------- TRAIN & SAVE MODEL ------------------------
# En la práctica sólo correremos una epoch por restricciones de recursos
EPOCHS = 1
for epoch in range(EPOCHS):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch))
    for it, data_tensor in enumerate(train_loader):
        context_tensor = data_tensor[:,0:2]
        target_tensor = data_tensor[:,2]

        model.zero_grad() #reinicializar los gradientes
        #FORWARD:
        # get log probabilities over next words
        log_probs = model(context_tensor)


        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        #BACKWARD:
        # backward pass and update gradient
        loss.backward()
        optimizer.step()

        if it % 500 == 0:
            print("Training Iteration {} of epoch {} complete. Loss: {}; Time taken (s): {}".format(it, epoch, loss.item(), (time.time()-st)))
            st = time.time()
            #barch_size x len(vocab)

    # saving model
    model_path = "drive/MyDrive/" + 'model_{}.dat'.format(epoch)
    torch.save(model.state_dict(), model_path)
    print(f"Model saved for epoch={epoch} at {model_path}")


--- Training model Epoch: 0 ---
Training Iteration 0 of epoch 0 complete. Loss: 9.738958358764648; Time taken (s): 0.4195072650909424
Training Iteration 500 of epoch 0 complete. Loss: 5.58427619934082; Time taken (s): 70.41594457626343
Training Iteration 1000 of epoch 0 complete. Loss: 5.266488075256348; Time taken (s): 67.49588203430176
Training Iteration 1500 of epoch 0 complete. Loss: 5.469616889953613; Time taken (s): 69.2159628868103
Model saved for epoch=0 at drive/MyDrive/model_0.dat


In [None]:
model

TrigramModel(
  (embeddings): Embedding(16036, 200)
  (linear1): Linear(in_features=400, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=16036, bias=True)
)

In [None]:
def get_model(path: str) -> TrigramModel:
    model_loaded = TrigramModel(V, EMBEDDING_DIM, CONTEXT_SIZE, H)
    model_loaded.load_state_dict(torch.load(path))
    model_loaded.eval()
    return model_loaded

In [None]:
PATH = "drive/MyDrive/model_0.dat"

In [None]:
model = get_model(PATH)
W1 = "<BOS>"
W2 = "my"

IDX1 = get_word_id(words_indexes, W1)
IDX2 = get_word_id(words_indexes, W2)

#Obtenemos Log probabidades p(W3|W2,W1)
probs = model(torch.tensor([[IDX1,  IDX2]])).detach().tolist()

In [None]:
len(probs[0])

16036

In [None]:
# Creamos diccionario con {idx: logprob}
model_probs = {}
for idx, p in enumerate(probs[0]):
  model_probs[idx] = p

# Sort:
model_probs_sorted = sorted(((prob, idx) for idx, prob in model_probs.items()), reverse=True)

# Printing word  and prob (retrieving the idx):
topcandidates = 0
for prob, idx in model_probs_sorted:
  #Retrieve the word associated with that idx
  word = index_to_word[idx]
  print(idx, word, prob)

  topcandidates += 1

  if topcandidates > 100:
    break

15787 to -3.857161045074463
12331 in -3.87473201751709
15828 or -4.590223789215088
15747 of -4.617570400238037
15801 you -4.634593963623047
15748 the -4.770502090454102
15800 if -5.2651519775390625
15775 and -5.4163360595703125
15818 such -5.532860279083252
15619 llevadme -6.096963882446289
13102 ¡cuerpo -6.125380039215088
15003 date -6.126634120941162
15906 format -6.1752610206604
15949 marked -6.261355876922607
15720 data, -6.407069206237793
15545 atropellando -6.4084672927856445
15767 owns -6.446159839630127
15758 from -6.4531331062316895
15480 ricote -6.4932074546813965
13684 déjenme -6.506566524505615
15781 set -6.545663833618164
13890 nueces, -6.545879364013672
15568 pastoras -6.577900409698486
14243 viniéndosele -6.585544586181641
14112 justas, -6.604029655456543
15347 parecíale -6.609562873840332
15766 that -6.615713596343994
15858 your -6.637983798980713
14298 callaron -6.638277053833008
15259 tocaron -6.662238597869873
14992 ¡par -6.66354513168335
14460 volví -6.6859717369079

In [None]:
index_to_word.get(model_probs_sorted[0][0])

Liga al [modelo](https://drive.google.com/file/d/1-24lp3GYh-HqMuhRpAYrla-F3V03o0Sj/view?usp=drive_link) en Drive

#Generación de lenguaje

In [None]:
def get_likely_words(model: TrigramModel, context: str, words_indexes: dict, index_to_word: dict, top_count: int=10) -> list[tuple]:
    model_probs = {}
    words = context.split()
    idx_word_1 = get_word_id(words_indexes, words[0])
    idx_word_2 = get_word_id(words_indexes, words[1])
    probs = model(torch.tensor([[idx_word_1, idx_word_2]])).detach().tolist()

    for idx, p in enumerate(probs[0]):
        model_probs[idx] = p

    # Strategy: Sort and get top-K words to generate text
    return sorted(((prob, index_to_word[idx]) for idx, prob in model_probs.items()), reverse=True)[:top_count]

In [None]:
sentence = "fuimos a"
get_likely_words(model, sentence, words_indexes, index_to_word, 3)

[(-4.023273944854736, 'barcelona,'),
 (-4.595625877380371, 'verle,'),
 (-4.597508907318115, 'ti')]

In [None]:
sentence1 = "con el"
get_likely_words(model, sentence, words_indexes, index_to_word, 3)

[(-4.023273944854736, 'barcelona,'),
 (-4.595625877380371, 'verle,'),
 (-4.597508907318115, 'ti')]

In [None]:
sentence2 = "vimos un"
get_likely_words(model, sentence, words_indexes, index_to_word, 3)

[(-4.023273944854736, 'barcelona,'),
 (-4.595625877380371, 'verle,'),
 (-4.597508907318115, 'ti')]

In [None]:
from random import randint

def get_next_word(words: list[tuple[float, str]]) -> str:
    # From a top-K list of words get a random word
    return words[randint(0, len(words)-1)][1]

In [None]:
get_next_word(get_likely_words(model, sentence2, words_indexes, index_to_word))

'buen'

In [None]:
MAX_TOKENS = 30
TOP_COUNT = 10
def generate_text(model: TrigramModel, history: str, words_indexes: dict, index_to_word: dict, tokens_count: int=0) -> None:
    next_word = get_next_word(get_likely_words(model, history, words_indexes, index_to_word, top_count=TOP_COUNT))
    print(next_word, end=" ")
    tokens_count += 1
    if tokens_count == MAX_TOKENS or next_word == "<EOS>":
        return
    generate_text(model, history.split()[1]+ " " + next_word, words_indexes, index_to_word, tokens_count)

In [None]:
sent = "<BOS> fue"
print(sent, end=" ")
generate_text(model, sent, words_indexes, index_to_word)

<BOS> fue gracioso, con los duques <UNK> of puestas, manera: <EOS> 

In [None]:
sentence = "<BOS> en"
print(sentence, end=" ")
generate_text(model, sentence, words_indexes, index_to_word)

<BOS> en ala, <UNK> ricote you may mía! que <UNK> yendo pues, <EOS> 

In [None]:
sentence = "<BOS> no"
print(sentence, end=" ")
generate_text(model, sentence, words_indexes, index_to_word)

<BOS> no confesáis posible que <EOS> 

In [None]:
sentence = "ese fue"
print(sentence, end=" ")
generate_text(model, sentence, words_indexes, index_to_word)

ese fue que en la lengua figura, que, pues me ha querido de lo de <UNK> in félix mismo, y en el aposento don quijote; que, puesto de que yo pienso haber 

#Embeddings

In [None]:
word = input(">> ")
words_tensor = torch.LongTensor([get_word_id(words_indexes, word)])
word_embed = model.embeddings(words_tensor)
print(f"embbeding (dim={len(word_embed[0])}) vec for word={word}")
word_embed

>> dijistes
embbeding (dim=200) vec for word=dijistes


tensor([[ 0.7049, -0.8217, -1.4046, -0.4242, -0.3843,  1.2171,  2.0635,  1.0503,
         -1.0077, -0.5408,  1.3352,  0.1515,  1.3394, -0.6162,  0.0187,  0.4192,
          0.8145,  0.7169, -0.4935, -1.0501, -0.5550,  0.1048,  2.4219,  0.6141,
          0.1410,  0.6613,  0.1001,  0.1726, -0.0041,  0.1390, -0.0734, -1.3170,
         -0.1972, -0.7047, -1.1203,  1.3664, -0.2810, -0.0114,  0.6651,  0.4549,
         -0.8410, -0.4305,  0.3905, -0.3211, -0.7807, -0.7998,  0.7703,  0.6948,
         -0.1618,  0.9177,  1.1669, -0.7677, -0.4478,  0.6371,  1.3727,  0.7251,
          0.8919,  1.0914, -1.0731, -0.0081, -0.7819, -0.3320,  0.5162, -1.1513,
          1.7838,  0.0222,  0.4287,  1.0062,  0.2067, -0.5762, -0.0311,  0.8352,
          1.1277,  0.6921,  0.3941, -0.4135,  1.6018,  1.3549,  0.0283, -0.4879,
         -0.1269, -0.2917, -0.2915, -0.4840, -0.3493, -0.8144,  1.2907,  1.5847,
          0.8793,  1.0309,  0.6827,  0.8596, -0.8594, -0.6392, -0.9663,  0.9392,
         -0.4985, -0.1583,  

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Obtenemos embedding para una lista de palabras
words = ["el", "avanzó", "día", "ayer"]
for word in words:
    base_word_embed = model.embeddings(torch.LongTensor([get_word_id(words_indexes, word)])).detach().numpy()

    # Calcula la similitud del coseno con todas las otras palabras
    word_sims = {}
    for other_word in words_indexes.keys():
        if word == other_word:
            continue
        other_word_embed = model.embeddings(torch.LongTensor([get_word_id(words_indexes, other_word)])).detach().numpy()
        word_sims[other_word] = cosine_similarity(base_word_embed, other_word_embed)

    # Imprime las 10 palabras más similares
    print("\nBASE WORD =", word)
    for word, sim in sorted(word_sims.items(), key=lambda item: item[1], reverse=True)[:10]:
        print(f"{word}: {sim[0][0]}")


BASE WORD = el
desvalijando: 0.28377223014831543
pequeño: 0.2815472483634949
sueños: 0.2502957284450531
tirador: 0.24889090657234192
actores: 0.24482297897338867
jaula: 0.24098268151283264
vivaldo: 0.23634451627731323
son.: 0.23514550924301147
llámase: 0.23087884485721588
esposo: 0.2296067178249359

BASE WORD = avanzó
<UNK>: 1.0
antojo: 0.25950002670288086
quebrantado,: 0.24768158793449402
balcón,: 0.237320676445961
personas:: 0.237098827958107
¿esta: 0.23624125123023987
aventura,: 0.2350798398256302
hombro: 0.23226578533649445
ciudad;: 0.22861920297145844
gana,: 0.22807221114635468

BASE WORD = día
conocer: 0.3051227629184723
letu-,: 0.26607033610343933
manda.: 0.25617092847824097
acabar.: 0.2509163022041321
castellana,: 0.25009575486183167
emperatriz,: 0.24074670672416687
pulsos: 0.23711787164211273
zoraida.: 0.23668810725212097
trataba,: 0.23599544167518616
llenas.: 0.23574165999889374

BASE WORD = ayer
engañado,: 0.28074684739112854
flores,: 0.26050686836242676
oyeron,: 0.24891723