# Modelos de lenguaje neuronales

In [50]:
import pandas as pd
import torch
from NLPUtils import *
import re
import fasttext

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Vamos a definir que un modelo de lenguaje sobre un vocabulario $V = \{ p_1, \ldots, p_{|V|}\}$ es la capacidad de asignarle una probabilidad $P(w_1,\ldots,w_n)$ a cada conjunto de palabras $w_1,\ldots,w_n$ pertenecientes al vocabulario $V$.

Para calcular esa probabilidad suele hacerse un modelo de N-gramas, en que vale la aproximación

$$
\begin{align*}
P(w_1,\ldots,w_n) &= P(w_n|w_{n-1},\ldots,w_1) P(w_{n-1}|w_{n-2},\ldots,w_1)\ldots P(w_2|w_1) P(w_1)\\[.5em]
&= \prod_{i=1}^n P(w_i|w_{i-1},\ldots,w_1)\\[.5em]
&\approx \prod_{i=1}^n P(w_i|w_{i-1},\ldots,w_{i-N+1})\\
\end{align*}
$$

Ahora, para estimar estas probabilidades, pueden usarse métodos distintos. Uno de ellos consiste en estimar por métodos frecuentistas estas probabilidades:

$$
P(w_i|w_{i-1},\ldots,w_{i-N+1}) = \frac{C(w_i,\ldots,w_{i-N+1})}{C(w_{i-1},\ldots,w_{i-N+1})}
$$

Otro de estos métodos consiste en modelar la probabilidad $P(w_i|w_{i-1},\ldots,w_{i-N+1})$ con una red neuronal y entrenar sus parámetros con algún modelo (por ejemplo, word2vec).

Para medir si un modelo de lenguaje es bueno, suele utilizarse la perplejidad sobre un corpus determinado. Si se tiene que ese corpus contiene $M$ palabras $w_1, \ldots, w_m$ pertenecientes a un vocabulario $V$, la perplejidad se define como

$$
Per(w_1,\ldots,w_m) = \left(\frac{1}{P(w_1,\ldots,w_m)} \right)^{1/M}
$$

y si se tiene un modelo de N-gramas, esta fórmula se convierte en:

$$
Per(w_1,\ldots,w_m) = \left(\prod_{i=1}^M\frac{1}{P(w_i|w_{i-1},\ldots,w_{1-N+1})} \right)^{1/M}
$$

## Modelo de lenguaje frecuentista

El archivo `lm_l40` contiene la información del modelo de lenguaje estimado para un modelo de bigramas con el set de entrenamiento de la base de datos Latino-40. Con este dato, calculamos la perplejidad de este modelo de lenguaje sobre las frases de test.

In [46]:
# Obtenemos el modelo de lenguaje del método frecuentista:
with open('lm_l40', 'rb') as file:
    lines = [line.decode('iso-8859-1') for line in file]
    idx = lines.index('\\data\\\n') + 1
    line = lines[idx]
    ngrams_len = []
    log_probs = []
    while line[:5] == 'ngram':
        ngrams_len.append(int(line.split('=')[1][:-1]))
        idx += 1
        line = lines[idx]
    N = len(ngrams_len)
    for i in range(1,N+1):
        idx = lines.index('\\{}-grams:\n'.format(i)) + 1
        lp = {re.split(r'[\t\n]',lines[j])[1]: float(lines[j].split('\t')[0]) for j in range(idx,idx+ngrams_len[i-1])}
        log_probs.append(lp)
        
# Juntamos el corpus de test en una sola lista:
with open('promptsl40.test','rb') as file:
    test_lines = file.readlines()
    test_lines = [' '.join(re.findall(r'\w+',line.decode('iso-8859-1'))[1:]) for line in test_lines]
    corpus_test = [line.split(' ') for line in test_lines]
    corpus_test = [word for line in corpus_test for word in line]

In [51]:
# Perplejidad para un modelo de bigrama:
corpus_len = len(corpus_test)
p = log_probs[0][corpus_test[0]]
for idx in range(1,corpus_len):
    p += log_probs[1]['{} {}'.format(corpus_test[idx],corpus_test[idx-1])]
print('El logaritmo de la perplejidad para el corpus de test es: {}'.format(p))

KeyError: 'jugará no'

## Modelo de lenguaje neuronal

Cargamos el corpus de texto, conformado por todas las frases del conjunto de train.

In [58]:
with open('trainLM.txt', 'rb') as file:
    lines = file.readlines()
    corpus = [line.decode('iso-8859-1').split(' ')[:-1] for line in lines]
    
print('Mostramos las primeras 10 frases del corpus:')
for i in range(10):
    print(corpus[i])

Mostramos las primeras 10 frases del corpus:
['no', 'habiendo', 'objeciones', 'así', 'quedó', 'acordado']
['a', 'fin', 'de', 'año', 'hará', 'el', 'balance', 'de', 'la', 'situación']
['este', 'programa', 'debe', 'ejecutarse', 'en', 'todos', 'los', 'niveles']
['no', 'debe', 'interpretarse', 'esto', 'como', 'que', 'uno', 'es', 'más', 'rico']
['la', 'pareja', 'partió', 'esta', 'madrugada', 'para', 'la', 'capital']
['a', 'la', 'sazón', 'los', 'locales', 'no', 'disfrutaban', 'de', 'inviolabilidad']
['la', 'princesa', 'del', 'japón', 'niega', 'que', 'esté', 'embarazada']
['la', 'esposa', 'y', 'su', 'hija', 'estaban', 'con', 'el', 'cantante', 'en', 'roma']
['ninguno', 'de', 'los', 'ataques', 'causó', 'daños', 'ni', 'heridos']
['esto', 'dio', 'lugar', 'a', 'que', 'se', 'reabrieran', 'ciertos', 'casos']


In [59]:
corpus = [token for doc in corpus for token in doc]

In [60]:
window_size = 8           # Tamaño de la ventana del contexto.
cutoff_freq = 0           # Palabras con una frecuencia menor o igual a cutoff_freq son excluídas del vocabulario.
batch_size = 512          # Tamaño del batch.

model = 'SkipGram'        # Método de entrenamiento.
embedding_dim = 200       # Dimensión del espacio de los word vectors.
device = 'cpu'            # Dispositivo sobre el cual se entrena. 
state_dict = None         # Parámetros pre-entrenados.
paralelize = False        # Flag para decirle al programa que use las 2 gpus

epochs = 100              # Cantidad de epochs
learning_rate = 1e-4      # Tasa de aprendizaje
sample_loss_every = 10    # Calcular la loss cada este número
algorithm = 'Adam'        # Algoritmo de optimización

trainer = Word2vecTrainer(corpus,cutoff_freq=cutoff_freq,window_size=window_size,batch_size=batch_size)
trainer.InitModel(model=model, state_dict=state_dict, device=device, paralelize=paralelize, embedding_dim=embedding_dim)
trainer.Train(algorithm=algorithm, epochs=epochs, sample_loss_every=sample_loss_every, lr=learning_rate)

Word2vec trainer created:
Window size: 8
Number of samples: 154775
Vocabulary Size: 31
Number of batches: 303
Number of samples per batch: 512

Dispositivo seleccionado: cuda:1
Dimensión del espacio de los embeddings: 200
Starting training...
Optimization method: Adam
Learning Rate: 0.0001
Number of epochs: 100
Running on device (cuda:1)

Epoch: 1, Batch number: 0, Loss: 10636.29296875
Epoch: 1, Batch number: 10, Loss: 9898.3359375
Epoch: 1, Batch number: 20, Loss: 10452.6259765625
Epoch: 1, Batch number: 30, Loss: 9855.4169921875
Epoch: 1, Batch number: 40, Loss: 9837.552734375
Epoch: 1, Batch number: 50, Loss: 9421.8837890625
Epoch: 1, Batch number: 60, Loss: 10011.5966796875
Epoch: 1, Batch number: 70, Loss: 9768.5556640625
Epoch: 1, Batch number: 80, Loss: 9356.1396484375
Epoch: 1, Batch number: 90, Loss: 9353.4521484375
Epoch: 1, Batch number: 100, Loss: 9323.74609375
Epoch: 1, Batch number: 110, Loss: 9005.5146484375
Epoch: 1, Batch number: 120, Loss: 9150.4267578125
Epoch: 1, Ba

Epoch: 6, Batch number: 85, Loss: 8102.77685546875
Epoch: 6, Batch number: 95, Loss: 8584.005859375
Epoch: 6, Batch number: 105, Loss: 8082.03076171875
Epoch: 6, Batch number: 115, Loss: 8260.0439453125
Epoch: 6, Batch number: 125, Loss: 8006.3828125
Epoch: 6, Batch number: 135, Loss: 7806.8701171875
Epoch: 6, Batch number: 145, Loss: 8294.93359375
Epoch: 6, Batch number: 155, Loss: 7983.57177734375
Epoch: 6, Batch number: 165, Loss: 8141.7373046875
Epoch: 6, Batch number: 175, Loss: 7997.16943359375
Epoch: 6, Batch number: 185, Loss: 7999.60400390625
Epoch: 6, Batch number: 195, Loss: 8333.587890625
Epoch: 6, Batch number: 205, Loss: 8271.9921875
Epoch: 6, Batch number: 215, Loss: 8132.25146484375
Epoch: 6, Batch number: 225, Loss: 8387.0546875
Epoch: 6, Batch number: 235, Loss: 8235.1171875
Epoch: 6, Batch number: 245, Loss: 8322.2587890625
Epoch: 6, Batch number: 255, Loss: 8157.2578125
Epoch: 6, Batch number: 265, Loss: 7876.14111328125
Epoch: 6, Batch number: 275, Loss: 8035.26757

Epoch: 11, Batch number: 210, Loss: 8143.48828125
Epoch: 11, Batch number: 220, Loss: 8201.0009765625
Epoch: 11, Batch number: 230, Loss: 8227.861328125
Epoch: 11, Batch number: 240, Loss: 8083.69140625
Epoch: 11, Batch number: 250, Loss: 8142.3779296875
Epoch: 11, Batch number: 260, Loss: 8137.94775390625
Epoch: 11, Batch number: 270, Loss: 8269.83203125
Epoch: 11, Batch number: 280, Loss: 7566.77587890625
Epoch: 11, Batch number: 290, Loss: 8120.6220703125
Epoch: 11, Batch number: 300, Loss: 8162.2724609375
Epoch: 12, Batch number: 7, Loss: 7953.22216796875
Epoch: 12, Batch number: 17, Loss: 8368.0439453125
Epoch: 12, Batch number: 27, Loss: 8267.734375
Epoch: 12, Batch number: 37, Loss: 7946.98828125
Epoch: 12, Batch number: 47, Loss: 8423.013671875
Epoch: 12, Batch number: 57, Loss: 7741.76220703125
Epoch: 12, Batch number: 67, Loss: 7957.744140625
Epoch: 12, Batch number: 77, Loss: 8260.5693359375
Epoch: 12, Batch number: 87, Loss: 8261.3427734375
Epoch: 12, Batch number: 97, Loss

Epoch: 17, Batch number: 12, Loss: 8182.9091796875
Epoch: 17, Batch number: 22, Loss: 8107.83154296875
Epoch: 17, Batch number: 32, Loss: 8089.79248046875
Epoch: 17, Batch number: 42, Loss: 7990.04638671875
Epoch: 17, Batch number: 52, Loss: 8180.92578125
Epoch: 17, Batch number: 62, Loss: 8479.69921875
Epoch: 17, Batch number: 72, Loss: 8030.751953125
Epoch: 17, Batch number: 82, Loss: 8039.501953125
Epoch: 17, Batch number: 92, Loss: 7772.091796875
Epoch: 17, Batch number: 102, Loss: 7990.73974609375
Epoch: 17, Batch number: 112, Loss: 7709.52783203125
Epoch: 17, Batch number: 122, Loss: 8165.6748046875
Epoch: 17, Batch number: 132, Loss: 8187.55419921875
Epoch: 17, Batch number: 142, Loss: 8287.31640625
Epoch: 17, Batch number: 152, Loss: 8393.8603515625
Epoch: 17, Batch number: 162, Loss: 8077.19970703125
Epoch: 17, Batch number: 172, Loss: 7827.92919921875
Epoch: 17, Batch number: 182, Loss: 8136.572265625
Epoch: 17, Batch number: 192, Loss: 8046.1171875
Epoch: 17, Batch number: 2

Epoch: 22, Batch number: 107, Loss: 7776.19921875
Epoch: 22, Batch number: 117, Loss: 8217.640625
Epoch: 22, Batch number: 127, Loss: 7980.37255859375
Epoch: 22, Batch number: 137, Loss: 8091.48828125
Epoch: 22, Batch number: 147, Loss: 8000.94921875
Epoch: 22, Batch number: 157, Loss: 8673.806640625
Epoch: 22, Batch number: 167, Loss: 8250.69921875
Epoch: 22, Batch number: 177, Loss: 8054.2255859375
Epoch: 22, Batch number: 187, Loss: 8082.73486328125
Epoch: 22, Batch number: 197, Loss: 8068.23681640625
Epoch: 22, Batch number: 207, Loss: 8319.9208984375
Epoch: 22, Batch number: 217, Loss: 8398.732421875
Epoch: 22, Batch number: 227, Loss: 7923.07177734375
Epoch: 22, Batch number: 237, Loss: 8422.08203125
Epoch: 22, Batch number: 247, Loss: 8353.7275390625
Epoch: 22, Batch number: 257, Loss: 8186.6171875
Epoch: 22, Batch number: 267, Loss: 8315.1337890625
Epoch: 22, Batch number: 277, Loss: 8249.45703125
Epoch: 22, Batch number: 287, Loss: 8196.6181640625
Epoch: 22, Batch number: 297,

Epoch: 27, Batch number: 232, Loss: 8165.07666015625
Epoch: 27, Batch number: 242, Loss: 8319.720703125
Epoch: 27, Batch number: 252, Loss: 8248.96875
Epoch: 27, Batch number: 262, Loss: 8216.1640625
Epoch: 27, Batch number: 272, Loss: 8092.2802734375
Epoch: 27, Batch number: 282, Loss: 7940.4521484375
Epoch: 27, Batch number: 292, Loss: 8252.6572265625
Epoch: 27, Batch number: 302, Loss: 2327.505126953125
Epoch: 28, Batch number: 9, Loss: 8244.1328125
Epoch: 28, Batch number: 19, Loss: 7961.1767578125
Epoch: 28, Batch number: 29, Loss: 7925.72216796875
Epoch: 28, Batch number: 39, Loss: 8201.939453125
Epoch: 28, Batch number: 49, Loss: 8187.94091796875
Epoch: 28, Batch number: 59, Loss: 8047.6904296875
Epoch: 28, Batch number: 69, Loss: 8047.88330078125
Epoch: 28, Batch number: 79, Loss: 8012.72705078125
Epoch: 28, Batch number: 89, Loss: 8411.3212890625
Epoch: 28, Batch number: 99, Loss: 7879.48046875
Epoch: 28, Batch number: 109, Loss: 8223.966796875
Epoch: 28, Batch number: 119, Lo

Epoch: 33, Batch number: 34, Loss: 8315.29296875
Epoch: 33, Batch number: 44, Loss: 8122.43017578125
Epoch: 33, Batch number: 54, Loss: 8323.7197265625
Epoch: 33, Batch number: 64, Loss: 7989.98291015625
Epoch: 33, Batch number: 74, Loss: 8103.57958984375
Epoch: 33, Batch number: 84, Loss: 7979.8857421875
Epoch: 33, Batch number: 94, Loss: 8220.99609375
Epoch: 33, Batch number: 104, Loss: 8127.89208984375
Epoch: 33, Batch number: 114, Loss: 8129.65771484375
Epoch: 33, Batch number: 124, Loss: 7952.1611328125
Epoch: 33, Batch number: 134, Loss: 8007.0654296875
Epoch: 33, Batch number: 144, Loss: 8295.6328125
Epoch: 33, Batch number: 154, Loss: 8376.146484375
Epoch: 33, Batch number: 164, Loss: 8214.69921875
Epoch: 33, Batch number: 174, Loss: 7965.6767578125
Epoch: 33, Batch number: 184, Loss: 8301.2626953125
Epoch: 33, Batch number: 194, Loss: 8038.3408203125
Epoch: 33, Batch number: 204, Loss: 7984.72265625
Epoch: 33, Batch number: 214, Loss: 8062.72509765625
Epoch: 33, Batch number: 

Epoch: 38, Batch number: 129, Loss: 7997.2001953125
Epoch: 38, Batch number: 139, Loss: 8301.64453125
Epoch: 38, Batch number: 149, Loss: 8062.083984375
Epoch: 38, Batch number: 159, Loss: 8082.71240234375
Epoch: 38, Batch number: 169, Loss: 8165.67431640625
Epoch: 38, Batch number: 179, Loss: 8042.98388671875
Epoch: 38, Batch number: 189, Loss: 8155.41943359375
Epoch: 38, Batch number: 199, Loss: 8302.8994140625
Epoch: 38, Batch number: 209, Loss: 7902.595703125
Epoch: 38, Batch number: 219, Loss: 8096.39453125
Epoch: 38, Batch number: 229, Loss: 8060.6796875
Epoch: 38, Batch number: 239, Loss: 8275.0263671875
Epoch: 38, Batch number: 249, Loss: 8191.7529296875
Epoch: 38, Batch number: 259, Loss: 7887.37890625
Epoch: 38, Batch number: 269, Loss: 8119.23046875
Epoch: 38, Batch number: 279, Loss: 8523.8037109375
Epoch: 38, Batch number: 289, Loss: 8026.0673828125
Epoch: 38, Batch number: 299, Loss: 8049.99267578125
Epoch: 39, Batch number: 6, Loss: 7953.94287109375
Epoch: 39, Batch numb

In [61]:
fig, ax = plt.subplots()
ax.plot(trainer.loss_history['iter'],trainer.loss_history['loss'])

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7f0cdc3f8a90>]

In [71]:
# Perplejidad para un modelo de bigrama:
corpus_len = len(corpus_test)
vocab = Vocabulary.from_corpus([corpus_test])
activation = nn.LogSoftmax()
for idx in range(1,corpus_len):
    x = torch.tensor(vocab[corpus_test[idx]],device=torch.device(device))
    log_probs = activation(trainer.model(x))
    print(log_probs)

RuntimeError: CUDA error: device-side assert triggered