# Modelos de lenguaje neuronales

In [1]:
import pandas as pd
import torch
from NLPUtils import *
import re
import fasttext

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np

%load_ext autoreload
%autoreload 2

Vamos a definir que un modelo de lenguaje sobre un vocabulario $V = \{ p_1, \ldots, p_{|V|}\}$ es la capacidad de asignarle una probabilidad $P(w_1,\ldots,w_n)$ a cada conjunto de palabras $w_1,\ldots,w_n$ pertenecientes al vocabulario $V$.

Para calcular esa probabilidad suele hacerse un modelo de N-gramas, en que vale la aproximación

$$
\begin{align*}
P(w_1,\ldots,w_n) &= P(w_n|w_{n-1},\ldots,w_1) P(w_{n-1}|w_{n-2},\ldots,w_1)\ldots P(w_2|w_1) P(w_1)\\[.5em]
&= \prod_{i=1}^n P(w_i|w_{i-1},\ldots,w_1)\\[.5em]
&\approx \prod_{i=1}^n P(w_i|w_{i-1},\ldots,w_{i-N+1})\\
\end{align*}
$$

Ahora, para estimar estas probabilidades, pueden usarse métodos distintos. Uno de ellos consiste en estimar por métodos frecuentistas estas probabilidades:

$$
P(w_i|w_{i-1},\ldots,w_{i-N+1}) = \frac{C(w_i,\ldots,w_{i-N+1})}{C(w_{i-1},\ldots,w_{i-N+1})}
$$

Otro de estos métodos consiste en modelar la probabilidad $P(w_i|w_{i-1},\ldots,w_{i-N+1})$ con una red neuronal y entrenar sus parámetros con algún modelo (por ejemplo, word2vec).

Para medir si un modelo de lenguaje es bueno, suele utilizarse la perplejidad sobre un corpus determinado. Si se tiene que ese corpus contiene $M$ palabras $w_1, \ldots, w_m$ pertenecientes a un vocabulario $V$, la perplejidad se define como

$$
Per(w_1,\ldots,w_m) = \left(\frac{1}{P(w_1,\ldots,w_m)} \right)^{1/M}
$$

y si se tiene un modelo de N-gramas, esta fórmula se convierte en:

$$
Per(w_1,\ldots,w_m) = \left(\prod_{i=1}^M\frac{1}{P(w_i|w_{i-1},\ldots,w_{1-N+1})} \right)^{1/M}
$$

## Modelo de lenguaje frecuentista

El archivo `lm_l40` contiene la información del modelo de lenguaje estimado para un modelo de bigramas con el set de entrenamiento de la base de datos Latino-40. Con este dato, calculamos la perplejidad de este modelo de lenguaje sobre las frases de test.

In [2]:
# Obtenemos el modelo de lenguaje del método frecuentista:
with open('lm_l40_2', 'rb') as file:
    lm_file = file.read().decode('iso-8859-1')
    
def get_log_prob(w1,w2):
    match = re.search(r'([\-]?[\d]+[\.]?[\d]*)\t({} {})'.format(w1,w2),lm_file)
    if match is not None:
        log_prob, _ = match.groups()
        return float(log_prob)
    match_w1 = re.search(r'([\-]?[\d]+[\.]?[\d]*)\t({})\t*(\-?[\d]*[\.]?[\d]*)'.format(w1),lm_file)
    match_w2 = re.search(r'([\-]?[\d]+[\.]?[\d]*)\t({})\t*(\-?[\d]*[\.]?[\d]*)'.format(w2),lm_file)
    return float(match_w1.groups()[0]) + float(match_w2.groups()[2])   
        
# Juntamos el corpus de test en una sola lista:
with open('promptsl40.test','rb') as file:
    test_lines = file.readlines()
    test_lines = [' '.join(re.findall(r'\w+',line.decode('iso-8859-1'))[1:]) for line in test_lines]
    corpus_test = [line.split(' ') for line in test_lines]
    corpus_test = [word for line in corpus_test for word in line]

In [3]:
# Perplejidad para un modelo de bigrama:
corpus_len = len(corpus_test)
log_p = [get_log_prob(corpus_test[idx-1],corpus_test[idx]) for idx in range(1,corpus_len)]
log_p.insert(0,float(re.search(r'([\-]?[\d]+[\.]?[\d]*)\t({})\t*(\-?[\d]*[\.]?[\d]*)'.format(corpus_test[0]),lm_file).groups()[0]))
print('El logaritmo de la perplejidad para el corpus de test es: {}'.format(-sum(log_p)/corpus_len))

El logaritmo de la perplejidad para el corpus de test es: 1.7141915461883455


## Modelo de lenguaje neuronal

Cargamos el corpus de texto, conformado por todas las frases del conjunto de train.

In [8]:
with open('trainLM2.txt', 'rb') as file:
    lines = file.readlines()
    corpus = [line.decode('iso-8859-1').split(' ')[:-1] for line in lines]
    
print('Mostramos las primeras 10 frases del corpus:')
for i in range(10):
    print(corpus[i])
    
corpus = [[token for doc in corpus for token in doc]]

Mostramos las primeras 10 frases del corpus:
['no', 'habiendo', 'objeciones', 'así', 'quedó', 'acordado']
['a', 'fin', 'de', 'año', 'hará', 'el', 'balance', 'de', 'la', 'situación']
['este', 'programa', 'debe', 'ejecutarse', 'en', 'todos', 'los', 'niveles']
['no', 'debe', 'interpretarse', 'esto', 'como', 'que', 'uno', 'es', 'más', 'rico']
['la', 'pareja', 'partió', 'esta', 'madrugada', 'para', 'la', 'capital']
['a', 'la', 'sazón', 'los', 'locales', 'no', 'disfrutaban', 'de', 'inviolabilidad']
['la', 'princesa', 'del', 'japón', 'niega', 'que', 'esté', 'embarazada']
['la', 'esposa', 'y', 'su', 'hija', 'estaban', 'con', 'el', 'cantante', 'en', 'roma']
['ninguno', 'de', 'los', 'ataques', 'causó', 'daños', 'ni', 'heridos']
['esto', 'dio', 'lugar', 'a', 'que', 'se', 'reabrieran', 'ciertos', 'casos']


In [45]:
window_size = 4           # Tamaño de la ventana del contexto.
cutoff_freq = 0           # Palabras con una frecuencia menor o igual a cutoff_freq son excluídas del vocabulario.
batch_size = 512          # Tamaño del batch.

model = 'CBOW'            # Método de entrenamiento.
embedding_dim = 200       # Dimensión del espacio de los word vectors.
device = 'cuda:1'         # Dispositivo sobre el cual se entrena. 
state_dict = None         # Parámetros pre-entrenados.
paralelize = False        # Flag para decirle al programa que use las 2 gpus

epochs = 100              # Cantidad de epochs
learning_rate = 5e-4      # Tasa de aprendizaje
sample_loss_every = 10    # Calcular la loss cada este número
algorithm = 'Adam'        # Algoritmo de optimización

trainer = Word2vecTrainer(corpus,cutoff_freq=cutoff_freq,window_size=window_size,batch_size=batch_size)
trainer.InitModel(model=model, state_dict=state_dict, device=device, paralelize=paralelize, embedding_dim=embedding_dim)
trainer.Train(algorithm=algorithm, epochs=epochs, sample_loss_every=sample_loss_every, lr=learning_rate)

Word2vec trainer created:
Window size: 4
Number of samples: 38960
Vocabulary Size: 5922
Number of batches: 77
Number of samples per batch: 512

Dispositivo seleccionado: cuda:1
Dimensión del espacio de los embeddings: 200
Starting training...
Optimization method: Adam
Learning Rate: 0.0005
Number of epochs: 100
Running on device (cuda:1)

Epoch: 1, Batch number: 0, Loss: 4450.0400390625
Epoch: 1, Batch number: 10, Loss: 4426.947265625
Epoch: 1, Batch number: 20, Loss: 4404.03076171875
Epoch: 1, Batch number: 30, Loss: 4371.01220703125
Epoch: 1, Batch number: 40, Loss: 4354.013671875
Epoch: 1, Batch number: 50, Loss: 4320.98828125
Epoch: 1, Batch number: 60, Loss: 4318.06298828125
Epoch: 1, Batch number: 70, Loss: 4283.48583984375
Epoch: 2, Batch number: 3, Loss: 4189.49169921875
Epoch: 2, Batch number: 13, Loss: 4181.25732421875
Epoch: 2, Batch number: 23, Loss: 4133.60302734375
Epoch: 2, Batch number: 33, Loss: 4133.40380859375
Epoch: 2, Batch number: 43, Loss: 4070.8115234375
Epoch: 

Epoch: 20, Batch number: 67, Loss: 2160.429931640625
Epoch: 21, Batch number: 0, Loss: 2055.26953125
Epoch: 21, Batch number: 10, Loss: 2061.919921875
Epoch: 21, Batch number: 20, Loss: 2090.108154296875
Epoch: 21, Batch number: 30, Loss: 2110.325439453125
Epoch: 21, Batch number: 40, Loss: 2129.83349609375
Epoch: 21, Batch number: 50, Loss: 2092.767333984375
Epoch: 21, Batch number: 60, Loss: 2101.462646484375
Epoch: 21, Batch number: 70, Loss: 2075.478759765625
Epoch: 22, Batch number: 3, Loss: 2092.2548828125
Epoch: 22, Batch number: 13, Loss: 2053.43603515625
Epoch: 22, Batch number: 23, Loss: 2017.41650390625
Epoch: 22, Batch number: 33, Loss: 2034.486328125
Epoch: 22, Batch number: 43, Loss: 2054.9921875
Epoch: 22, Batch number: 53, Loss: 2013.8748779296875
Epoch: 22, Batch number: 63, Loss: 2011.052001953125
Epoch: 22, Batch number: 73, Loss: 2011.0006103515625
Epoch: 23, Batch number: 6, Loss: 1962.387939453125
Epoch: 23, Batch number: 16, Loss: 1935.56591796875
Epoch: 23, Batc

Epoch: 41, Batch number: 40, Loss: 1134.4254150390625
Epoch: 41, Batch number: 50, Loss: 1164.5372314453125
Epoch: 41, Batch number: 60, Loss: 1149.7550048828125
Epoch: 41, Batch number: 70, Loss: 1151.272216796875
Epoch: 42, Batch number: 3, Loss: 1100.7061767578125
Epoch: 42, Batch number: 13, Loss: 1161.05859375
Epoch: 42, Batch number: 23, Loss: 1093.52587890625
Epoch: 42, Batch number: 33, Loss: 1184.6134033203125
Epoch: 42, Batch number: 43, Loss: 1165.27392578125
Epoch: 42, Batch number: 53, Loss: 1103.10498046875
Epoch: 42, Batch number: 63, Loss: 1129.3348388671875
Epoch: 42, Batch number: 73, Loss: 1146.8382568359375
Epoch: 43, Batch number: 6, Loss: 1070.80126953125
Epoch: 43, Batch number: 16, Loss: 1077.5863037109375
Epoch: 43, Batch number: 26, Loss: 1136.0908203125
Epoch: 43, Batch number: 36, Loss: 1098.740478515625
Epoch: 43, Batch number: 46, Loss: 1046.66015625
Epoch: 43, Batch number: 56, Loss: 1113.2999267578125
Epoch: 43, Batch number: 66, Loss: 1047.829833984375


Epoch: 62, Batch number: 3, Loss: 617.4721069335938
Epoch: 62, Batch number: 13, Loss: 652.544677734375
Epoch: 62, Batch number: 23, Loss: 687.22412109375
Epoch: 62, Batch number: 33, Loss: 659.2599487304688
Epoch: 62, Batch number: 43, Loss: 668.0723876953125
Epoch: 62, Batch number: 53, Loss: 710.790771484375
Epoch: 62, Batch number: 63, Loss: 654.0540771484375
Epoch: 62, Batch number: 73, Loss: 665.5208129882812
Epoch: 63, Batch number: 6, Loss: 621.2459106445312
Epoch: 63, Batch number: 16, Loss: 643.775390625
Epoch: 63, Batch number: 26, Loss: 662.0103149414062
Epoch: 63, Batch number: 36, Loss: 616.5802612304688
Epoch: 63, Batch number: 46, Loss: 639.451171875
Epoch: 63, Batch number: 56, Loss: 643.6050415039062
Epoch: 63, Batch number: 66, Loss: 638.090576171875
Epoch: 63, Batch number: 76, Loss: 49.69245910644531
Epoch: 64, Batch number: 9, Loss: 623.1243896484375
Epoch: 64, Batch number: 19, Loss: 635.09423828125
Epoch: 64, Batch number: 29, Loss: 610.1123657226562
Epoch: 64, 

Epoch: 82, Batch number: 43, Loss: 357.6710510253906
Epoch: 82, Batch number: 53, Loss: 367.6031799316406
Epoch: 82, Batch number: 63, Loss: 360.42138671875
Epoch: 82, Batch number: 73, Loss: 353.9371337890625
Epoch: 83, Batch number: 6, Loss: 355.919677734375
Epoch: 83, Batch number: 16, Loss: 360.88385009765625
Epoch: 83, Batch number: 26, Loss: 394.7986755371094
Epoch: 83, Batch number: 36, Loss: 375.5718994140625
Epoch: 83, Batch number: 46, Loss: 359.4619140625
Epoch: 83, Batch number: 56, Loss: 342.2389221191406
Epoch: 83, Batch number: 66, Loss: 397.9269714355469
Epoch: 83, Batch number: 76, Loss: 31.569202423095703
Epoch: 84, Batch number: 9, Loss: 370.83099365234375
Epoch: 84, Batch number: 19, Loss: 380.9656677246094
Epoch: 84, Batch number: 29, Loss: 304.1452331542969
Epoch: 84, Batch number: 39, Loss: 313.5855407714844
Epoch: 84, Batch number: 49, Loss: 373.6716613769531
Epoch: 84, Batch number: 59, Loss: 300.06024169921875
Epoch: 84, Batch number: 69, Loss: 367.70419311523

In [46]:
fig, ax = plt.subplots()
ax.plot(trainer.loss_history['iter'],trainer.loss_history['loss'])

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fbef34fdf98>]

In [49]:
# Perplejidad para un modelo de bigrama:
corpus_len = len(corpus_test)
vocab = Vocabulary.from_corpus([corpus_test])
trainer.model.eval()
scores = trainer.model(torch.tensor(vocab[corpus_test[idx]],device=torch.device(device)))
log_p_nn = [scores[vocab[corpus_test[idx]]] - torch.logsumexp(scores,dim=0)]
for idx in range(1,corpus_len):
    x = torch.tensor(vocab[corpus_test[idx]],device=torch.device(device))
    print(x)
    scores = trainer.model(x)
    print(scores)
    log_p_nn.append(scores[vocab[corpus_test[idx-1]]] - torch.logsumexp(scores,dim=0))

trainer.model.train()
print('El logaritmo de la perplejidad para el corpus de test es: {}'.format(-sum(log_p_nn)/corpus_len))

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)