# Natural Language Processing en Pytorch

In [1]:
import torch

## Tutorial 2: Implementación de Skip-Gram Word2Vec

### 1. Repaso del Modelo Skip-Gram

El objetivo es calcular la probabilidad de $P(\mathbf{O}|\mathbf{C})$ (probabilidad de las palabras de alrededor dada la palabra centra).

Podemos aproximar esta probabilidad con un modelo de Softmax:

$$
P(\mathbf{O}=o|\mathbf{C}=c) = \frac{\exp(\Theta x_c)}{\sum_{w\in Vocab}\exp(\theta_w^T x_c)}
$$

donde 

$$
\Theta = 
\begin{bmatrix}
-\;\theta_1^T \;- \\
-\; \theta_2^T \;- \\
\vdots \\
-\; \theta_{|V|}^T \; -
\end{bmatrix}
$$

y $x_c$ es un *one-hot* vector en la palabra $c$ del vocabulario $V$.
Si se define

$$
\Theta = U V
$$

con 

$$
\begin{align}
U = 
\begin{bmatrix}
- \; u_1^T \; - \\
- \; u_2^T \; - \\
\vdots \\
- \; u_{|V|}^T \; -
\end{bmatrix} & &
V = 
\begin{bmatrix}
| & | & & | \\
v_1 & v_2 & \cdots & v_{|V|} \\
| & | & & | \\
\end{bmatrix} & & 
u_i , v_i \in \mathbb{R}^{n} \; i= 1, \ldots, |V|
\end{align}
$$

puede verse que la expresión de la probabilidad condicional anterior queda

$$
\begin{align}
P(\mathbf{O}=o|\mathbf{C}=c) &= \frac{\exp(\Theta c)}{\sum_{w\in V}\exp(\theta_w^T c)} \\[.5em]
&= \frac{\exp(u_o^T v_c)}{\sum_{w\in V}\exp(u_w^T v_c)}
\end{align}
$$

donde $v_c$ es la columna de $V$ correspondiente a la plabra $c$ del vocabulario y $u_o^T$ es la fila $o$ de $U$ correspondiente a la palabra $o$ del vocabulario.

El siguiente paso es encontrar los parámetros de la distribución anterior, lo cual se hará minimizando la función de costo *Negative LogLikelihood* por gradiente estocástico. Para eso, se puede utilizar las relaciones

$$
\begin{align}
\frac{\partial J}{\partial U} = \frac{\partial J}{\partial \Theta} V^T & \hspace{3em} &
\frac{\partial J}{\partial V} = U^T \frac{\partial J}{\partial \Theta} 
\end{align}
$$

con $J(U,V) = NLL(P(o|c))$, o hacer el cálculo a mano a partir de la expresión completa:

$$
\begin{align}
J(U,V) &= - \log \left( \prod_{i=1}^N \prod_{j=1}^{|V|} P(o_i = j | x_i)^{\mathbb{1}_{\{y_i = j\} }} \right)
\end{align}
$$

En nuestro caso, vamos a hacer la implementación del gradiente descendiente con Pytorch, así que en principio no importan mucho estas fórmulas.

### 2. Corpus AG NEWS

Para los experimentos se va a usar el corpus de texto **AG NEWS**, que consiste en un conjunto de noticias junto con sus respectivas categorías. Cada una de estas noticias puede pertenecer a exactamente una de las siguientes categorías:

* Internacional (*World*)
* Deportes (*Sports*)
* Negocios (*Business*)
* Ciencia y Tecnología (*Sci/Tec*)

En el Tutorial 1 se explicó cómo entrenar un clasificador Softmax sin utilizar word embeddings (o entrenándolos junto con el modelo). El objetivo de este tutorial va a consistir en dos partes:

1. Entrenar los word embeddings de las palabras del vocabulario del corpus con el modelo Skip-Gram
2. Clasificar una cantidad de frases de las noticias según su respectiva categoría con un clasificador lineal con activación Softmax. 

Primero, definimos el dataset con las muestras de clasificación y el dataset para entrenar el modelo Skip-Gram.

In [2]:
from torchtext.datasets import text_classification
NGRAMS = 2
import os
if not os.path.isdir('../AG_NEWS'):
    os.mkdir('../AG_NEWS')


class AgNewsClassification(torch.utils.data.Dataset):
    
    unk_token = 'UNK_TOKEN'
    pad_token = 'PAD_TOKEN'
    
    def __init__(self, root_dir='./AG_NEWS', n_grams=2, train=True):
        
        super(AgNewsClassification, self).__init__()
        train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
            root=root_dir, ngrams=n_grams, vocab=None)
        
        if train:
            self.samples = train_dataset._data
            self.vocabulary = list(dict(train_dataset._vocab.freqs).keys())
            self.freqs = dict(train_dataset._vocab.freqs)
        else:
            self.samples = test_dataset._data
            self.vocabulary = list(dict(test_dataset._vocab.freqs).keys())
            self.freqs = dict(test_dataset._vocab.freqs)
            
        self.vocabulary.insert(0,self.pad_token)
        self.vocabulary.insert(1,self.unk_token)
        self.word_to_index = {w: idx for (idx, w) in enumerate(self.vocabulary)}
        self.index_to_word = {idx: w for (idx, w) in enumerate(self.vocabulary)}
        self.size_of_longest_sentence = max([len(sample[1]) for sample in self.samples])
        self.categories = ['World', 'Sports', 'Business', 'Sci/Tec']
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        label, text = self.samples[idx]
        text = torch.nn.functional.pad(text, 
                                       pad=(0,self.size_of_longest_sentence - len(text)),
                                       mode='constant', 
                                       value=self.word_to_index[self.pad_token])
        return text, label
    
class AgNewsSkipGram(torch.utils.data.Dataset):
    
    unk_token = 'UNK_TOKEN'
    pad_token = 'PAD_TOKEN'
    
    def __init__(self, root_dir='./AG_NEWS', n_grams=2, train=True, context_size=2):
        
        super(AgNewsSkipGram, self).__init__()
        train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
            root=root_dir, ngrams=n_grams, vocab=None)
        
        self.context_size = context_size
        
        if train:
            dataset = train_dataset
            n_sentences = len(train_dataset._data)
            self.vocabulary = list(dict(train_dataset._vocab.freqs).keys())
            self.freqs = dict(train_dataset._vocab.freqs)
        else:
            dataset = test_dataset
            n_sentences = len(test_dataset._data)
            self.vocabulary = list(dict(test_dataset._vocab.freqs).keys())
            self.freqs = dict(test_dataset._vocab.freqs)
            
        
        self.vocabulary.insert(0,self.pad_token)
        self.vocabulary.insert(1,self.unk_token)
        self.word_to_index = {w: idx for (idx, w) in enumerate(self.vocabulary)}
        self.index_to_word = {idx: w for (idx, w) in enumerate(self.vocabulary)}
        
        self.samples = self.getSamples([dataset[i][1] for i in range(n_sentences)])
    
    def getSamples(self, corpus):
        samples = []
        for sentence_idx in corpus:
            sentence = [self.index_to_word[int(j)] for j in sentence_idx]
            for i, word in enumerate(sentence):
                first_context_word_index = max(0,i-self.context_size)
                last_context_word_index = min(i+self.context_size+1, len(sentence))
                context = [self.pad_token for j in range(i-self.context_size,first_context_word_index)] + \
                          sentence[first_context_word_index:i] + \
                          sentence[i+1:last_context_word_index] + \
                          [self.pad_token for j in range(last_context_word_index,i+self.context_size+1)]
                samples.append((word, context))
                
        return samples
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        word, context = self.samples[idx]
        idx_context = torch.empty(len(context), dtype=torch.long)
        idx_word = torch.tensor(self.word_to_index[word], dtype=torch.long)
        for i, w in enumerate(context):
            idx_context[i] = self.word_to_index[w]

        return idx_word, idx_context

Datasets con las muestras para entrenar el modelo Skip-Gram:

In [3]:
context_size = 4
skg_train_dataset = AgNewsSkipGram(root_dir='../AG_NEWS', n_grams=2, train=True, context_size=context_size)
skg_val_dataset = AgNewsSkipGram(root_dir='../AG_NEWS', n_grams=2, train=True, context_size=context_size)
skg_test_dataset = AgNewsSkipGram(root_dir='../AG_NEWS', n_grams=2, train=False, context_size=context_size)

120000lines [00:04, 25902.36lines/s]
120000lines [00:08, 14242.72lines/s]
7600lines [00:00, 14548.79lines/s]
120000lines [00:04, 26238.38lines/s]
120000lines [00:08, 14335.69lines/s]
7600lines [00:00, 14314.11lines/s]
120000lines [00:04, 26311.80lines/s]
120000lines [00:08, 14496.14lines/s]
7600lines [00:00, 14458.83lines/s]


In [4]:
batch_size = 64 # Tamaño del batch
val_size = .02
NUM_TRAIN = int((1 - val_size) * len(skg_train_dataset))
NUM_VAL = len(skg_train_dataset) - NUM_TRAIN
sampler = lambda start, end: torch.utils.data.SubsetRandomSampler(range(start, end)) # Función para mezclar aleatoriamente las muestras


# Dataloader para las muestras de entrenamiento:
train_dataloader = torch.utils.data.DataLoader(skg_train_dataset, 
                                               batch_size=batch_size, 
                                               sampler=sampler(0, NUM_TRAIN))

# Dataloader para las muestras de validación:
val_dataloader = torch.utils.data.DataLoader(skg_val_dataset, 
                                             batch_size=batch_size, 
                                             sampler=sampler(NUM_TRAIN, NUM_TRAIN+NUM_VAL))

# Dataloader para las muestras de testeo:
test_dataloader = torch.utils.data.DataLoader(skg_test_dataset, 
                                              batch_size=batch_size)

### 2. Definición del modelo

Arquitectura del modelo:

1. Un Layer `nn.Embedding` que busca en una tabla los word embeddings de los índices ingresados a la entrada. 
2. Un Layer `nn.Linear` que devuelve los scores de cada categoría.
3. Una activación softmax para calcular las probabilidades de cada clase.

In [30]:
import torch.nn as nn

class SkipGram(nn.Module):
    
    def __init__(self, n_vectors, embedding_dim):
        
        super(SkipGram, self).__init__()
        self.emb = nn.Embedding(n_vectors, embedding_dim)
        self.linear = nn.Linear(embedding_dim, n_vectors)
        
    def forward(self, x):
        emb = self.emb(x)
        scores = self.linear(emb)
        return scores
    
    def loss(self, scores, target):
        cross_entropy = nn.CrossEntropyLoss()
        mean_loss = 0 
        context_size = target.size()[1]
        for i in range(context_size):
            mean_loss += cross_entropy(scores, target[:,i])
        return mean_loss / context_size
    
embedding_dim = 10
tokens = skg_train_dataset.word_to_index
nWords = len(skg_train_dataset.vocabulary)
skg_model = SkipGram(nWords, embedding_dim)

### 3. Entrenamiento de los word embeddings

In [32]:
import torch.optim as optim

def CheckAccuracy(loader, model, device, input_dtype, target_dtype):  
    num_correct = 0
    num_samples = 0
    model.eval()  
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=input_dtype)  
            y = y.to(device=device, dtype=target_dtype)
            
            scores = model(x)
            _, preds = scores.max(dim=1)
            num_correct += torch.tensor([(preds == y[:,i]).sum() for i in range(y.size()[1])]).sum()
            num_samples += preds.size(0)

        return num_correct, num_samples
        

def TrainModel(model, data, epochs=1, learning_rate=1e-2, sample_loss_every=100):
    
    input_dtype = data['input_dtype'] 
    target_dtype = data['target_dtype']
    device = data['device']
    train_dataloader = data['train_dataloader']
    val_dataloader = data['val_dataloader']
    
    performance_history = {'iter': [], 'loss': [], 'accuracy': []}
    
    model = model.to(device=device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    batch_size = len(train_dataloader)
    for e in range(epochs):
        for t, (x,y) in enumerate(train_dataloader):
            model.train()
            x = x.to(device=device, dtype=input_dtype)
            y = y.to(device=device, dtype=target_dtype)

            # Forward pass
            scores = model(x) 
            
            # Backward pass
            loss = model.loss(scores,y)                 
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (e * batch_size + t) % sample_loss_every == 0:
                num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
                performance_history['iter'].append(t)
                performance_history['loss'].append(loss.item())
                performance_history['accuracy'].append(float(num_correct) / num_samples)
                print('Epoch: %d, Iteration: %d, Accuracy: %d/%d ' % (e, t, num_correct, num_samples))
                
    num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
    print('Final accuracy: %.2f%%' % (100 * float(num_correct) / num_samples) )
    
    return performance_history

# Especificaciones de cómo adquirir los datos para entrenamiento:
use_gpu = True
if torch.cuda.is_available() and use_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

data = {
    'device': device,
    'input_dtype': torch.long,
    'target_dtype': torch.long,
    'train_dataloader': train_dataloader,
    'val_dataloader': val_dataloader
}

# Hiperparámetros del modelo y otros:
epochs = 10 # Cantidad de epochs
sample_loss_every = 10 # Cantidad de iteraciones para calcular la cantidad de aciertos
learning_rate = 5e-1 # Tasa de aprendizaje

# Entrenamiento:
performance_history = TrainModel(skg_model, data, epochs, learning_rate, sample_loss_every)

Epoch: 0, Iteration: 0, Accuracy: 1/205345 
Epoch: 0, Iteration: 10, Accuracy: 1/205345 
Epoch: 0, Iteration: 20, Accuracy: 1/205345 
Epoch: 0, Iteration: 30, Accuracy: 24/205345 
Epoch: 0, Iteration: 40, Accuracy: 121/205345 
Epoch: 0, Iteration: 50, Accuracy: 406/205345 
Epoch: 0, Iteration: 60, Accuracy: 1171/205345 
Epoch: 0, Iteration: 70, Accuracy: 2224/205345 
Epoch: 0, Iteration: 80, Accuracy: 4471/205345 
Epoch: 0, Iteration: 90, Accuracy: 7521/205345 
Epoch: 0, Iteration: 100, Accuracy: 12940/205345 
Epoch: 0, Iteration: 110, Accuracy: 17145/205345 
Epoch: 0, Iteration: 120, Accuracy: 21386/205345 
Epoch: 0, Iteration: 130, Accuracy: 26719/205345 
Epoch: 0, Iteration: 140, Accuracy: 30753/205345 
Epoch: 0, Iteration: 150, Accuracy: 33363/205345 
Epoch: 0, Iteration: 160, Accuracy: 36058/205345 
Epoch: 0, Iteration: 170, Accuracy: 37760/205345 
Epoch: 0, Iteration: 180, Accuracy: 40012/205345 
Epoch: 0, Iteration: 190, Accuracy: 40764/205345 
Epoch: 0, Iteration: 200, Accuracy

KeyboardInterrupt: 

### 4. Entrenamiento del modelo de clasificación

In [8]:
train_dataset = AgNewsClassification(root_dir='../AG_NEWS', n_grams=2, train=True)
val_dataset = AgNewsClassification(root_dir='../AG_NEWS', n_grams=2, train=True)
test_dataset = AgNewsClassification(root_dir='../AG_NEWS', n_grams=2, train=False)

120000lines [00:04, 26250.52lines/s]
120000lines [00:08, 14708.33lines/s]
7600lines [00:00, 14600.29lines/s]
120000lines [00:04, 26221.32lines/s]
120000lines [00:08, 14453.33lines/s]
7600lines [00:00, 14422.32lines/s]
120000lines [00:04, 26269.79lines/s]
120000lines [00:08, 14503.17lines/s]
7600lines [00:00, 14342.76lines/s]


In [33]:
batch_size = 1024 # Tamaño del batch
val_size = .02
NUM_TRAIN = int((1 - val_size) * len(train_dataset))
NUM_VAL = len(train_dataset) - NUM_TRAIN
sampler = lambda start, end: torch.utils.data.SubsetRandomSampler(range(start, end)) # Función para mezclar aleatoriamente las muestras


# Dataloader para las muestras de entrenamiento:
model_train_dataloader = torch.utils.data.DataLoader(train_dataset, 
                                               batch_size=batch_size, 
                                               sampler=sampler(0, NUM_TRAIN))

# Dataloader para las muestras de validación:
model_val_dataloader = torch.utils.data.DataLoader(val_dataset, 
                                             batch_size=batch_size, 
                                             sampler=sampler(NUM_TRAIN, NUM_TRAIN+NUM_VAL))

# Dataloader para las muestras de testeo:
model_test_dataloader = torch.utils.data.DataLoader(test_dataset, 
                                              batch_size=batch_size)

In [34]:
import torch.nn as nn

class SoftmaxClassifier(nn.Module):
    
    def __init__(self, n_vocab, n_classes, embedding_dim, pre_trained_embeddings):
        
        super(SoftmaxClassifier, self).__init__()
        if pre_trained_embeddings is not None:
            self.emb = nn.Embedding(pre_trained_embeddings.size(0), pre_trained_embeddings.size(1))
            self.emb.weight = nn.Parameter(pre_trained_embeddings)
            self.emb.weight.requires_grad = False
        else:
            self.emb = nn.Embedding(n_vocab, embedding_dim)
        self.linear = nn.Linear(embedding_dim, n_classes)
        
    def forward(self, x):
        emb = self.emb(x).mean(dim=1)
        scores = self.linear(emb)
        return scores
    
    def loss(self, scores, target):
        cross_entropy = nn.CrossEntropyLoss()
        return cross_entropy(scores, target)
    
embedding_dim = 10
tokens = train_dataset.word_to_index
nWords = len(train_dataset.vocabulary)
Classifier = SoftmaxClassifier(nWords, len(train_dataset.categories), embedding_dim, skg_model.emb.weight)
#Classifier = SoftmaxClassifier(nWords, len(train_dataset.categories), embedding_dim, None)

In [35]:
import torch.optim as optim

def CheckAccuracy(loader, model, device, input_dtype, target_dtype):  
    num_correct = 0
    num_samples = 0
    model.eval()  
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=input_dtype)  
            y = y.to(device=device, dtype=target_dtype)
            
            scores = model(x)
            _, preds = scores.max(dim=1)
            num_correct += (preds == y).sum() 
            num_samples += preds.size(0)

        return num_correct, num_samples
        

def TrainModel(model, data, epochs=1, learning_rate=1e-2, sample_loss_every=100):
    
    input_dtype = data['input_dtype'] 
    target_dtype = data['target_dtype']
    device = data['device']
    train_dataloader = data['train_dataloader']
    val_dataloader = data['val_dataloader']
    
    performance_history = {'iter': [], 'loss': [], 'accuracy': []}
    
    model = model.to(device=device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    batch_size = len(train_dataloader)
    for e in range(epochs):
        for t, (x,y) in enumerate(train_dataloader):
            model.train()
            x = x.to(device=device, dtype=input_dtype)
            y = y.to(device=device, dtype=target_dtype)

            # Forward pass
            scores = model(x) 
            
            # Backward pass
            loss = model.loss(scores,y)                 
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (e * batch_size + t) % sample_loss_every == 0:
                num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
                performance_history['iter'].append(t)
                performance_history['loss'].append(loss.item())
                performance_history['accuracy'].append(float(num_correct) / num_samples)
                print('Epoch: %d, Iteration: %d, Accuracy: %d/%d ' % (e, t, num_correct, num_samples))
                
    num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
    print('Final accuracy: %.2f%%' % (100 * float(num_correct) / num_samples) )
    
    return performance_history

# Especificaciones de cómo adquirir los datos para entrenamiento:
use_gpu = True
if torch.cuda.is_available() and use_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

data = {
    'device': device,
    'input_dtype': torch.long,
    'target_dtype': torch.long,
    'train_dataloader': model_train_dataloader,
    'val_dataloader': model_val_dataloader
}

# Hiperparámetros del modelo y otros:
epochs = 100 # Cantidad de epochs
sample_loss_every = 100 # Cantidad de iteraciones para calcular la cantidad de aciertos
learning_rate = 1e-3 # Tasa de aprendizaje

# Entrenamiento:
performance_history = TrainModel(Classifier, data, epochs, learning_rate, sample_loss_every)

Epoch: 0, Iteration: 0, Accuracy: 665/2400 
Epoch: 0, Iteration: 100, Accuracy: 665/2400 
Epoch: 1, Iteration: 85, Accuracy: 665/2400 
Epoch: 2, Iteration: 70, Accuracy: 665/2400 
Epoch: 3, Iteration: 55, Accuracy: 665/2400 
Epoch: 4, Iteration: 40, Accuracy: 665/2400 
Epoch: 5, Iteration: 25, Accuracy: 665/2400 
Epoch: 6, Iteration: 10, Accuracy: 665/2400 
Epoch: 6, Iteration: 110, Accuracy: 665/2400 
Epoch: 7, Iteration: 95, Accuracy: 665/2400 
Epoch: 8, Iteration: 80, Accuracy: 666/2400 
Epoch: 9, Iteration: 65, Accuracy: 666/2400 
Epoch: 10, Iteration: 50, Accuracy: 665/2400 
Epoch: 11, Iteration: 35, Accuracy: 663/2400 
Epoch: 12, Iteration: 20, Accuracy: 660/2400 
Epoch: 13, Iteration: 5, Accuracy: 659/2400 
Epoch: 13, Iteration: 105, Accuracy: 650/2400 
Epoch: 14, Iteration: 90, Accuracy: 649/2400 
Epoch: 15, Iteration: 75, Accuracy: 652/2400 
Epoch: 16, Iteration: 60, Accuracy: 657/2400 
Epoch: 17, Iteration: 45, Accuracy: 659/2400 
Epoch: 18, Iteration: 30, Accuracy: 664/2400 

KeyboardInterrupt: 