# Transformer

En este laboratorio vamos a implementar una arquitectura de transformer desde cero. Recuerden usar la GPU de colab para acelerar el entrenamiento.

In [None]:
#!pip install spacy
#!pip install typing_extensions==4.7.1
#!pip install torch==1.8.0 torchtext==0.9.0

In [None]:
import copy
import math
import random
import time

import pandas as pd
import spacy
import torch
import torch.nn as nn
import torchtext
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torchtext.legacy.data import Field, BucketIterator, TabularDataset

# Use the first CUDA device when one is available; otherwise run on the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Seed the PyTorch and Python random number generators.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Download the spaCy pipelines under their full package names. These are the
# names handed to spacy.load() later in this file; the short "en"/"fr"
# aliases are legacy shortcuts that spaCy 3 no longer accepts.
spacy.cli.download("en_core_web_sm")
spacy.cli.download("fr_core_news_sm")

# Datos

Vamos a seguir trabajando con los datos de parejas de oraciones en Francés-Inglés.

    I am cold.    J'ai froid.


In [None]:
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip

In [None]:
# Read the tab-separated sentence pairs (the file has no header row),
# label the two columns and display the frame.
dataset = pd.read_csv(
    "data/eng-fra.txt",
    delimiter="\t",
    header=None,
)
dataset.columns = ["English", "French"]
dataset

In [None]:
# dataset = dataset.sample(int(len(dataset)*0.4))

In [None]:
# Drop sentence pairs in which either side is too long

MAX_SEQ_LEN = 50

# Sentence length is approximated by the number of spaces in the string
en_spaces = dataset['English'].str.count(' ')
fr_spaces = dataset['French'].str.count(' ')
is_short_pair = (fr_spaces < MAX_SEQ_LEN) & (en_spaces < MAX_SEQ_LEN)

# Keep only the two text columns of the rows that pass the filter
dataset = dataset.loc[is_short_pair, ['English', 'French']]

In [None]:
# Split dataset into train (80%), val (10%) and test (10%).
# Both splits pass random_state explicitly: scikit-learn draws from its own
# random source, which torch.manual_seed / random.seed do not control, so
# without it the val/test partition would change on every run.
train, val_test = train_test_split(dataset, test_size=0.2, random_state=RANDOM_SEED)
val, test = train_test_split(val_test, test_size=0.5, random_state=RANDOM_SEED)

# Save splits to CSV files (each file starts with an "English,French" header row)
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)
test.to_csv("test.csv", index=False)

In [None]:
# Load the small English and French spaCy pipelines; the tokenizer helpers
# defined below use only their `.tokenizer` component.
en = spacy.load("en_core_web_sm")
fr = spacy.load("fr_core_news_sm")

def tokenize_en(sentence):
    """Return the token strings produced by the English spaCy tokenizer."""
    tokens = []
    for token in en.tokenizer(sentence):
        tokens.append(token.text)
    return tokens
  
def tokenize_fr(sentence):
    """Return the token strings produced by the French spaCy tokenizer."""
    tokens = []
    for token in fr.tokenizer(sentence):
        tokens.append(token.text)
    return tokens

# English (source) field: spaCy tokenization, fixed length of MAX_SEQ_LEN.
EN_TEXT = Field(
    tokenize=tokenize_en,
    fix_length=MAX_SEQ_LEN,
)
# French (target) field: same fixed length, with explicit start/end markers.
FR_TEXT = Field(
    tokenize=tokenize_fr,
    init_token="<sos>",
    eos_token="<eos>",
    fix_length=MAX_SEQ_LEN,
)

In [None]:
# Associate the text in the 'English' column with the EN_TEXT field,
# and 'French' with FR_TEXT
data_fields = [('English', EN_TEXT), ('French', FR_TEXT)]

# train.csv / val.csv were written by DataFrame.to_csv and therefore start
# with an "English,French" header row. skip_header=True keeps that row from
# being loaded as if it were a sentence pair.
train, val = TabularDataset.splits(
    path='./',
    train='train.csv',
    validation='val.csv',
    format='csv',
    fields=data_fields,
    skip_header=True,
)

In [None]:
# Build both vocabularies from the training and validation examples
FR_TEXT.build_vocab(train, val)
EN_TEXT.build_vocab(train, val)

# Training iterator: shuffled batches of 32 examples, with the French
# sentence length as the sort key.
train_iter = BucketIterator(
    dataset=train,
    batch_size=32,
    shuffle=True,
    sort_key=lambda example: len(example.French),
)

# Armando el transformer paso a paso

![Transformer architecture](https://miro.medium.com/max/1140/1*2vyKzFlzIHfSmOU_lnQE4A.png)

El diagrama ilustra el modelo que vamos a implementar. Los inputs al encoder son las oraciones en inglés, y los "Outputs" que entran al decoder son las oraciones en francés (en el código, `src` es el inglés y `trg` el francés).

Necesitamos entender 5 procesos para implementar el modelo:
- Embedding de los inputs
- Encoding Posicional
- Creación de máscaras
- La capa de Multi-Head Attention
- La capa Feed-Forward

## Encoding Posicional
----
El embedding de cada palabra aprende su significado; ahora necesitamos una manera de que la red aprenda sobre la posición de cada palabra en la oración.

[Vaswani *et al.*](https://arxiv.org/abs/1706.03762) respondieron esta pregunta usando las siguientes funciones para crear valores constantes relacionados a cada posición:

$$ PE_{(pos, 2i)} = sin\left(\frac{pos}{10000^{2i/d_{model}}}\right) $$
$$ PE_{(pos, 2i+1)} = cos\left(\frac{pos}{10000^{2i/d_{model}}}\right) $$

Esta constante es una matriz en 2D con una de las dimensiones de igual tamaño que los embeddings y la otra igual a la cantidad de palabras en la sentencia.

![Positional encoding matrix](https://miro.medium.com/max/1359/1*B-VR6R5vJl3Y7jbMNf5Fpw.png)

![Positional encoding example](http://jalammar.github.io/images/t/transformer_positional_encoding_example.png)

In [None]:
class PositionalEncoder(nn.Module):
    """Scale embeddings and add fixed sinusoidal positional encodings.

    Args:
        d_model: size of the last (embedding) dimension of the inputs.
        max_seq_len: number of positions for which encodings are precomputed;
            inputs passed to ``forward`` must not be longer than this.
        dropout: dropout probability applied to the result.
    """

    def __init__(self, d_model, max_seq_len=MAX_SEQ_LEN, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        # Create constant 'pe' matrix with values dependent on pos and i
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = 1.0 / torch.pow(10000, torch.arange(0, d_model, 2).float() / d_model)
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 angle columns are used for the cosine.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading singleton dimension so the table broadcasts over the batch
        pe = pe.unsqueeze(0)

        # Registered as a buffer, not a parameter: it is stored with the
        # module but the optimizer does not see it as something to optimize.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x * sqrt(d_model) + pe)``.

        ``x`` is indexed as (batch, seq_len, d_model): dimension 1 selects how
        many precomputed positions are added.
        """
        # Make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        # Add constant to embedding
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

## Máscaras para los inputs
----
Las máscaras de ceros cumplen dos propósitos:

- En **ambos** encoder y decoder: Para obtener 0s en la atención sobre el padding.
- En el **decoder**: Prevenir que el decoder "espíe" a los siguientes inputs de la secuencia traducida, el futuro que no debería conocer para predecir la siguiente palabra. Evita que el decoder "vea" lo que tiene que predecir antes de predecirlo.

In [None]:
def build_attn_pad_mask(seq, pad_index):
    """Return a uint8 mask with 1 at real tokens and 0 wherever ``seq`` is padding.

    A singleton dimension is inserted at position 1 of the result.
    """
    is_token = seq.ne(pad_index)
    return is_token.unsqueeze(1).to(torch.uint8)


def build_nopeak_mask(size):
    """Return a (1, size, size) uint8 lower-triangular mask.

    Row i holds 1s in columns 0..i (up to and including the position being
    predicted) and 0s in every later column.
    """
    lower_triangle = torch.tril(torch.ones(size, size))
    return lower_triangle.unsqueeze(0).to(torch.uint8)

def create_masks(src, src_pad, trg=None, trg_pad=None):
    """Build the attention masks for a source batch and, optionally, a target batch.

    Args:
        src: source token ids.
        src_pad: padding index used in ``src``.
        trg: target token ids, or None when only the source mask is needed.
            Dimension 1 is read as the sequence length.
        trg_pad: padding index used in ``trg``.

    Returns:
        ``(src_mask, trg_mask)``. ``src_mask`` is the padding mask of ``src``.
        ``trg_mask`` is the padding mask of ``trg`` AND-ed with the no-peek
        mask, or None when ``trg`` is None.
    """
    src_mask = build_attn_pad_mask(src, src_pad)

    if trg is None:
        return src_mask, None

    trg_mask = build_attn_pad_mask(trg, trg_pad)
    seq_len = trg.size(1)
    # build_nopeak_mask always allocates on the CPU; move its result next to
    # the padding mask so the `&` below also works when `trg` is on the GPU.
    np_mask = build_nopeak_mask(seq_len).to(trg_mask.device)
    return src_mask, trg_mask & np_mask

## Multi-Head Attention
----

![Multihead attention schema](https://miro.medium.com/max/1254/1*1tsRtfaY9z6HxmERYhw8XQ.png)

$V$, $K$ y $Q$ representan ‘value’, ‘key’ y ‘query’. En el caso del Encoder, $V$, $K$ y $Q$ serán simplemente copias idénticas del vector de embedding (junto con el encoding posicional). Tendrán las siguientes dimensiones $\text{batch_size} \times \text{seq_len} \times d_\text{model}$.

En multi-head attention repartimos el vector de embedding en $N$ cabezas, por lo que tendrán las dimensiones: $\text{batch_size} \times N \times \text{seq_len} \times (d_{\text{model}} / N)$.

La dimensión final: ($d_{\text{model}} / N$) es a lo que llamaremos $d_k$.

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        heads: number of attention heads; must divide ``d_model``.
        d_model: size of the last dimension of the inputs and of the output.
        attn_pdrop: dropout probability applied to the attention weights.
        resid_pdrop: dropout probability applied after the output projection.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, heads, d_model, attn_pdrop=0.1, resid_pdrop=0.1):
        super().__init__()
        if d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by heads ({heads})"
            )

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        # Q, K, V
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        # Regularization
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)

        # Output projection
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: tensors viewed as (batch, seq_len, d_model); ``k`` and
                ``v`` may have a different seq_len than ``q``.
            mask: optional mask forwarded unchanged to ``attention``.

        Returns:
            Tensor of shape (batch, q_seq_len, d_model).
        """
        # Get batch size
        bs = q.size(0)

        # Perform linear operation and split into h heads
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # Transpose to get dimensions bs * h * sl * d_k
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Calculate the attention output with the module-level attention() function
        scores = attention(q, k, v, self.d_k, mask, self.attn_dropout)

        # Re-assemble all head outputs side by side
        # Need to use contiguous here so that view() can merge the head dims
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)

        # Output projection
        output = self.resid_dropout(self.out(concat))

        return output

### Mecanismo de Atención
----

![Attention diagram](https://miro.medium.com/max/336/1*15E9qKg9bKnWdSRWCyY2iA.png)
![Attention equation](https://miro.medium.com/max/1068/1*evdACdTOBT5j1g1nXialBg.png)

Inicialmente, multiplicamos $Q$ por la transpuesta de $K$. Esto es luego dividido por $\sqrt{d_k}$ (normalización).

Algo que aún no vimos es qué hacer con atención y las máscaras. Antes de hacer el Softmax, aplicamos nuestra máscara de ceros para reducir los valores donde el input es padding (o futuro).

Finalmente, el último paso es hacer el producto (dot product) entre el resultado hasta ahora y $V$.

In [None]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q, k, v: query, key and value tensors; the last two dimensions of
            ``q`` and ``k`` are multiplied as (seq, d_k) x (d_k, seq).
        d_k: scaling dimension; scores are divided by ``sqrt(d_k)``.
        mask: optional tensor; positions where it equals 0 are excluded from
            the softmax. It is unsqueezed at dim 1 before being applied
            (presumably so it broadcasts over a heads dimension).
        dropout: optional callable applied to the attention weights.

    Returns:
        The attention-weighted combination of ``v``.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        mask = mask.unsqueeze(1)
        # A large negative score becomes ~0 weight after the softmax
        scores = scores.masked_fill(mask == 0, -1e9)

    # The softmax must run whether or not a mask was given; otherwise the
    # unmasked path would multiply raw, unnormalised scores with v.
    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    output = torch.matmul(scores, v)
    return output

## Capa Feed-Forward
----
Esta capa solo consiste en dos operaciones lineales (nn.Linear) con ReLU y dropout entre ellas.

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: size of the input and output features.
        d_ff: size of the hidden layer; defaults to ``4 * d_model``.
        dropout: dropout probability applied between the two linear layers.
    """

    def __init__(self, d_model, d_ff=None, dropout=0.1):
        super().__init__()
        if d_ff is None:
            d_ff = 4 * d_model
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply the block to the last dimension of ``x``; the shape is preserved."""
        hidden = self.dropout(F.relu(self.linear_1(x)))
        return self.linear_2(hidden)

# Combinando todo

Vamos a crear una capa EncoderLayer y DecoderLayer que agrupan los componentes necesarios para crear un solo encoder (o decoder).

Luego, creamos el Encoder de nuestra arquitectura, conteniendo N de estos bloques anteriores. Repetimos para el decoder.

![Transformer architecture](https://miro.medium.com/max/1140/1*2vyKzFlzIHfSmOU_lnQE4A.png)


In [None]:
# Build an encoder layer with one multi-head attention layer and one
# feed-forward layer
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by Add & Norm.

    Args:
        d_model: feature size of the inputs and outputs.
        heads: number of attention heads.
        dropout: dropout probability handed to the attention module and
            applied to the feed-forward output.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(heads, d_model, dropout, dropout)
        self.ff = FeedForward(d_model)
        self.dropout = nn.Dropout(dropout)
        # One layer normalisation per residual connection
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        """Run the block on ``x`` using ``mask`` for the self-attention."""
        # Self-attention sub-layer. The attention module is constructed with
        # `dropout` as its resid_pdrop, so no extra dropout is added here.
        x = self.norm_1(x + self.attn(x, x, x, mask))
        # Feed-forward sub-layer with dropout on the residual branch
        x = self.norm_2(x + self.dropout(self.ff(x)))
        return x
    
# Build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder attention, feed-forward.

    Each of the three sub-layers is wrapped in a residual connection followed
    by layer normalisation.

    Args:
        d_model: feature size of the inputs and outputs.
        heads: number of attention heads.
        dropout: dropout probability handed to both attention modules and
            applied to the feed-forward output.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # Pass `dropout` through to the attention modules, as EncoderLayer
        # does, so the argument is honoured everywhere in the block.
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout, dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout, dropout)
        self.ff = FeedForward(d_model)

        # One layer normalisation per residual connection
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.norm_3 = nn.LayerNorm(d_model)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """Run the block.

        Args:
            x: decoder-side input.
            e_outputs: encoder output, used as keys and values of the second
                attention.
            src_mask: mask for the attention over ``e_outputs``.
            trg_mask: mask for the self-attention over ``x``.
        """
        # Masked self-attention over the target sequence
        x = self.norm_1(x + self.attn_1(x, x, x, trg_mask))
        # Attention from the target positions over the encoder output
        x = self.norm_2(x + self.attn_2(x, e_outputs, e_outputs, src_mask))
        # Feed-forward sub-layer with dropout on the residual branch
        x = self.norm_3(x + self.dropout(self.ff(x)))
        return x

# Helper that stacks N independent deep copies of a module:
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

In [None]:
class Encoder(nn.Module):
    """Embedding, positional encoding and a stack of ``N`` encoder layers.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / feature size.
        N: number of stacked ``EncoderLayer`` copies.
        heads: number of attention heads per layer.
    """

    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)

    def forward(self, src, mask):
        """Encode ``src`` (integer token ids), passing ``mask`` to every layer."""
        x = self.embed(src)
        x = self.pe(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x
    
class Decoder(nn.Module):
    """Embedding, positional encoding and a stack of ``N`` decoder layers.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / feature size.
        N: number of stacked ``DecoderLayer`` copies.
        heads: number of attention heads per layer.
    """

    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        """Decode ``trg`` (integer token ids) against the encoder output.

        ``e_outputs``, ``src_mask`` and ``trg_mask`` are passed unchanged to
        every decoder layer.
        """
        x = self.embed(trg)
        x = self.pe(x)
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return x

Finalmente, usando los dos bloques anteriores y una capa lineal creamos el transformer con $N$ encoders/decoders!

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder transformer with a linear projection onto the target vocabulary.

    The submodules are exposed as ``encoder``, ``decoder`` and ``out``; the
    ``translate`` function in this file accesses them under those names.

    Args:
        src_vocab: size of the source vocabulary.
        trg_vocab: size of the target vocabulary.
        d_model: embedding / feature size.
        N: number of encoder layers and of decoder layers.
        heads: number of attention heads per layer.
    """

    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        """Return un-normalised scores over the target vocabulary.

        The result has ``trg_vocab`` entries in its last dimension, one score
        per target token; no softmax is applied here.
        """
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output

# Entrenando

In [None]:
# Model hyperparameters
d_model = 128
heads = 4
N = 3

# English is the source language, French the target
src_vocab, trg_vocab = len(EN_TEXT.vocab), len(FR_TEXT.vocab)

model = Transformer(src_vocab, trg_vocab, d_model, N, heads)
model = model.to(DEVICE)

# This code is very important! It initialises the parameters with a
# range of values that stops the signal fading or getting too big.
# See this blog for a mathematical explanation:
# https://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization
for p in model.parameters():
    # Only tensors with more than one dimension (weight matrices) are
    # re-initialised; 1-D parameters keep their default values.
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

In [None]:
def train_model(epochs, print_every=100):
    """Train the global ``model`` on ``train_iter`` for ``epochs`` epochs.

    Uses the module-level ``model``, ``optim``, ``train_iter``, ``EN_TEXT``,
    ``FR_TEXT`` and ``DEVICE``. Every ``print_every`` iterations the average
    loss over that window and the elapsed time are printed.

    Args:
        epochs: number of passes over ``train_iter``.
        print_every: number of iterations between progress reports.
    """
    model.train()

    run_start = time.time()
    window_start = run_start

    src_pad = EN_TEXT.vocab.stoi['<pad>']
    trg_pad = FR_TEXT.vocab.stoi['<pad>']

    for epoch in range(epochs):
        running_loss = 0
        for step, batch in enumerate(train_iter, start=1):
            # Batches arrive with the sequence dimension first; swap the
            # first two dimensions.
            src = batch.English.transpose(0, 1)
            trg = batch.French.transpose(0, 1)

            # Decoder input: every target token except the last one, since
            # each token is used to predict the one that follows it.
            trg_input = trg[:, :-1]
            # Prediction targets: the same sequence shifted left by one.
            targets = trg[:, 1:].contiguous().view(-1)

            src_mask, trg_mask = create_masks(src, src_pad, trg_input, trg_pad)

            optim.zero_grad()

            preds = model(
                src.to(DEVICE),
                trg_input.to(DEVICE),
                src_mask.to(DEVICE),
                trg_mask.to(DEVICE),
            )

            # Flatten to (tokens, vocab) and ignore padded target positions
            flat_preds = preds.view(-1, preds.size(-1))
            loss = F.cross_entropy(
                flat_preds,
                targets.to(DEVICE),
                ignore_index=trg_pad,
            )
            loss.backward()
            optim.step()
            running_loss += loss.item()

            if step % print_every != 0:
                continue

            loss_avg = running_loss / print_every
            print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters" % (
                (time.time() - run_start) // 60, epoch + 1, step, loss_avg, time.time() - window_start, print_every)
            )
            running_loss = 0
            window_start = time.time()

In [None]:
# Adam optimizer over all model parameters.
# NOTE: this rebinds the name `optim`, which the import section bound to the
# torch.optim module; train_model() reads this global.
optim = torch.optim.Adam(model.parameters(), lr=2e-4)

# Train for a single epoch here; use 10 epochs (~60min) for a full run
train_model(1)

In [None]:
def translate(model, src, max_len=80):
    """Greedily translate one English sentence with ``model``.

    Args:
        model: object exposing ``encoder``, ``decoder`` and ``out``.
        src: English sentence as a plain string.
        max_len: upper bound on the decoded length, counting the start token.
            It is capped at MAX_SEQ_LEN, the default number of positions the
            positional encoders precompute.

    Returns:
        The predicted French tokens joined by single spaces. The ``<sos>`` and
        ``<eos>`` markers are not included. An input with no tokens yields ''.
    """
    model.eval()

    # Run on whatever device the model lives on instead of assuming CUDA
    device = next(model.parameters()).device

    input_pad = EN_TEXT.vocab.stoi['<pad>']
    sos_idx = FR_TEXT.vocab.stoi['<sos>']
    eos_idx = FR_TEXT.vocab.stoi['<eos>']

    # Longer sequences would not fit the precomputed positional encodings
    max_len = min(max_len, MAX_SEQ_LEN)

    src_tokens = tokenize_en(src)[:MAX_SEQ_LEN]
    if not src_tokens:
        return ''

    src_ids = torch.LongTensor(
        [[EN_TEXT.vocab.stoi[tok] for tok in src_tokens]]
    ).to(device)
    src_mask = (src_ids != input_pad).unsqueeze(-2)

    generated = [sos_idx]
    # Inference only: no gradients are needed
    with torch.no_grad():
        e_outputs = model.encoder(src_ids, src_mask)

        for _ in range(1, max_len):
            trg = torch.LongTensor([generated]).to(device)
            # Lower-triangular mask (diagonal included): each position may
            # attend to itself and to earlier positions, as during training.
            trg_mask = build_nopeak_mask(trg.size(1)).to(device)

            logits = model.out(model.decoder(trg, e_outputs, src_mask, trg_mask))
            # Greedy choice: highest-scoring token at the last position
            next_idx = logits[0, -1].argmax().item()

            if next_idx == eos_idx:
                break
            generated.append(next_idx)

    # Skip the leading <sos> marker
    return ' '.join(FR_TEXT.vocab.itos[ix] for ix in generated[1:])

In [None]:
# Translate a sample English sentence with the model trained above
translate(model, "How are you ?")