# [**Build your own Transformer from scratch using Pytorch**](https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb)



## 1. Import necessary libraries and modules

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

src_vocab_size (Tamaño del Vocabulario de la Fuente): Este parámetro define el tamaño del vocabulario para el idioma de entrada (fuente). Un tamaño de vocabulario de 5000 significa que el modelo usará 5000 palabras/tokens únicos para representar el texto en el idioma de entrada.

tgt_vocab_size (Tamaño del Vocabulario del Objetivo): Similar al src_vocab_size, pero para el idioma de salida (objetivo). Define cuántas palabras/tokens únicos se usarán para representar el texto en el idioma de salida.

num_layers (Número de Capas): Este parámetro define cuántas capas de codificadores y decodificadores tiene el Transformer. Cada capa aprende diferentes aspectos de los datos. Aquí, 6 capas significan que hay 6 capas de codificadores y 6 capas de decodificadores en el modelo.


max_seq_length (Longitud Máxima de Secuencia): Define la longitud máxima de las secuencias de entrada y salida que el modelo puede manejar. Un valor de 100 significa que el modelo puede procesar secuencias de hasta 100 tokens.

dropout: Es una técnica para prevenir el sobreajuste en redes neuronales. El valor de 0.1 indica que hay una probabilidad del 10% de que cualquier neurona se "apague" durante el entrenamiento, lo que ayuda a que el modelo sea más robusto.


## 2. Define the basic building blocks: Multi-Head Attention, Position-wise Feed-Forward Networks, Positional Encoding
![Multi-Head Attention](resources/multi_head_attention.png)


In [26]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into ``num_heads``
    heads of ``head_dim`` features each, attended independently, and finally
    concatenated and passed through an output projection.
    """

    def __init__(self, embedd_dim, num_heads):
        """Create the projection layers.

        Args:
            embedd_dim (int): Size of the embedding vector of each token; also
                the input and output width of every projection in this module.
            num_heads (int): Number of attention heads. Must divide
                ``embedd_dim`` so every head receives the same number of
                features.
        """
        super().__init__()
        assert embedd_dim % num_heads == 0, "embedd_dim must be divisible by num_heads"

        self.embedd_dim = embedd_dim
        self.num_heads = num_heads
        # Number of features handled by each individual head.
        self.head_dim = embedd_dim // num_heads

        # Input projections carry no bias; the output projection does.
        self.W_query = nn.Linear(embedd_dim, embedd_dim, bias=False)
        self.W_key = nn.Linear(embedd_dim, embedd_dim, bias=False)
        self.W_value = nn.Linear(embedd_dim, embedd_dim, bias=False)
        self.W_out = nn.Linear(embedd_dim, embedd_dim)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend over ``V`` using the similarity between ``Q`` and ``K``.

        Args:
            Q, K, V: Per-head tensors as produced by ``split_heads``, i.e.
                (batch_size, num_heads, seq_length, head_dim).
            mask: Optional tensor broadcastable to the score matrix
                (batch_size, num_heads, query_length, key_length). Positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape (batch_size, num_heads, query_length, head_dim).
        """
        # torch.matmul multiplies over the last two dimensions, giving one
        # (query_length, key_length) score matrix per batch element and head.
        scale = math.sqrt(self.head_dim)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale

        if mask is not None:
            # A large negative score becomes ~0 probability after softmax.
            scores = scores.masked_fill(mask == 0, -1e9)

        # Normalise over the key dimension: row k holds the weight that query
        # position k assigns to every key position.
        weights = torch.softmax(scores, dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape (batch_size, seq_length, embedd_dim) into
        (batch_size, num_heads, seq_length, head_dim)."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: merge the heads back into
        (batch_size, seq_length, embedd_dim)."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() returns a non-contiguous view; view() needs contiguous memory.
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch_size, seq_length, self.embedd_dim)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, and project the merged result.

        Args:
            Q, K, V: Tensors of shape (batch_size, seq_length, embedd_dim).
            mask: Optional attention mask, see ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (batch_size, query_length, embedd_dim).
        """
        queries = self.split_heads(self.W_query(Q))
        keys = self.split_heads(self.W_key(K))
        values = self.split_heads(self.W_value(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_out(self.combine_heads(context))

In [27]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network acting on the last input dimension.

    The input is expanded from ``embedd_dim`` to ``feed_forward_dim``, passed
    through a ReLU, and projected back to ``embedd_dim``. ``nn.Linear`` only
    transforms the last dimension, so the same weights are applied
    independently at every sequence position.
    """

    def __init__(self, embedd_dim, feed_forward_dim):
        """
        Args:
            embedd_dim (int): Input and output feature size.
            feed_forward_dim (int): Width of the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(embedd_dim, feed_forward_dim)
        self.fc2 = nn.Linear(feed_forward_dim, embedd_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; the output has the same shape as ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [28]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    For position ``pos`` and feature pair index ``i`` the table holds
    ``sin(pos / 10000^(2i / embedd_dim))`` in even columns and the matching
    cosine in odd columns. The table is computed once and stored as the
    non-trainable buffer ``pe`` of shape (1, max_seq_length, embedd_dim).
    """

    def __init__(self, embedd_dim, max_seq_length):
        """
        Args:
            embedd_dim (int): Embedding size (even or odd).
            max_seq_length (int): Number of positions to precompute.
        """
        super().__init__()

        pe = torch.zeros(max_seq_length, embedd_dim)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        # exp(2i * -ln(10000) / d) == 10000^(-2i / d): one frequency per column pair.
        div_term = torch.exp(
            torch.arange(0, embedd_dim, 2).float() * -(math.log(10000.0) / embedd_dim)
        )
        angles = position * div_term  # (max_seq_length, ceil(embedd_dim / 2))

        pe[:, 0::2] = torch.sin(angles)
        # With an odd embedd_dim there is one fewer odd column than even
        # columns, so drop the surplus frequency instead of failing on a
        # shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : embedd_dim // 2])

        # Buffer: moves with the module across devices and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (batch_size, seq_length, embedd_dim) with
                ``seq_length <= max_seq_length``.
        """
        return x + self.pe[:, :x.size(1)]

## 3. Build the Encoder and Decoder layers

The **EncoderLayer** class initializes with input parameters and components, including a MultiHeadAttention module, a PositionWiseFeedForward module, two layer normalization modules, and a dropout layer. The forward method computes the encoder layer output by applying self-attention, adding the attention output to the input tensor, and normalizing the result. Then, it computes the position-wise feed-forward output, combines it with the normalized self-attention output, and normalizes the final result before returning the processed tensor.

![Encoder](resources/encoder.png)

In [29]:
class EncoderLayer(nn.Module):
    """One Transformer encoder layer: self-attention followed by feed-forward.

    Each sub-layer's output goes through dropout, is added back to that
    sub-layer's input (residual connection) and is then layer-normalised.
    """

    def __init__(self, embedd_dim, num_heads, feed_forward_dim, dropout):
        """
        Args:
            embedd_dim (int): Feature size of the layer's input and output.
            num_heads (int): Number of self-attention heads.
            feed_forward_dim (int): Hidden width of the feed-forward sub-layer.
            dropout (float): Dropout probability applied to each sub-layer output.
        """
        super().__init__()
        self.self_attn = MultiHeadAttention(embedd_dim, num_heads)
        self.feed_forward = PositionWiseFeedForward(embedd_dim, feed_forward_dim)
        self.norm1 = nn.LayerNorm(embedd_dim)
        self.norm2 = nn.LayerNorm(embedd_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the layer on ``x``; ``mask`` is forwarded to self-attention."""
        # Sub-layer 1: self-attention (queries, keys and values are all x).
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

The **DecoderLayer** initializes with input parameters and components such as MultiHeadAttention modules for masked self-attention and cross-attention, a PositionWiseFeedForward module, three layer normalization modules, and a dropout layer.

![Decoder](resources/decoder.png)

In [30]:
class DecoderLayer(nn.Module):
    """One Transformer decoder layer.

    Three sub-layers are applied in order: masked self-attention over the
    decoder input, cross-attention over the encoder output, and a
    position-wise feed-forward network. Each sub-layer's output goes through
    dropout, a residual addition and layer normalisation.
    """

    def __init__(self, embedd_dim, num_heads, feed_forward_dim, dropout):
        """
        Args:
            embedd_dim (int): Feature size of the layer's input and output.
            num_heads (int): Number of heads in both attention modules.
            feed_forward_dim (int): Hidden width of the feed-forward sub-layer.
            dropout (float): Dropout probability applied to each sub-layer output.
        """
        super().__init__()
        self.self_attn = MultiHeadAttention(embedd_dim, num_heads)
        self.cross_attn = MultiHeadAttention(embedd_dim, num_heads)
        self.feed_forward = PositionWiseFeedForward(embedd_dim, feed_forward_dim)
        self.norm1 = nn.LayerNorm(embedd_dim)
        self.norm2 = nn.LayerNorm(embedd_dim)
        self.norm3 = nn.LayerNorm(embedd_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the layer.

        Args:
            x: Decoder input.
            enc_output: Encoder output used as keys and values in cross-attention.
            src_mask: Mask passed to the cross-attention module.
            tgt_mask: Mask passed to the self-attention module.
        """
        # Sub-layer 1: masked self-attention over the decoder input.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: decoder states query the encoder output.
        cross_attended = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward network.
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

## 4. Combine Encoder and Decoder layers to create the complete Transformer model
1. Generate source and target masks using the generate_mask method.
2. Compute source and target embeddings, and apply positional encoding and dropout.
3. Process the source sequence through encoder layers, updating the enc_output tensor.
4. Process the target sequence through decoder layers, using enc_output and masks, and updating the dec_output tensor.

5. Apply the linear projection layer to the decoder output, obtaining output logits.               
    
![Attention Is All You Need](resources/trasformer_architecture.png)

In [31]:
class Transformer(nn.Module):
    """
    A Transformer model for sequence-to-sequence tasks, based on the architecture
    introduced by Vaswani et al. in the paper "Attention Is All You Need".

    This implementation includes the encoder and decoder parts of the Transformer,
    utilizing multi-head self-attention and position-wise feed-forward networks.
    Token id 0 is treated as padding when the attention masks are built.

    Args:
        src_vocab_size (int): Size of the source vocabulary.
        tgt_vocab_size (int): Size of the target vocabulary.
        embedd_dim (int): Dimensionality of the embedding space.
        num_heads (int): Number of attention heads in the multi-head attention layers.
        num_layers (int): Number of encoder layers and of decoder layers.
        feed_forward_dim (int): Hidden dimensionality of the feed-forward networks.
        max_seq_length (int): Number of positions covered by the positional encoding.
        dropout (float): Dropout rate used on the embeddings and inside every layer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, embedd_dim, num_heads, num_layers, feed_forward_dim, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, embedd_dim)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, embedd_dim)

        # One positional-encoding table shared by the source and target sides.
        self.positional_encoding = PositionalEncoding(
            embedd_dim, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(
            embedd_dim, num_heads, feed_forward_dim, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(
            embedd_dim, num_heads, feed_forward_dim, dropout) for _ in range(num_layers)])

        # Projects decoder states to one logit per target-vocabulary entry.
        self.fc = nn.Linear(embedd_dim, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Create boolean attention masks for a batch of token ids.

        Args:
            src: Source token ids, shape (batch_size, src_seq_length).
            tgt: Target token ids, shape (batch_size, tgt_seq_length).

        Returns:
            tuple: ``(src_mask, tgt_mask)``.
                ``src_mask`` has shape (batch_size, 1, 1, src_seq_length) and is
                True where the source token is not padding (id 0).
                ``tgt_mask`` has shape (batch_size, 1, tgt_seq_length,
                tgt_seq_length); entry ``[b, 0, i, j]`` is True when target
                token ``i`` is not padding and ``j <= i``, so a position never
                attends to later positions.
        """
        # Two extra dimensions so the mask broadcasts over heads and query positions.
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_padding_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)

        seq_length = tgt.size(1)
        # Build the causal mask on the same device as the inputs; creating it
        # on the default device fails with a device mismatch for CUDA inputs.
        positions = torch.arange(seq_length, device=tgt.device)
        # nopeak_mask[0, i, j] is True when key position j is not after query i.
        nopeak_mask = (positions.unsqueeze(0) <= positions.unsqueeze(1)).unsqueeze(0)

        tgt_mask = tgt_padding_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Source token ids, shape (batch_size, src_seq_length).
            tgt: Target token ids fed to the decoder, shape
                (batch_size, tgt_seq_length).

        Returns:
            Tensor of shape (batch_size, tgt_seq_length, tgt_vocab_size).
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(
            self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(
            self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc(dec_output)

## 5. Prepare sample data

In [32]:
# Model hyperparameters.
src_vocab_size = 5000   # number of distinct source tokens
tgt_vocab_size = 5000   # number of distinct target tokens
d_model = 512           # embedding size (embedd_dim)
num_heads = 8           # attention heads; must divide d_model
num_layers = 6          # encoder layers and decoder layers
d_ff = 2048             # hidden width of the feed-forward networks
max_seq_length = 100    # positions covered by the positional encoding
dropout = 0.1           # dropout probability

transformer = Transformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    embedd_dim=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    feed_forward_dim=d_ff,
    max_seq_length=max_seq_length,
    dropout=dropout,
)

# Random sample batch of shape (batch_size, seq_length). Ids start at 1
# because id 0 is treated as padding by the model's masks.
src_data = torch.randint(low=1, high=src_vocab_size, size=(64, max_seq_length))
tgt_data = torch.randint(low=1, high=tgt_vocab_size, size=(64, max_seq_length))

## 6. Example of training the model


In [37]:
# Move the model and the data to the target device first, so the optimizer
# is constructed over parameters that already live where they will be used.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transformer.to(device)
src_data = src_data.to(device)
tgt_data = tgt_data.to(device)

# Targets equal to 0 (the padding id) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(),
                       lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
transformer.train()

# Each "epoch" here is a single optimisation step on the same sample batch.
for epoch in range(10):
    optimizer.zero_grad()
    # The decoder receives every target token except the last one and is
    # trained to predict the same sequence shifted left by one position.
    output = transformer(src_data, tgt_data[:, :-1])
    # Flatten to (batch * seq, vocab) logits and (batch * seq,) class ids.
    loss = criterion(output.reshape(-1, tgt_vocab_size),
                     tgt_data[:, 1:].reshape(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 8.482845306396484
Epoch: 2, Loss: 8.427691459655762
Epoch: 3, Loss: 8.35286808013916
Epoch: 4, Loss: 8.276569366455078
Epoch: 5, Loss: 8.194737434387207
Epoch: 6, Loss: 8.115641593933105
Epoch: 7, Loss: 8.036117553710938
Epoch: 8, Loss: 7.949494361877441
Epoch: 9, Loss: 7.86048698425293
Epoch: 10, Loss: 7.777308464050293
