<a href="https://colab.research.google.com/github/LeninGF/CoursesNotes/blob/main/InteligenciaArtificalGenerativa/Problems/transformers/EjercicioTransformersEncoder-IAG-2024B_LeninFalconi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformers Encoder



Coder: Lenin G. Falconí



Asignatura: Tópicos Especiales (Inteligencia Artificial)



Fecha: 2024-12-02

# Transformer Encoder

Para realizar un transformer Encoder se requiere de:

1. Embedding Layer
2. Positional Encoding
3. Pila de capas de Encoder
4. La salida que sería un classification head

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

### MultiHead attention
The `MultiHeadAttention` class encapsulates the multi-head attention mechanism commonly used in transformer models. It takes care of splitting the input into multiple attention heads, applying attention to each head, and then combining the results. By doing so, the model can capture various relationships in the input data at different scales, improving the expressive ability of the model.

`scaled_dot_product_attention`: The attention scores are calculated by taking the dot product of the queries (Q) and keys (K), and then scaling by the square root of the key dimension (d_k).

`attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)`

`split_heads`: This method reshapes the input x into the shape (batch_size, num_heads, seq_length, d_k). It enables the model to process multiple attention heads concurrently, allowing for parallel computation.

`combine_heads`: Combines the per-head results back into a single tensor of shape (batch_size, seq_length, d_model).

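`forward`: The forward method is where the actual computation happens: the inputs are projected by the query, key and value linear layers and split into heads, scaled dot-product attention is applied to each head, and the heads are recombined and passed through the output linear layer.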

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each passed through a linear layer,
    split into ``num_heads`` heads of size ``d_k = d_model // num_heads``,
    attended independently, recombined, and passed through an output linear
    layer.

    Args:
        d_model: Dimensionality of the input. Must be divisible by
            ``num_heads``.
        num_heads: The number of attention heads to split the input into.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Ensure that the model dimension (d_model) is divisible by the number of heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Initialize dimensions
        self.d_model = d_model  # Model's dimension
        self.num_heads = num_heads  # Number of attention heads
        self.d_k = d_model // num_heads  # Dimension of each head's key, query, and value

        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model)  # Query transformation
        self.W_k = nn.Linear(d_model, d_model)  # Key transformation
        self.W_v = nn.Linear(d_model, d_model)  # Value transformation
        self.W_o = nn.Linear(d_model, d_model)  # Output transformation

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q K^T / sqrt(d_k)) V`` for already-split heads.

        Args:
            Q, K, V: Tensors as produced by ``split_heads``.
            mask: Optional tensor broadcastable to the attention scores.
                Positions where ``mask == 0`` are filled with ``-1e9`` before
                the softmax, so they receive (near) zero attention weight.
        """
        # Calculate attention scores
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Apply mask if provided (useful for preventing attention to certain parts like padding)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        # Softmax over the key axis turns scores into attention probabilities
        attn_probs = torch.softmax(attn_scores, dim=-1)

        # Weighted sum of the values
        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape (batch, num_heads, seq, d_k) back into (batch, seq, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() yields a non-contiguous tensor; view() needs contiguous memory
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Apply multi-head attention.

        Args:
            Q, K, V: Query, key and value tensors of shape
                (batch, seq, d_model).
            mask: Optional mask forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor with the same leading dimensions as ``Q`` and a last
            dimension of ``d_model``.
        """
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads and apply output transformation
        return self.W_o(self.combine_heads(attn_output))

### Position Wise Feed Forward
The `PositionWiseFeedForward` class defines a position-wise feed-forward neural network that consists of two linear layers with a ReLU activation function in between. In the context of transformer models, this feed-forward network is applied to each position separately and identically. It helps in transforming the features learned by the attention mechanisms within the transformer, acting as an additional processing step for the attention outputs.

In [None]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network with a ReLU in between.

    Args:
        d_model: Dimensionality of the input (and of the output).
        d_ff: Dimensionality of the inner layer in the feed-forward network.
    """

    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expand d_model -> d_ff
        self.fc2 = nn.Linear(d_ff, d_model)  # project d_ff -> d_model
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; the linear layers act on the last axis of ``x``."""
        return self.fc2(self.relu(self.fc1(x)))

### Positional Encoding
The PositionalEncoding class adds information about the position of tokens within the sequence. Since the transformer model lacks inherent knowledge of the order of tokens (due to its self-attention mechanism), this class helps the model to consider the position of tokens in the sequence. The sinusoidal functions used are chosen to allow the model to easily learn to attend to relative positions, as they produce a unique and smooth encoding for each position in the sequence.

- `max_seq_length`: The maximum length of the sequence for which positional encodings are pre-computed.
- `pe`: A tensor filled with zeros, which will be populated with positional encodings.
- `position`: A tensor containing the position indices for each position in the sequence.
- `div_term`: A term used to scale the position indices in a specific way.

The sine function is applied to the even indices and the cosine function to the odd indices of pe.

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch-first input.

    Args:
        d_model: Size of the last (feature) axis of the input. Both even and
            odd values are supported.
        max_seq_length: Number of positions for which encodings are
            pre-computed; inputs longer than this cannot be encoded.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term

        # Sine on even feature indices, cosine on odd ones.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd index than even index,
        # so drop the surplus column; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer: stored with the module but not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``pe`` has shape (1, max_seq_length, d_model), so axis 1 of ``x`` is
        treated as the sequence axis and the encoding broadcasts over axis 0.
        """
        return x + self.pe[:, :x.size(1)]

###  Encoder Layer

The EncoderLayer class defines a single layer of the transformer's encoder. It encapsulates a multi-head self-attention mechanism followed by position-wise feed-forward neural network, with residual connections, layer normalization, and dropout applied as appropriate. These components together allow the encoder to capture complex relationships in the input data and transform them into a useful representation for downstream tasks. Typically, multiple such encoder layers are stacked to form the complete encoder part of a transformer model.

In [None]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        d_model: Dimensionality of the input.
        num_heads: Number of attention heads.
        d_ff: Inner dimensionality of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the layer on ``x``.

        Args:
            x: Input tensor; it is used as query, key and value of the
                self-attention.
            mask: Optional attention mask passed through to the
                self-attention. Defaults to ``None`` (no masking), matching
                the default of ``MultiHeadAttention.forward``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Self-attention sub-layer: residual add, then normalise.
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))

        # Feed-forward sub-layer: residual add, then normalise.
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

### Encoder Transformer

## Transformer con Pytorch desde torch.nn

In [None]:
import torch
import torch.nn as nn

# Hyperparameters for the reference nn.Transformer: embedding width, number of
# attention heads, and the depth of the encoder and decoder stacks.
d_model, nhead = 512, 8
num_encoder_layers, num_decoder_layers = 1, 6

print(f"torch version: {torch.__version__}")

# Every other nn.Transformer argument is left at its library default.
model = nn.Transformer(
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
)


torch version: 2.5.1+cu121




In [None]:
# Print the module tree of the nn.Transformer (its sub-modules and their settings).
print(model)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_fea

In [None]:
!pip install torchinfo



In [None]:
# torchinfo's summary prints a per-layer table with parameter counts.
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                            Param #
Transformer                                                       --
├─TransformerEncoder: 1-1                                         --
│    └─ModuleList: 2-1                                            --
│    │    └─TransformerEncoderLayer: 3-1                          3,152,384
│    └─LayerNorm: 2-2                                             1,024
├─TransformerDecoder: 1-2                                         --
│    └─ModuleList: 2-3                                            --
│    │    └─TransformerDecoderLayer: 3-2                          4,204,032
│    │    └─TransformerDecoderLayer: 3-3                          4,204,032
│    │    └─TransformerDecoderLayer: 3-4                          4,204,032
│    │    └─TransformerDecoderLayer: 3-5                          4,204,032
│    │    └─TransformerDecoderLayer: 3-6                          4,204,032
│    │    └─TransformerDecoderLayer: 3-7             

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)  # Move model to the device

In [None]:
# Toy source sequence: five token ids taken from a vocabulary of size 10.
input_sequence = torch.tensor([1, 5, 2, 8, 3])

# Randomly initialised lookup table mapping each token id to a d_model-sized vector.
embedding_layer = nn.Embedding(num_embeddings=10, embedding_dim=d_model)

# Embed the ids and insert a singleton batch axis at position 1, giving a
# (seq_len, 1, d_model) tensor.
embedded_input = embedding_layer(input_sequence).unsqueeze(1)

# The target reuses the source ids, so it goes through the same embedding.
target_sequence = input_sequence
embedded_target = embedding_layer(target_sequence).unsqueeze(1)

# nn.Transformer needs both a source (src) and a target (tgt) sequence.
output = model(embedded_input, embedded_target)
output

tensor([[[ 0.2294, -0.6206, -0.6648,  ...,  0.6508,  0.2325, -0.6288]],

        [[-0.6362, -0.9210,  0.2839,  ..., -0.0827,  0.6265,  0.8557]],

        [[-0.0152,  0.1492,  0.4017,  ...,  0.6161,  1.6122,  0.1904]],

        [[ 0.4158,  0.0035,  0.1774,  ...,  0.0126, -0.2418, -0.1176]],

        [[ 0.3905, -1.0222, -0.5462,  ...,  0.6306,  0.3137,  0.2474]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
# Passing example inputs makes the summary also report each layer's output shape.
summary(model, input_data=(embedded_input, embedded_target))

Layer (type:depth-idx)                        Output Shape              Param #
Transformer                                   [5, 1, 512]               --
├─TransformerEncoder: 1-1                     [5, 1, 512]               --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [5, 1, 512]               3,152,384
│    └─LayerNorm: 2-2                         [5, 1, 512]               1,024
├─TransformerDecoder: 1-2                     [5, 1, 512]               --
│    └─ModuleList: 2-3                        --                        --
│    │    └─TransformerDecoderLayer: 3-2      [5, 1, 512]               4,204,032
│    │    └─TransformerDecoderLayer: 3-3      [5, 1, 512]               4,204,032
│    │    └─TransformerDecoderLayer: 3-4      [5, 1, 512]               4,204,032
│    │    └─TransformerDecoderLayer: 3-5      [5, 1, 512]               4,204,032
│    │    └─TransformerDecoderLayer: 3-6      [5, 1, 512]

In [None]:
nn.Transformer?

## Transformer Encoder
- https://www.datacamp.com/tutorial/building-a-transformer-with-py-torch
- https://campus.datacamp.com/es/courses/introduction-to-llms-in-python/building-a-transformer-architecture?ex=15


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

## Transformer Decoder

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence-first input.

    Args:
        d_model: Size of the last (feature) axis of the input. Both even and
            odd values are supported.
        max_len: Number of positions for which encodings are pre-computed;
            inputs longer than this cannot be encoded.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term

        # Sine on even feature indices, cosine on odd ones.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd index than even index,
        # so drop the surplus column; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over axis 1 of the input.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # Registered as a buffer: stored with the module but not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Axis 0 of ``x`` is treated as the sequence axis.
        """
        x = x + self.pe[:x.size(0), :]
        return x



In [None]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, cross-attention over ``memory``, and
    a feed-forward network, each followed by dropout, a residual connection
    and layer normalisation.

    The attention modules are built with ``nn.MultiheadAttention``'s default
    layout, so inputs are expected sequence-first: (seq_len, batch, d_model).

    Args:
        d_model: Dimensionality of the input.
        nhead: Number of attention heads.
        dim_feedforward: Inner dimensionality of the feed-forward network.
        dropout: Dropout probability used inside the attention modules and on
            every sub-layer output.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Run the layer.

        Args:
            tgt: Target sequence fed to the decoder.
            memory: Encoder output attended to by the cross-attention.
            tgt_mask: Optional attention mask for the self-attention (for
                example a causal mask that hides future positions).
            memory_mask: Optional attention mask for the cross-attention.
            tgt_key_padding_mask: Optional padding mask for ``tgt`` keys.
            memory_key_padding_mask: Optional padding mask for ``memory`` keys.

        All masks default to ``None``, which reproduces unmasked attention.

        Returns:
            Tensor of the same shape as ``tgt``.
        """
        # Self-attention over the target; [0] keeps the output and drops the weights.
        tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        # Cross-attention: queries come from the target, keys/values from memory.
        tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        # Feed-forward network.
        tgt2 = self.linear2(self.dropout(F.relu(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt



In [None]:
class Decoder(nn.Module):
    """Stack of ``num_layers`` DecoderLayer modules.

    Positional encodings are added to the target first, then the target is
    passed through each layer in turn, and the result is layer-normalised.
    """

    def __init__(self, d_model, nhead, num_layers, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            stack.append(DecoderLayer(d_model, nhead, dim_feedforward, dropout))
        self.layers = nn.ModuleList(stack)
        self.norm = nn.LayerNorm(d_model)
        self.pe = PositionalEncoding(d_model)

    def forward(self, tgt, memory):
        """Decode ``tgt`` while attending to ``memory``; the output has the shape of ``tgt``."""
        hidden = self.pe(tgt)
        for block in self.layers:
            hidden = block(hidden, memory)
        return self.norm(hidden)



In [None]:
# Example usage: build a six-layer decoder and run it on random data.
d_model, nhead, num_layers = 512, 8, 6
decoder = Decoder(d_model, nhead, num_layers)

# Random stand-ins for the encoder output (length 10) and the target
# sequence (length 20), both with 32 as the second (batch) axis.
memory = torch.rand(10, 32, d_model)
tgt = torch.rand(20, 32, d_model)

# The decoder output keeps the target's shape.
output = decoder(tgt, memory)
print(output.shape)


torch.Size([20, 32, 512])
