<a href="https://colab.research.google.com/github/LeninGF/CoursesNotes/blob/main/InteligenciaArtificalGenerativa/Problems/transformers/EjercicioTransformersEncoder-IAG-2024B_LeninFalconi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformers Encoder



Coder: Lenin G. Falconí



Asignatura: Tópicos Especiales (Inteligencia Artificial)



Fecha: 2024-12-02

## Transformer con PyTorch desde torch.nn

In [1]:
import torch
import torch.nn as nn

# Report the installed PyTorch release alongside the outputs of this notebook.
print(f"torch version: {torch.__version__}")

# Hyperparameters of the reference encoder-decoder model.
d_model, nhead = 512, 8                        # feature width, attention heads
num_encoder_layers, num_decoder_layers = 1, 6  # depth of each stack

# All remaining constructor arguments keep their library defaults.
model = nn.Transformer(
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
)


torch version: 2.5.1+cu121




In [2]:
# Show the nested module tree (encoder/decoder stacks and their sub-layers).
print(model)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_fea

In [3]:
!pip install torchinfo



In [3]:
from torchinfo import summary
# Called without example inputs, so the table lists parameter counts only
# (no output shapes).
summary(model)

Layer (type:depth-idx)                                            Param #
Transformer                                                       --
├─TransformerEncoder: 1-1                                         --
│    └─ModuleList: 2-1                                            --
│    │    └─TransformerEncoderLayer: 3-1                          3,152,384
│    └─LayerNorm: 2-2                                             1,024
├─TransformerDecoder: 1-2                                         --
│    └─ModuleList: 2-3                                            --
│    │    └─TransformerDecoderLayer: 3-2                          4,204,032
│    │    └─TransformerDecoderLayer: 3-3                          4,204,032
│    │    └─TransformerDecoderLayer: 3-4                          4,204,032
│    │    └─TransformerDecoderLayer: 3-5                          4,204,032
│    │    └─TransformerDecoderLayer: 3-6                          4,204,032
│    │    └─TransformerDecoderLayer: 3-7             

In [6]:
# Optional GPU placement, left disabled.
# NOTE(review): the tensors and the embedding layer created in the next cell are
# built without a device argument, so enabling these lines would also require
# moving them to `device` — confirm before uncommenting.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)  # Move model to the device

In [4]:
# Toy forward pass through the full encoder-decoder model.
# `model` was built without batch_first, so it expects (seq_len, batch, d_model).

# Assume you have a sequence of numerical IDs:
input_sequence = torch.tensor([1, 5, 2, 8, 3])

# Create a simple (random) embedding layer:
embedding_layer = nn.Embedding(num_embeddings=10, embedding_dim=d_model) # 10 is the vocab size
embedded_input = embedding_layer(input_sequence)

# Reshape for Transformer input: (seq_len, d_model) -> (seq_len, 1, d_model)
embedded_input = embedded_input.unsqueeze(1) # Add batch dimension (axis 1)

#  Create a target sequence (can be the same as input for autoregressive tasks)
target_sequence = input_sequence
embedded_target = embedding_layer(target_sequence).unsqueeze(1) # Embed and add batch dimension

# Autoregressive decoding requires a causal mask: without it every target
# position can attend to the tokens that come after it.
tgt_mask = model.generate_square_subsequent_mask(target_sequence.size(0))

# Pass embedded input and target to the model
output = model(embedded_input, embedded_target, tgt_mask=tgt_mask) # Provide src, tgt and the causal mask
output

tensor([[[ 0.2294, -0.6206, -0.6648,  ...,  0.6508,  0.2325, -0.6288]],

        [[-0.6362, -0.9210,  0.2839,  ..., -0.0827,  0.6265,  0.8557]],

        [[-0.0152,  0.1492,  0.4017,  ...,  0.6161,  1.6122,  0.1904]],

        [[ 0.4158,  0.0035,  0.1774,  ...,  0.0126, -0.2418, -0.1176]],

        [[ 0.3905, -1.0222, -0.5462,  ...,  0.6306,  0.3137,  0.2474]]],
       grad_fn=<NativeLayerNormBackward0>)

In [5]:
# Supplying example src/tgt tensors lets torchinfo report the output shape of
# each layer in addition to its parameter count.
summary(model, input_data=(embedded_input, embedded_target))

Layer (type:depth-idx)                        Output Shape              Param #
Transformer                                   [5, 1, 512]               --
├─TransformerEncoder: 1-1                     [5, 1, 512]               --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [5, 1, 512]               3,152,384
│    └─LayerNorm: 2-2                         [5, 1, 512]               1,024
├─TransformerDecoder: 1-2                     [5, 1, 512]               --
│    └─ModuleList: 2-3                        --                        --
│    │    └─TransformerDecoderLayer: 3-2      [5, 1, 512]               4,204,032
│    │    └─TransformerDecoderLayer: 3-3      [5, 1, 512]               4,204,032
│    │    └─TransformerDecoderLayer: 3-4      [5, 1, 512]               4,204,032
│    │    └─TransformerDecoderLayer: 3-5      [5, 1, 512]               4,204,032
│    │    └─TransformerDecoderLayer: 3-6      [5, 1, 512]

In [18]:
# Interactive IPython help lookup for nn.Transformer (development aid only).
nn.Transformer?

## Transformer Encoder
- https://www.datacamp.com/tutorial/building-a-transformer-with-py-torch
- https://campus.datacamp.com/es/courses/introduction-to-llms-in-python/building-a-transformer-architecture?ex=15


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

## Transformer Decoder

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    The table is computed once in ``__init__`` and stored as the buffer ``pe``
    with shape ``(max_len, 1, d_model)``. Even feature columns hold
    ``sin(pos * w_k)`` and odd feature columns hold ``cos(pos * w_k)``, where
    ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        d_model: Size of the feature (last) dimension. Odd values are supported.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * -(torch.log(torch.tensor(10000.0)) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the frequency vector is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over batch.
        pe = pe.unsqueeze(1)
        # Registered as a buffer: follows .to()/.state_dict() but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        Args:
            x: Tensor indexed by position along dimension 0; the remaining
                dimensions must broadcast against ``(x.size(0), 1, d_model)``.

        Returns:
            A new tensor holding ``x + pe[:x.size(0)]``.
        """
        return x + self.pe[: x.size(0), :]



In [None]:
class DecoderLayer(nn.Module):
    """Single post-norm Transformer decoder layer.

    Applies, in order: self-attention over ``tgt``, cross-attention from
    ``tgt`` to ``memory``, and a two-layer ReLU feed-forward network. Each
    sub-layer is followed by dropout, a residual connection and LayerNorm.

    Args:
        d_model: Feature size of ``tgt`` and ``memory``.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout probability used in attention and in every sub-layer.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Run the layer on ``tgt`` while attending to ``memory``.

        Args:
            tgt: Decoder input passed as query, key and value to self-attention.
            memory: Encoder output passed as key and value to cross-attention.
            tgt_mask: Optional attention mask for self-attention, e.g. a causal
                mask that hides future positions. ``None`` means unmasked.
            memory_mask: Optional attention mask for cross-attention.
            tgt_key_padding_mask: Optional padding mask for the ``tgt`` keys.
            memory_key_padding_mask: Optional padding mask for the ``memory`` keys.

        Returns:
            Tensor with the same shape as ``tgt``.
        """
        # Self-attention sub-layer; [0] drops the attention weights.
        tgt2 = self.self_attn(
            tgt, tgt, tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
        )[0]
        tgt = self.norm1(tgt + self.dropout1(tgt2))

        # Cross-attention sub-layer: queries from tgt, keys/values from memory.
        tgt2 = self.multihead_attn(
            tgt, memory, memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        tgt = self.norm2(tgt + self.dropout2(tgt2))

        # Position-wise feed-forward sub-layer.
        tgt2 = self.linear2(self.dropout(F.relu(self.linear1(tgt))))
        tgt = self.norm3(tgt + self.dropout3(tgt2))
        return tgt



In [None]:
class Decoder(nn.Module):
    """Decoder stack: positional encoding, ``num_layers`` layers, final LayerNorm.

    Args:
        d_model: Feature size of the inputs.
        nhead: Number of attention heads in every layer.
        num_layers: How many DecoderLayer modules to stack.
        dim_feedforward: Hidden size of each layer's feed-forward network.
        dropout: Dropout probability forwarded to every layer.
    """

    def __init__(self, d_model, nhead, num_layers, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            stack.append(DecoderLayer(d_model, nhead, dim_feedforward, dropout))
        self.layers = nn.ModuleList(stack)
        self.norm = nn.LayerNorm(d_model)
        self.pe = PositionalEncoding(d_model)

    def forward(self, tgt, memory):
        """Encode positions of ``tgt``, run it through every layer, then normalize.

        Args:
            tgt: Decoder input.
            memory: Encoder output handed unchanged to every layer.

        Returns:
            The normalized output of the last layer.
        """
        hidden = self.pe(tgt)
        for block in self.layers:
            hidden = block(hidden, memory)
        return self.norm(hidden)



In [None]:
# Example usage: push random tensors through a 6-layer decoder as a shape check.
d_model, nhead, num_layers = 512, 8, 6
decoder = Decoder(d_model=d_model, nhead=nhead, num_layers=num_layers)

# Random stand-ins for the encoder output and the decoder input.
memory = torch.rand(10, 32, d_model)
tgt = torch.rand(20, 32, d_model)

output = decoder(tgt, memory)
print(output.shape)


torch.Size([20, 32, 512])
