In [1]:
import torch
import torch.nn as nn

In [2]:
from multi_head_attention import MultiHeadAttention
from feed_forward import FeedForward
from sinusoidal_positional_encoding import PositionalEncoding
from embedding import EmbeddingModel

In [3]:
class EncoderBlock(nn.Module):
    """One encoder layer: an attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer output is added back to its input (residual connection) and
    the sum is passed through its own ``nn.LayerNorm`` (normalisation happens
    *after* the addition).

    Args:
        d_model: Feature size; forwarded to ``MultiHeadAttention``,
            ``FeedForward`` and used as ``normalized_shape`` of both LayerNorms.
        n_heads: Forwarded to ``MultiHeadAttention``.
        hidden_size: Forwarded to ``FeedForward``.
        dropout: Forwarded to ``FeedForward`` only; this block applies no
            dropout of its own.
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        hidden_size: int,
        dropout: float = 0.1,
    ):
        super().__init__()

        # The two sub-layers.
        self.mha = MultiHeadAttention(d_model, n_heads)
        self.feed_forward = FeedForward(d_model, hidden_size, dropout)

        # One LayerNorm per sub-layer, applied to the residual sum.
        self.norm_mha = nn.LayerNorm(d_model)
        self.norm_feed_forward = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply attention + add & norm, then feed-forward + add & norm.

        The same tensor is passed as all three arguments of the attention
        module (presumably query, key and value, i.e. self-attention).

        Returns:
            The output of the second LayerNorm.
        """
        attended = self.norm_mha(x + self.mha(x, x, x))
        transformed = self.feed_forward(attended)
        return self.norm_feed_forward(attended + transformed)

In [4]:
class TransformerEncoder(nn.Module):
    """Embedding stage followed by a stack of ``nx`` ``EncoderBlock`` layers.

    Args:
        vocab_size: Forwarded to ``EmbeddingModel``.
        seq_len: Forwarded to ``PositionalEncoding``.
        d_model: Feature size shared by the embedding stage and every block.
        nx: Number of ``EncoderBlock`` layers to stack.
        n_heads: Forwarded to every ``EncoderBlock``.
        hidden_size: Forwarded to every ``EncoderBlock``.
        dropout: Forwarded to every ``EncoderBlock``.
    """

    def __init__(
        self,
        vocab_size: int,
        seq_len: int,
        d_model: int,
        nx: int,
        n_heads: int,
        hidden_size: int,
        dropout: float = 0.1,
    ):
        super().__init__()

        # Embedding stage: token embedding, then positional encoding.
        token_embedding = EmbeddingModel(d_model, vocab_size)
        positional_encoding = PositionalEncoding(d_model, seq_len)
        self.embedding = nn.Sequential(token_embedding, positional_encoding)

        # Stack of identically configured encoder layers.
        self.encoder_blocks = nn.ModuleList()
        for _ in range(nx):
            self.encoder_blocks.append(
                EncoderBlock(d_model, n_heads, hidden_size, dropout=dropout)
            )

    def forward(self, x):
        """Run ``x`` through the embedding stage and then each block in order.

        Returns:
            The output of the last encoder block (or of the embedding stage
            when the stack is empty).
        """
        hidden = self.embedding(x)
        for layer in self.encoder_blocks:
            hidden = layer(hidden)
        return hidden

In [19]:
# Toy input: a 2x3 tensor of random int32 values in [0, 100), built by scaling
# uniform samples from [0, 1) and truncating them to integers.
x = (torch.rand(2, 3) * 100).to(torch.int32)

In [20]:
# Show the sample input tensor and its shape before it is fed to the encoder.
print("entrada teste")
print(x)
# Report the shape of `x`: the name `teste` used here previously is never
# defined in this notebook, so the cell raised NameError on a fresh kernel.
print(f"Shape entrada: {x.shape}")
print("-" * 30)

entrada teste
tensor([[77,  8, 55],
        [ 4, 44, 61]], dtype=torch.int32)
Shape entrada: torch.Size([2, 3])
------------------------------


In [21]:
# Deliberately tiny hyper-parameters for a quick shape check of the encoder.
encoder_kwargs = dict(
    vocab_size=1000,
    seq_len=10,
    d_model=6,
    nx=2,
    n_heads=2,
    hidden_size=2,
)
encoder = TransformerEncoder(**encoder_kwargs)

In [23]:
# Forward pass: `x` goes through the embedding stage and then every encoder block.
x_encoder = encoder(x)

In [24]:
# Print the output shape; the recorded run shows torch.Size([2, 3, 6]),
# i.e. the input's 2x3 layout with a trailing dimension equal to d_model=6.
print(f"Shape saida {x_encoder.shape}")

Shape saida torch.Size([2, 3, 6])


In [25]:
# Display the encoder output tensor itself (values vary per run: no seed is set).
x_encoder

tensor([[[-1.6020,  0.8115,  0.3214, -0.9102,  0.0219,  1.3575],
         [-0.6117, -1.8504,  0.3489,  0.9417,  0.0824,  1.0892],
         [-0.2988, -1.0673, -0.2036,  0.1194, -0.6284,  2.0787]],

        [[-0.6783, -1.8136,  0.7091, -0.0582,  0.7620,  1.0789],
         [ 0.0130, -0.7409, -0.2270, -0.8691, -0.3088,  2.1328],
         [-0.6622, -1.6079, -0.1048,  0.4377,  0.2988,  1.6383]]],
       grad_fn=<NativeLayerNormBackward0>)