In [2]:
import torch.nn as nn
import torch

class EncoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_size, dropout=0.1):
        super().__init__()

        # Configuração da camada de atenção
        self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)

        # Camada de avanço
        self.feedforward = nn.Sequential(
            nn.Linear(embed_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, embed_dim),
        )

        # Normalização
        self.norm_1 = nn.LayerNorm(embed_dim)
        self.norm_2 = nn.LayerNorm(embed_dim)

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Ativação de auto-atendimento (self-attention)
        attn_output, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_output)
        x = self.norm_1(x)

        # Feedforward
        ff_output = self.feedforward(x)
        x = x + self.dropout(ff_output)
        x = self.norm_2(x)

        return x

# Exemplo de uso:
embed_dim = 512
num_heads = 8
hidden_size = 2048
dropout = 0.1

encoder_block = EncoderBlock(embed_dim, num_heads, hidden_size, dropout)

# Testando com um tensor de entrada
input_tensor = torch.randn((10, 32, embed_dim))  # (sequence_length, batch_size, embed_dim)
output_tensor = encoder_block(input_tensor)
print(output_tensor.shape)


torch.Size([10, 32, 512])


In [20]:
import torch
import torch.nn as nn
from torch import Tensor
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.0, max_len: int = 1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
class FeedFowardBlock(nn.Module):
    def __init__(self, embed_dim, hidden_size, dropout: int = 0.1):
        super().__init__()

        self.ff_1 = nn.Linear(embed_dim, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.ff_2 = nn.Linear(hidden_size, embed_dim)

    def forward(self, x):
        x = self.ff_1(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.ff_2(x)
        return x

class EncoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_size, dropout: int = 0.1):
        super().__init__()

        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout
            )

        self.feedforward = FeedFowardBlock(
                embed_dim=embed_dim, hidden_size=hidden_size, dropout=dropout
            )

        self.norm_1 = nn.LayerNorm(normalized_shape=embed_dim)
        self.norm_2 = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, x):
        x = self.norm_1(x + self.attention(x))
        x = self.norm_2(x + self.feedforward(x))
        return x

    
class TransformerEncoder(nn.Module):
    def __init__(
        self,
        vocab_size,
        max_len,
        embed_dim,
        num_layers,
        num_heads,
        hidden_size,
        dropout: int = 0.1,
    ):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim=embed_dim)
        self.pos_encoding = PositionalEncoding(
            d_model=embed_dim,
            dropout=dropout,
            max_len=max_len,
        )

        self.encoder_blocks = nn.ModuleList()
        for _ in range(num_layers):
            self.encoder_blocks.append(
                EncoderBlock(
                    embed_dim=embed_dim,
                    num_heads=num_heads,
                    hidden_size=hidden_size,
                    dropout=dropout,
                )
            )

    def forward(self, x):
        x = self.embedding(x)
        x = self.pos_encoding(x)

        for block in self.encoder_blocks:
            x = block(x)

        return x

In [21]:
# TODO: treinamos um encoder para aprender a linguagem
import torch
import torch.nn as nn

VOCAB_SIZE = 30_000
MAX_LEN = 16
D_MODEL = 32
N_LAYERS = 8
N_HEADS = 8
HIDDEN_SIZE = 4 


class PreTrainedDairai(nn.Module):
    def __init__(
            self,
            vocab_size: int = VOCAB_SIZE,
            max_len: int = MAX_LEN,
            embed_dim: int = D_MODEL,
            num_layers: int = N_LAYERS,
            num_heads: int = N_HEADS,
            hidden_size: int = HIDDEN_SIZE,
            dropout: float = 0.05
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size

        self.encoder = TransformerEncoder(
            vocab_size=vocab_size,
            max_len=max_len,
            embed_dim=embed_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            hidden_size=hidden_size,
            dropout=dropout,
        )

        self.mlm_head = nn.Linear(self.embed_dim, self.vocab_size)
    
    def forward(self, x, mask):
        masked_ids = torch.flatten(mask.reshape((-1,)).nonzero())
        last_hidden_states = self.encoder(x)
        all_hidden_states = last_hidden_states.reshape(-1, self.embed_dim)
        masked_hidden_states = all_hidden_states[masked_ids, :]
        logits = self.mlm_head(masked_hidden_states)
        return logits

In [22]:
model = PreTrainedDairai(
        vocab_size=VOCAB_SIZE,
        max_len=MAX_LEN,
        embed_dim=D_MODEL,
        num_layers=N_LAYERS,
        num_heads=N_HEADS,
        hidden_size=HIDDEN_SIZE,
        dropout=0.1,
)

In [23]:
model

PreTrainedDairai(
  (encoder): TransformerEncoder(
    (embedding): Embedding(30000, 32)
    (pos_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder_blocks): ModuleList(
      (0-7): 8 x EncoderBlock(
        (attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (feedforward): FeedFowardBlock(
          (ff_1): Linear(in_features=32, out_features=4, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (ff_2): Linear(in_features=4, out_features=32, bias=True)
        )
        (norm_1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm_2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (mlm_head): Linear(in_features=32, out_features=30000, bias=True)
)

In [12]:
tensor = torch.randn((MAX_LEN, 4, D_MODEL))  

In [16]:
mask = torch.tensor([False, False,  True, False, False,  True, False, False,  True, False,
        False, False,  True, False,  True, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False,  True])

In [18]:
len(mask)

32

In [None]:
nn.MultiheadAttention()

In [13]:
tensor

tensor([[[-1.7828, -0.9739,  1.2806,  ..., -0.3408, -0.3639,  0.5790],
         [ 0.7138, -0.7522, -1.2334,  ..., -0.7771,  1.6783,  0.6866],
         [-1.1697, -0.6700, -0.9719,  ...,  1.2864, -0.2909,  2.4255],
         [ 0.2447, -0.7539,  2.0695,  ...,  1.6358, -0.5475,  2.3103]],

        [[-1.0699,  0.2901,  0.0422,  ...,  0.6257,  0.7616,  1.2917],
         [-0.4748, -0.6093,  1.7276,  ..., -0.6071,  1.2183, -1.6340],
         [ 1.4023, -0.3572,  0.4537,  ..., -0.3655, -1.3445, -1.0162],
         [ 0.7138, -0.8601,  1.9202,  ...,  0.1581, -0.4403, -0.7742]],

        [[-0.3864,  0.1110,  0.1057,  ...,  0.2295, -0.4114, -0.0696],
         [ 1.5512,  1.6452, -0.9075,  ...,  1.2767, -0.6229, -0.3755],
         [-0.0273,  0.7276,  0.4164,  ...,  0.2826, -0.1423,  1.1013],
         [-1.7906, -2.4617, -0.1253,  ...,  0.1300,  0.0429, -0.3763]],

        ...,

        [[ 0.7467,  1.5654, -0.0824,  ...,  0.0484,  1.3317, -0.3217],
         [-0.3378, -0.2106, -0.9854,  ...,  0.7427,  1.09

In [19]:
model(tensor, mask)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [24]:
import pandas as pd

In [25]:
pd.read_html('https://www.fundamentus.com.br/resultado.php')

ImportError: lxml not found, please install it