In [81]:
import torch
import torch.nn as nn

In [82]:
from multi_head_attention import MultiHeadAttention
from feed_forward import FeedForward
from sinusoidal_positional_encoding import PositionalEncoding
from embedding import EmbeddingModel

In [83]:
class EncoderBlock(nn.Module):
    def __init__(
        self, 
        d_model: int, 
        n_heads: int,
        hidden_size: int,
        dropout: float = 0.1
    ):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, n_heads)
        self.feed_forward = FeedForward(d_model, hidden_size, dropout)

        self.norm_mha = nn.LayerNorm(normalized_shape=d_model)
        self.norm_feed_forward = nn.LayerNorm(normalized_shape=d_model)

    def forward(self, x):
        x = self.norm_mha(x + self.mha(x, x, x))  # add & norm
        x = self.norm_feed_forward(x + self.feed_forward(x))  # add & norm
        return x

In [90]:
class TransformerEncoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        seq_len: int,
        d_model: int,
        nx: int,
        n_heads: int,
        hidden_size: int,
        dropout: float = 0.1
    ):
        super().__init__()
        self.embedding = nn.Sequential(
            EmbeddingModel(d_model, vocab_size),
            PositionalEncoding(d_model, seq_len, dropout=0.0)
        )
        self.encoder_blocks = nn.ModuleList(
            [
                EncoderBlock(
                    d_model,
                    n_heads,
                    hidden_size,
                    dropout=dropout,
                )
                for _ in range(nx)
            ]
        )

    def forward(self, x):
        x = self.embedding(x)
        for block in self.encoder_blocks:
            x = block(x)
        return x

In [91]:
x = torch.rand((2, 3)) * 100
x = x.int()

In [92]:
print(f"entrada teste")
print(x)
print(f"Shape entrada: {x.shape}")
print("-" * 30)

entrada teste
tensor([[90, 75, 94],
        [88, 54, 68]], dtype=torch.int32)
Shape entrada: torch.Size([2, 3])
------------------------------


In [93]:
encoder = TransformerEncoder(
    vocab_size=1000,
    seq_len=10,
    d_model=6,
    nx=2,
    n_heads=2,
    hidden_size=2
)

In [94]:
x_encoder = encoder(x)

In [95]:
print(f"Shape saida {x_encoder.shape}")

Shape saida torch.Size([2, 3, 6])


In [96]:
x_encoder

tensor([[[ 0.8063,  0.0559, -1.1969,  1.6378, -1.0891, -0.2140],
         [ 0.7452, -1.7216,  0.1039, -0.4154, -0.2128,  1.5007],
         [ 1.1609, -1.5272,  0.4636,  0.6909, -1.2055,  0.4173]],

        [[-1.1207, -0.4452, -0.0639,  1.4266, -1.0133,  1.2165],
         [-0.7809, -1.0704,  1.7244, -0.2529, -0.5634,  0.9431],
         [ 0.3675, -1.5602,  1.1221,  0.6448, -1.1790,  0.6048]]],
       grad_fn=<NativeLayerNormBackward0>)

# Criando o EncoderDataset para o pré treinamento

In [1]:
from tokenizer.Tokenizer import TokenizerImDB
from tokenizer.Tokenizer import SpecialTokensInt

In [2]:
SpecialTokensInt.tolist()

['PAD', 'CLS', 'UNK', 'MASK', 'SOS', 'EOS']

In [80]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [6]:
from pathlib import Path

In [7]:
import pandas as pd

In [8]:
file = Path('../../data/imdb-reviews-pt-br.csv')
pd.read_csv(file)

Unnamed: 0,id,text_en,text_pt,sentiment
0,1,Once again Mr. Costner has dragged out a movie...,"Mais uma vez, o Sr. Costner arrumou um filme p...",neg
1,2,This is an example of why the majority of acti...,Este é um exemplo do motivo pelo qual a maiori...,neg
2,3,"First of all I hate those moronic rappers, who...","Primeiro de tudo eu odeio esses raps imbecis, ...",neg
3,4,Not even the Beatles could write songs everyon...,Nem mesmo os Beatles puderam escrever músicas ...,neg
4,5,Brass pictures movies is not a fitting word fo...,Filmes de fotos de latão não é uma palavra apr...,neg
...,...,...,...,...
49454,49456,"Seeing as the vote average was pretty low, and...","Como a média de votos era muito baixa, e o fat...",pos
49455,49457,"The plot had some wretched, unbelievable twist...",O enredo teve algumas reviravoltas infelizes e...,pos
49456,49458,I am amazed at how this movieand most others h...,Estou espantado com a forma como este filme e ...,pos
49457,49459,A Christmas Together actually came before my t...,A Christmas Together realmente veio antes do m...,pos


In [53]:
tokenizer_path_pt = Path('tokenizer/artifacts/tokenizer_pt.json')

tokenizer_pt = TokenizerImDB(vocab_size=30_000, tokenizer_path=tokenizer_path_pt)

tokenizer_pt.encoder('ola')

[13344]

In [76]:
class EncoderPreTrainDataset(Dataset):
    def __init__(
        self,
        tokenizer: TokenizerImDB,
        seq_len: int,
        mask_prob: float,
        file_dataset: Path,
        language: str,
        mask_token_id: int = SpecialTokensInt.MASK.value,
        pad_token_id: int = SpecialTokensInt.PAD.value,
        special_tokens: list[int] = SpecialTokensInt.tolist(),
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.mask_prob = mask_prob
        self.mask_token_id = mask_token_id
        self.pad_token_id = pad_token_id
        self.special_tokens = special_tokens
        self.vocab_size = tokenizer.vocab_size
        self.dataset = pd.read_csv(file_dataset)[f'text_{language}']

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        text = self.dataset.iloc[index]
        text_tokens = self.tokenizer.encoder(text)
        if len(text_tokens) < self.seq_len:
            diff = self.seq_len - len(text_tokens)
            text_tokens += [self.pad_token_id for _ in range(diff)]
        elif len(text_tokens) > self.seq_len:
            text_tokens = text_tokens[: self.seq_len]

        text_tokens = torch.tensor(text_tokens)
        probs = torch.rand(text_tokens.shape)  # cria uma matriz de probas
        mask = (probs < self.mask_prob) * (text_tokens != self.pad_token_id)  # [True * True = 1], [True * False = 0], [False, False = 0]
        for special_token in self.special_tokens:
            mask = mask * (text_tokens != special_token)

        masked = torch.clone(text_tokens).type(torch.int)
        masked_ids = torch.flatten(mask.nonzero())  # -> retorna os ids que foram mascarados
        masked_ids_list = masked_ids.tolist()
        original_masked_tokens = text_tokens[masked_ids_list]  # -> retorna os verdadeiroas ids antes de serem substituidos por MASK
        replace_masked_tokens = self.generate_mlm_tokens(original_masked_tokens.tolist())
        masked[masked_ids_list] = replace_masked_tokens
        return masked, text_tokens, mask

    def generate_mlm_tokens(self, original_tokens: list[int]):
        len_original_tokens = len(original_tokens)
        replace_tokens = torch.rand(len_original_tokens)
        for i in range(len_original_tokens):
            if replace_tokens[i] <= 0.8:  # se esses caras forem menores do que 80% vira MASK
                replace_tokens[i] = self.mask_token_id
            elif 0.8 < replace_tokens[i] <= 0.9:
                replace_tokens[i] = np.random.randint(self.vocab_size)
            else:
                replace_tokens[i] = original_tokens[i]

        return replace_tokens.type(torch.int)

In [77]:
dataset = EncoderPreTrainDataset(
    tokenizer=tokenizer_pt,
    seq_len=124,
    mask_prob=0.15,
    file_dataset=Path('../../data/imdb-reviews-pt-br.csv'),
    language='pt'
)

In [78]:
len(dataset)

49459

In [79]:
dataset[0]

(tensor([ 1336,   191,   376,    16,   140,     3,     3, 15555,  6777,  2437,
           166,   176,   215,   265,   258,     3,   195,     3,   140,  2068,
           125,   346,    18,   537,   125,    57,   476,   925,   125,   619,
          2103,  1376,    49,   188,  1073,   132,  6898,   206,   682,     3,
           476,  1906,   300,   125,   265,     3,    16,   199,   833,   164,
             3,    59,   190, 10662,   167,   941,   295,   505,     3,   124,
           935,   132,   206,   125,    63,   343,  4952,   206,  1868,   125,
           346,     3,   122,   140,   617,  1470,  2366,   122,   125,  7490,
          1934,   206,     3,   125,   333,    16,   122,   513,  4777,   456,
           125,   265,   258,  1514,    16,   358,   199,   164,   143,    59,
         23809,  6475,    18,   140,   617,   167,   140,   393,  9554,   125,
          2528,   437,  4303,   122,   125,   265,  5765,   122, 17111,   434,
           819,   149,    16, 18725], dtype=torch.in

In [117]:
class EncoderMLM(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        seq_len: int,
        d_model: int,
        nx: int,
        n_heads: int,
        hidden_size: int,
        dropout: float = 0.1
    ):
        super().__init__()
        self.d_model = d_model
        self.encoder = TransformerEncoder(
            vocab_size=vocab_size,
            seq_len=seq_len,
            d_model=d_model,
            nx=nx,
            n_heads=n_heads,
            hidden_size=hidden_size,
            dropout=0.1
        )

        self.mlm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask):
        # Flattens the masked ID
        masked_ids = torch.flatten(mask.reshape((-1,)).nonzero())
        # Encoder output
        last_hidden_states = self.encoder(x)
        
        # Validations
        assert mask.numel() == last_hidden_states.shape[0] * last_hidden_states.shape[1], \
            f"Mismatch: mask {mask.numel()} vs encoder {last_hidden_states.numel() // self.d_model}"
        
        # Flatten the hidden states
        all_hidden_states = last_hidden_states.reshape(-1, self.d_model)
        
        # Ensure valid indices
        masked_ids = masked_ids[masked_ids < all_hidden_states.size(0)]
        
        # Get only the masked hidden states
        masked_hidden_states = all_hidden_states[masked_ids, :]
        
        # Predicts only the masked tokens
        logits = self.mlm_head(masked_hidden_states)
        return logits

In [118]:
modelo = EncoderMLM(
    vocab_size=tokenizer_pt.vocab_size,
    seq_len=124,
    d_model=16,
    nx=3,
    n_heads=4,
    hidden_size=248,
)

In [119]:
modelo.encoder(x)

tensor([[[-1.4128e+00,  1.3957e-01, -3.0033e-01, -8.1843e-02, -3.6328e-01,
          -1.2655e+00,  1.9240e+00, -4.2766e-01,  1.6582e-01,  1.4895e+00,
          -4.3955e-01,  1.4440e+00, -1.3942e+00, -1.5491e-01, -5.4119e-01,
           1.2183e+00],
         [-1.3192e+00,  1.0264e+00,  2.3589e-01,  2.0151e+00,  3.6934e-01,
           1.1888e+00, -2.1329e+00, -4.6241e-01, -1.1260e-01, -1.2224e+00,
           2.4236e-01,  1.3585e-01, -1.8543e-01,  6.6822e-01,  3.2967e-01,
          -7.7669e-01],
         [-3.9279e-01, -2.1078e+00,  1.1043e-01, -4.1354e-01, -6.4120e-01,
           2.3036e+00,  2.7222e-01,  1.0344e+00,  6.0608e-01, -3.2543e-01,
          -8.3260e-01,  1.5305e+00, -3.2007e-01,  1.0080e-01, -8.5521e-01,
          -6.9399e-02]],

        [[ 5.2024e-01, -5.6726e-02,  3.2182e-01,  9.3862e-01, -1.3859e+00,
           1.6053e-02, -1.4036e-01,  9.6060e-01, -1.2677e+00,  1.0706e+00,
           1.1306e+00,  8.1581e-01, -2.2246e+00,  3.2285e-01,  3.8517e-01,
          -1.4070e+00],
  

In [120]:
# pq o adamW
optimizer = torch.optim.AdamW(modelo.parameters(), lr=5e-5, weight_decay=1e-5)
dataloader = torch.utils.data.DataLoader(
    dataset, num_workers=8, shuffle=True, batch_size=4
)

In [121]:
for X, y, masked_mask in dataloader:
    masked_ids = torch.flatten(masked_mask.reshape((-1,)).nonzero())
    labels = y.reshape((-1,))[masked_ids]
    out = modelo(x, masked_mask)
    loss = torch.nn.functional.cross_entropy(out, labels)
    print(loss.items())
    loss.backward()
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

AssertionError: Mismatch: mask 496 vs encoder 6