In [1]:
import torch
from torch import Tensor
import torch.nn as nn
import math
from tqdm import tqdm

In [2]:
from torchinfo import summary

In [3]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
torch.set_printoptions(precision=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Configurações de treinamento e do modelo
SEQ_LEN = 12
D_MODEL = 8
N_HEADS = 2
VOCAB_SIZE = tokenizer.vocab_size
LR = 1e-3
LABELS = 3
Nx = 1

In [7]:
# textos com os labels 0: irritado, 1: neutro, 2: feliz
textos = (
    ("I can't believe you did that Idiot! How dare you!", 0),
    ("This is unacceptable! I'm furious! idiot", 0),
    ("The weather today is quite nice.", 1),
    ("I'm over the moon with joy! Everything is going my way!", 2),
    ("I'm ecstatic! Life couldn't be better!", 2)
)

In [8]:
textos_tokens = [(['[CLS]'] + tokenizer.tokenize(text), label) for text, label in textos]

In [9]:
data = [(tokenizer.convert_tokens_to_ids(tokens), label) for tokens, label in textos_tokens] 

In [12]:
# Função que trunca os textos e coloca o PAD token
def add_pad_token(list_tokens: list, seq_len: int = SEQ_LEN) -> list:
    if len(list_tokens) > seq_len:
        return list_tokens[:seq_len]
    else:
        return list_tokens + (seq_len - len(list_tokens)) * [tokenizer.pad_token_id]

In [13]:
data = [(add_pad_token(token_list), label) for token_list, label in data]
data

[([101, 1045, 2064, 1005, 1056, 2903, 2017, 2106, 2008, 10041, 999, 2129], 0),
 ([101, 2023, 2003, 21873, 999, 1045, 1005, 1049, 9943, 999, 10041, 0], 0),
 ([101, 1996, 4633, 2651, 2003, 3243, 3835, 1012, 0, 0, 0, 0], 1),
 ([101, 1045, 1005, 1049, 2058, 1996, 4231, 2007, 6569, 999, 2673, 2003], 2),
 ([101, 1045, 1005, 1049, 14925, 16677, 999, 2166, 2481, 1005, 1056, 2022], 2)]

In [14]:
# transformando em tensor para o pytorch
data = [(torch.tensor(token_list), torch.tensor(label)) for token_list, label in data]

In [15]:
class EncoderClassifier(nn.Module):
    def __init__(
        self,
        vocab_size = VOCAB_SIZE,
        d_model = D_MODEL,
        nx = Nx,
        nhead = N_HEADS,
        dim_feedforward = D_MODEL * 2,
        labels = LABELS
    ):
        super().__init__()
        self.embed_layer = nn.Embedding(vocab_size, d_model, padding_idx=tokenizer.pad_token_id)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=10,
            norm_first=True,
            batch_first=True,
            activation="gelu",
            dropout=0
        )
        self.encoder_block = nn.TransformerEncoder(self.encoder_layer, num_layers=nx)
        self.linear = nn.Sequential(
            nn.Linear(d_model, d_model * 2),
            nn.ReLU(),
            nn.Linear(d_model * 2, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )
        self.output_layer = nn.Linear(d_model, labels)

    def forward(self, x):
        x = self.embed_layer(x)
        x = self.encoder_block(x)
        x = x[0, :]
        x = self.linear(x)
        x = self.output_layer(x)
        return x

In [16]:
model = EncoderClassifier()

In [19]:
summary(model)

Layer (type:depth-idx)                                            Param #
EncoderClassifier                                                 --
├─Embedding: 1-1                                                  244,176
├─TransformerEncoderLayer: 1-2                                    --
│    └─MultiheadAttention: 2-1                                    216
│    │    └─NonDynamicallyQuantizableLinear: 3-1                  72
│    └─Linear: 2-2                                                90
│    └─Dropout: 2-3                                               --
│    └─Linear: 2-4                                                88
│    └─LayerNorm: 2-5                                             16
│    └─LayerNorm: 2-6                                             16
│    └─Dropout: 2-7                                               --
│    └─Dropout: 2-8                                               --
├─TransformerEncoder: 1-3                                         --
│    └─ModuleList: 2-9 

In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [21]:
model.to(device)

EncoderClassifier(
  (embed_layer): Embedding(30522, 8, padding_idx=0)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
    )
    (linear1): Linear(in_features=8, out_features=10, bias=True)
    (dropout): Dropout(p=0, inplace=False)
    (linear2): Linear(in_features=10, out_features=8, bias=True)
    (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0, inplace=False)
    (dropout2): Dropout(p=0, inplace=False)
  )
  (encoder_block): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
        )
        (linear1): Linear(in_features=8, out_features=10, bias=True)
        (dropout): Dropout(p=0, inplac

In [22]:
model(data[0][0].to(device))

tensor([-0.000, -0.124, -0.139], device='cuda:0', grad_fn=<AddBackward0>)

In [23]:
def acc_metric(label, output):
    output = torch.argmax(output, dim=-1)
    return (label == output).float().mean().detach().item()

In [24]:
# realizando o treinamento
model.train()
ITERACOES = 1_000
iterator = tqdm(range(ITERACOES))
for _ in iterator:
    for X, y in data:
        optimizer.zero_grad()
        X, y = X.to(device), y.to(device)
        y_hat = model(X)
        loss = criterion(y_hat, y)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        acc = acc_metric(y, y_hat)
    iterator.set_postfix({"loss": f"{loss.item():6.3f}", "accuracy": f"{acc:.3f}"})
    if loss.item() <= 0.01:
        break

 45%|███████████████████████▏                           | 454/1000 [00:13<00:16, 33.86it/s, loss=0.010, accuracy=1.000]


In [28]:
for i, (X, y) in enumerate(data):
    X, y = X.to(device), y.to(device)
    pred = model(X)
    print('Texto passado:')
    print(textos[i][0])
    print(f'label: {y}')
    print(f'Resultado do modelo: {torch.argmax(pred).item()}')
    print('Resultado do modelo em probas: ')
    pred_prob = torch.softmax(pred, dim=-1)
    print(torch.round(pred_prob, decimals=4))
    print()

Texto passado:
I can't believe you did that Idiot! How dare you!
label: 0
Resultado do modelo: 0
Resultado do modelo em probas: 
tensor([1., 0., 0.], device='cuda:0', grad_fn=<RoundBackward1>)

Texto passado:
This is unacceptable! I'm furious! idiot
label: 0
Resultado do modelo: 0
Resultado do modelo em probas: 
tensor([1., 0., 0.], device='cuda:0', grad_fn=<RoundBackward1>)

Texto passado:
The weather today is quite nice.
label: 1
Resultado do modelo: 1
Resultado do modelo em probas: 
tensor([2.000e-04, 9.998e-01, 0.000e+00], device='cuda:0',
       grad_fn=<RoundBackward1>)

Texto passado:
I'm over the moon with joy! Everything is going my way!
label: 2
Resultado do modelo: 2
Resultado do modelo em probas: 
tensor([0.004, 0.005, 0.990], device='cuda:0', grad_fn=<RoundBackward1>)

Texto passado:
I'm ecstatic! Life couldn't be better!
label: 2
Resultado do modelo: 2
Resultado do modelo em probas: 
tensor([0.004, 0.005, 0.990], device='cuda:0', grad_fn=<RoundBackward1>)

