In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, BertTokenizer
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch import Tensor
import numpy as np
import torch.nn as nn
import torch
from torchinfo import summary
import math
import torch.nn.functional as F

In [3]:
from tqdm import tqdm
import torchmetrics

In [4]:
# download dataset
dataset = load_dataset("AiresPucrs/google-play-apps-review-pt")

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained("pierreguillou/bert-base-cased-squad-v1.1-portuguese")

In [8]:
texto = 'Esse texto que será tokenizado.'

In [9]:
tokens = tokenizer.tokenize(texto)
print(f'TOKENS BERT BASE PT-BT: {tokens}')

TOKENS BERT BASE PT-BT: ['Esse', 'texto', 'que', 'será', 'to', '##ken', '##izado', '.']


In [10]:
VOCAB_SIZE = tokenizer.vocab_size
VOCAB_SIZE

29794

In [11]:
token_ids = tokenizer.encode(texto, add_special_tokens=True)
print(f'TOKENS IDS: {token_ids}')

TOKENS IDS: [101, 3758, 4054, 179, 2810, 374, 8110, 2303, 119, 102]


In [12]:
token_ids = tokenizer.encode(texto, add_special_tokens=False)
print(f'TOKENS IDS: {token_ids}')

TOKENS IDS: [3758, 4054, 179, 2810, 374, 8110, 2303, 119]


# Model Config

In [13]:
VOCAB_SIZE = tokenizer.vocab_size
SEQ_LEN = 124
D_MODEL = 64
N_LAYERS = 6
N_HEADS = 4
N_OUTPUT = 1
HIDDEN_SIZE = 512
DROPOUT = 0.1
BATCH_SIZE = 32
EPOCHS = 50
DROPOUT = 0.10
LR = 1e-4

# Dataset

In [14]:
tokenizer.add_special_tokens({
    "bos_token": "[SOS]",  # Definir token de início de sentença
    "eos_token": "[EOS]"   # Definir token de fim de sentença
})
tokenizer.bos_token_id = 1
tokenizer.eos_token_id = 2

In [33]:
class GoogleDataset(Dataset):
    def __init__(
        self,
        dataset,
        tokenizer: AutoTokenizer,
        max_len: int = SEQ_LEN,
    ):
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.vocab_size = tokenizer.vocab_size

    def __len__(self):
        return self.dataset['train'].num_rows

    def __getitem__(self, index):
        text = self.dataset['train'][index]['review']
        tokenized = self.tokenizer.encode(text, add_special_tokens=False)
        if len(tokenized) < self.max_len + 1:
            # tokenized = tokenized.tolist()
            # porque colocar o <pad> antes do texto?
            tokenized = [self.tokenizer.bos_token_id] + tokenized + [self.tokenizer.eos_token_id]
            tokenized += [self.tokenizer.pad_token_id] * ((self.max_len + 1) - len(tokenized))
        else:
            tokenized = tokenized[:self.max_len + 1]
        tokenized = torch.tensor(tokenized)

        decoder_input = tokenized[: self.max_len]
        true_output = tokenized[1 : self.max_len + 1]
        # return decoder_input, true_output
        return {
            'tokens_input': decoder_input.to(device), 
            'tokens_output': true_output.to(device)
        }

In [34]:
train_dataset = GoogleDataset(dataset, tokenizer)

In [35]:
len(train_dataset[12]["tokens_output"])

124

In [36]:
# print(f'O que vai entrar no modelo: \n{train_dataset[12][0]}')
print(f'O que vai entrar no modelo: \n{train_dataset[12]["tokens_input"]}')
print('*'*30)
# print(f'O que deve sair do modelo: \n{train_dataset[12][1]}')
print(f'O que deve sair do modelo: \n{train_dataset[12]["tokens_output"]}')

O que vai entrar no modelo: 
tensor([16357, 19589, 22280, 15212,   230,  3138,   785,  3264,   179,  2686,
         7730,   122,   785,   123,  7122,  3138,   122,   123,  1457,   179,
         1815, 18029,   682,  3385,   125,  7480,  2446,   304, 22280,   180,
         3508,   125, 10109,   123,   681,  7480,  2446,   304, 22280,   122,
          123,   179,  1941,   122,  4666,  7480,  2446,   304, 22280,   173,
         2925,   240,   210,  5628, 18029,   222,   995,  1903,   125,  7480,
         2446,   304, 22280,  7480,  2446,   304, 22280,   173,  3846,   739,
         1903,   123,   547,   179,   122,  8876,   260, 18120, 22281,   171,
         2702,  6419, 15245,   146, 16045, 22279,   291,   123,  3652,   173,
         3674,   122,  2044,  4133,   123,  3524,   304, 22280,   171,  4938,
         2547,   373,   179,  1966,  1160,  2820,   338,   322, 14312,   128,
          221,   123,  2004, 22278,  6326,  8430,   222,  7480,   325,  6808,
          122, 15397, 20383,   178]

# Model

In [37]:
VOCAB_SIZE = tokenizer.vocab_size
SEQ_LEN = 124
D_MODEL = 64
N_LAYERS = 6
N_HEADS = 4
N_OUTPUT = 1
HIDDEN_SIZE = 512
DROPOUT = 0.1
N = 128
EPOCHS = 50
DROPOUT = 0.10
LR = 5e-5

In [38]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int = D_MODEL, dropout: float = 0.01, seq_len: int = SEQ_LEN):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        
        pe = torch.zeros(SEQ_LEN, D_MODEL)
        k = torch.arange(0, seq_len).unsqueeze(1) 
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10_000) / d_model))
        # sine for even indices
        pe[:, 0::2] = torch.sin(k * div_term)
        # cos for odd indices
        pe[:, 1::2] = torch.cos(k * div_term)
        # add batch dim
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)

    def forward(self, x: Tensor):
        # print(f'Shape antes do PE: {x.shape}')
        # print(f'Shape do que será somado: {self.pe[:, :x.size(1)].requires_grad_(False).shape}')
        x += self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [39]:
class GooglePlayAppsReviewGenerator(nn.Module):
    def __init__(
        self,
        vocab_size=VOCAB_SIZE,
        seq_len=SEQ_LEN,
        d_model=D_MODEL,
        nx=N_LAYERS,
        num_heads=N_HEADS,
        hidden_size=HIDDEN_SIZE,
        dim_feedforward=HIDDEN_SIZE,
        dropout=DROPOUT,
        pad_token_id=0,
    ):
        super().__init__()

        self.seq_len = seq_len
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.nx = nx
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        
        self.embedding_layer = nn.Embedding(
            num_embeddings=vocab_size,  # número do maior token eos que foi adicionado + 1
            embedding_dim=self.d_model,
            padding_idx=0
        )
        self.positional_encoding = PositionalEncoding(
            d_model=self.d_model,
            dropout=self.dropout,
            seq_len=self.seq_len
        )

        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=self.d_model, 
            nhead=self.num_heads,
            dim_feedforward=self.dim_feedforward,
            dropout=self.dropout,
            norm_first=True,
            batch_first=True,
            activation='gelu'
        )
        self.decoder_block = nn.TransformerDecoder(
            self.decoder_layer,
            num_layers=self.nx,
        )
        self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(self.seq_len)
        self.layer_norm = nn.LayerNorm(normalized_shape=self.d_model)

        self.z_projection = nn.Linear(self.d_model, self.vocab_size)  # output projection
        

    def forward(self, tokens: Tensor, memory=None):
        x = self.embedding_layer(tokens)
        x = self.positional_encoding(x)
        if memory is None:
            memory = torch.zeros_like(x)
        x = self.decoder_block(
            x, 
            memory=memory, 
            tgt_mask=self.tgt_mask
        )
        x = self.layer_norm(x)
        x = self.z_projection(x)
        return x

    def generate_next_token(
        self,
        tokens: Tensor,
        memory=None, 
        temperature=1.0
    ):
        logits = self.forward(tokens, memory)
        last_token_logits = logits[:, -1, :] / temperature

        # sample
        next_token = torch.multinomial(
            F.softmax(last_token_logits, dim=-1),
            num_samples=1
        )
        return next_token

In [40]:
model = GooglePlayAppsReviewGenerator().to(device)

In [41]:
summary(model)

Layer (type:depth-idx)                                            Param #
GooglePlayAppsReviewGenerator                                     --
├─Embedding: 1-1                                                  1,906,816
├─PositionalEncoding: 1-2                                         --
│    └─Dropout: 2-1                                               --
├─TransformerDecoderLayer: 1-3                                    --
│    └─MultiheadAttention: 2-2                                    12,480
│    │    └─NonDynamicallyQuantizableLinear: 3-1                  4,160
│    └─MultiheadAttention: 2-3                                    12,480
│    │    └─NonDynamicallyQuantizableLinear: 3-2                  4,160
│    └─Linear: 2-4                                                33,280
│    └─Dropout: 2-5                                               --
│    └─Linear: 2-6                                                32,832
│    └─LayerNorm: 2-7                                             128

In [49]:
LR = 5e-4

In [50]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [43]:
dataloader = DataLoader(
    train_dataset,
    num_workers=0, 
    shuffle=True, 
    batch_size=N
)

In [44]:
EPOCHS

50

In [45]:
for epoch in range(EPOCHS):
    model.train()

    for batch, tokens in enumerate(dataloader):
        X, Y = tokens['tokens_input'], tokens['tokens_output']
        Y_hat = model(X)        
        Y_hat_flat = Y_hat.view(-1, Y_hat.size(-1))
        Y_flat = Y.view(-1)
        loss = criterion(Y_hat_flat, Y_flat)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        break
    break

In [46]:
loss

tensor(10.3753, device='cuda:0', grad_fn=<NllLossBackward0>)

In [47]:
Y_hat.shape

torch.Size([128, 124, 29794])

In [51]:
# Inicializa métricas
accuracy_metric = torchmetrics.Accuracy(task="multiclass", num_classes=VOCAB_SIZE).to(device)
loss_metric = torchmetrics.MeanMetric().to(device)

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    # Usa tqdm para exibir progresso da época
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for batch, tokens in enumerate(progress_bar):
        X, Y = (
            tokens['tokens_input'].to(device),
            tokens['tokens_output'].to(device)
        )
        
        # Forward pass
        Y_hat = model(X)
        
        # Ajuste para CrossEntropyLoss
        Y_hat_flat = Y_hat.view(-1, Y_hat.size(-1))  # [batch_size * seq_len, vocab_size]
        Y_flat = Y.view(-1)  # [batch_size * seq_len]

        # Calcula loss e acurácia
        loss = criterion(Y_hat_flat, Y_flat)
        acc = accuracy_metric(Y_hat_flat.argmax(dim=-1), Y_flat)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Atualiza métricas
        loss_metric.update(loss)
        progress_bar.set_postfix(loss=loss.item(), acc=acc.item())

    # Mostra métricas finais da época
    epoch_loss = loss_metric.compute().item()
    epoch_acc = accuracy_metric.compute().item()
    print(f"Epoch {epoch+1}: Loss = {epoch_loss:.4f}, Acc = {epoch_acc:.4f}")

    # Reseta métricas para próxima época
    accuracy_metric.reset()
    loss_metric.reset()

Epoch 1/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.461, loss=3.23]


Epoch 1: Loss = 3.2992, Acc = 0.4649


Epoch 2/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.453, loss=3.28]


Epoch 2: Loss = 3.2352, Acc = 0.4691


Epoch 3/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.49, loss=3.05] 


Epoch 3: Loss = 3.1836, Acc = 0.4724


Epoch 4/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.463, loss=3.26]


Epoch 4: Loss = 3.1416, Acc = 0.4751


Epoch 5/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.498, loss=2.98]


Epoch 5: Loss = 3.1047, Acc = 0.4775


Epoch 6/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.43, loss=3.38] 


Epoch 6: Loss = 3.0756, Acc = 0.4797


Epoch 7/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.493, loss=3.02]


Epoch 7: Loss = 3.0470, Acc = 0.4818


Epoch 8/50: 100%|██████████| 157/157 [00:45<00:00,  3.46it/s, acc=0.47, loss=3.08] 


Epoch 8: Loss = 3.0229, Acc = 0.4834


Epoch 9/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.471, loss=3.11]


Epoch 9: Loss = 3.0013, Acc = 0.4851


Epoch 10/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.49, loss=2.98] 


Epoch 10: Loss = 2.9807, Acc = 0.4867


Epoch 11/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.532, loss=2.69]


Epoch 11: Loss = 2.9602, Acc = 0.4886


Epoch 12/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.495, loss=2.98]


Epoch 12: Loss = 2.9445, Acc = 0.4898


Epoch 13/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.494, loss=2.85]


Epoch 13: Loss = 2.9278, Acc = 0.4913


Epoch 14/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.502, loss=2.86]


Epoch 14: Loss = 2.9123, Acc = 0.4927


Epoch 15/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.528, loss=2.71]


Epoch 15: Loss = 2.8967, Acc = 0.4943


Epoch 16/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.484, loss=2.95]


Epoch 16: Loss = 2.8851, Acc = 0.4954


Epoch 17/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.463, loss=3.13]


Epoch 17: Loss = 2.8732, Acc = 0.4966


Epoch 18/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.523, loss=2.72]


Epoch 18: Loss = 2.8591, Acc = 0.4976


Epoch 19/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.504, loss=2.83]


Epoch 19: Loss = 2.8477, Acc = 0.4990


Epoch 20/50: 100%|██████████| 157/157 [00:45<00:00,  3.42it/s, acc=0.51, loss=2.74] 


Epoch 20: Loss = 2.8368, Acc = 0.5000


Epoch 21/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.52, loss=2.71] 


Epoch 21: Loss = 2.8264, Acc = 0.5011


Epoch 22/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.476, loss=2.99]


Epoch 22: Loss = 2.8177, Acc = 0.5020


Epoch 23/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.537, loss=2.58]


Epoch 23: Loss = 2.8061, Acc = 0.5028


Epoch 24/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.494, loss=2.83]


Epoch 24: Loss = 2.7988, Acc = 0.5036


Epoch 25/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.487, loss=2.91]


Epoch 25: Loss = 2.7903, Acc = 0.5046


Epoch 26/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.539, loss=2.58]


Epoch 26: Loss = 2.7807, Acc = 0.5054


Epoch 27/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.511, loss=2.81]


Epoch 27: Loss = 2.7739, Acc = 0.5062


Epoch 28/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.49, loss=2.93] 


Epoch 28: Loss = 2.7675, Acc = 0.5070


Epoch 29/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.504, loss=2.79]


Epoch 29: Loss = 2.7597, Acc = 0.5074


Epoch 30/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.534, loss=2.52]


Epoch 30: Loss = 2.7514, Acc = 0.5081


Epoch 31/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.459, loss=2.99]


Epoch 31: Loss = 2.7473, Acc = 0.5089


Epoch 32/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.525, loss=2.72]


Epoch 32: Loss = 2.7395, Acc = 0.5092


Epoch 33/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.492, loss=2.83]


Epoch 33: Loss = 2.7333, Acc = 0.5103


Epoch 34/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.508, loss=2.75]


Epoch 34: Loss = 2.7275, Acc = 0.5107


Epoch 35/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.519, loss=2.65]


Epoch 35: Loss = 2.7216, Acc = 0.5111


Epoch 36/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.471, loss=2.96]


Epoch 36: Loss = 2.7171, Acc = 0.5119


Epoch 37/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.525, loss=2.63]


Epoch 37: Loss = 2.7102, Acc = 0.5123


Epoch 38/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.514, loss=2.65]


Epoch 38: Loss = 2.7053, Acc = 0.5128


Epoch 39/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.516, loss=2.67]


Epoch 39: Loss = 2.7005, Acc = 0.5133


Epoch 40/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.515, loss=2.65]


Epoch 40: Loss = 2.6952, Acc = 0.5138


Epoch 41/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.473, loss=2.95]


Epoch 41: Loss = 2.6919, Acc = 0.5141


Epoch 42/50: 100%|██████████| 157/157 [00:45<00:00,  3.44it/s, acc=0.506, loss=2.7] 


Epoch 42: Loss = 2.6869, Acc = 0.5147


Epoch 43/50: 100%|██████████| 157/157 [00:45<00:00,  3.46it/s, acc=0.54, loss=2.53] 


Epoch 43: Loss = 2.6812, Acc = 0.5152


Epoch 44/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.486, loss=2.86]


Epoch 44: Loss = 2.6789, Acc = 0.5154


Epoch 45/50: 100%|██████████| 157/157 [00:45<00:00,  3.46it/s, acc=0.516, loss=2.64]


Epoch 45: Loss = 2.6736, Acc = 0.5159


Epoch 46/50: 100%|██████████| 157/157 [00:45<00:00,  3.46it/s, acc=0.518, loss=2.71]


Epoch 46: Loss = 2.6696, Acc = 0.5164


Epoch 47/50: 100%|██████████| 157/157 [00:45<00:00,  3.46it/s, acc=0.516, loss=2.69]


Epoch 47: Loss = 2.6658, Acc = 0.5169


Epoch 48/50: 100%|██████████| 157/157 [00:45<00:00,  3.45it/s, acc=0.489, loss=2.81]


Epoch 48: Loss = 2.6623, Acc = 0.5171


Epoch 49/50: 100%|██████████| 157/157 [00:45<00:00,  3.46it/s, acc=0.494, loss=2.8] 


Epoch 49: Loss = 2.6592, Acc = 0.5173


Epoch 50/50: 100%|██████████| 157/157 [00:45<00:00,  3.46it/s, acc=0.493, loss=2.82]

Epoch 50: Loss = 2.6555, Acc = 0.5179





In [52]:
PATH_MODEL = 'model/model0.pth'

In [53]:
torch.save(model.state_dict(), PATH_MODEL)