In [1]:
import math
import numpy as np
from pathlib import Path

In [2]:
from datasets import load_dataset
from tokenizers import Tokenizer
from transformers import BertTokenizer

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader, Dataset
from torchinfo import summary
from torchmetrics import Accuracy

In [4]:
from utils.training_loop import eval_model, train_model

In [5]:
# Verificar o número de GPUs disponíveis
num_gpus = torch.cuda.device_count()
print(f"Número de GPUs disponíveis: {num_gpus}")

# Obter informações detalhadas sobre cada GPU
for i in range(num_gpus):
    print(f"--- GPU {i} ---")
    print(f"Nome: {torch.cuda.get_device_name(i)}")
    print(f"Memória total: {torch.cuda.get_device_properties(i).total_memory / (1024**3):.2f} GB")
    print(f"Memória disponível: {torch.cuda.memory_allocated(i) / (1024**3):.2f} GB")
    print(f"Memória reservada: {torch.cuda.memory_reserved(i) / (1024**3):.2f} GB")
    print(f"Capacidade de Computação: {torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}")
    print()

Número de GPUs disponíveis: 1
--- GPU 0 ---
Nome: NVIDIA L4
Memória total: 22.17 GB
Memória disponível: 0.00 GB
Memória reservada: 0.00 GB
Capacidade de Computação: 8.9



In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'DEVICE: {device}')

DEVICE: cuda


In [7]:
dataset = load_dataset("yelp_review_full")

# Acessar os dados
train_data = dataset["train"]
test_data = dataset["test"]

# Exibir informações sobre o conjunto de dados
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [8]:
# looking at the text
train_data['text'][0]

"dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."

In [9]:
# Calculating the average of texts
mean_words_text = np.mean(
    list(map(lambda x: len(x.split()), train_data['text']))
)

std_words_text = np.std(
    list(map(lambda x: len(x.split()), train_data['text']))
)

In [10]:
print(f'Média de palavras por texto: {mean_words_text:.2f}.')
print(f'Desvio padrão de palavras por texto: {std_words_text:.2f}.')

Média de palavras por texto: 134.10.
Desvio padrão de palavras por texto: 121.40.


In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [12]:
text = train_data['text'][13]
print(f'TEXT: \n{text}')
tokens = tokenizer.tokenize(text)
print(f'TOKENS: \n{tokens}')
tokens_ids = tokenizer.encode(text, add_special_tokens=True)
print(f'TOKENS IDS: \n{tokens_ids}')

TEXT: 
After waiting for almost 30 minutes to trade in an old phone part of the buy back program, our customer service rep incorrectly processed the transaction. This led to us waiting another 30 minutes for him to correct it. Don't visit this store if you want pleasant or good service.
TOKENS: 
['after', 'waiting', 'for', 'almost', '30', 'minutes', 'to', 'trade', 'in', 'an', 'old', 'phone', 'part', 'of', 'the', 'buy', 'back', 'program', ',', 'our', 'customer', 'service', 'rep', 'incorrectly', 'processed', 'the', 'transaction', '.', 'this', 'led', 'to', 'us', 'waiting', 'another', '30', 'minutes', 'for', 'him', 'to', 'correct', 'it', '.', 'don', "'", 't', 'visit', 'this', 'store', 'if', 'you', 'want', 'pleasant', 'or', 'good', 'service', '.']
TOKENS IDS: 
[101, 2044, 3403, 2005, 2471, 2382, 2781, 2000, 3119, 1999, 2019, 2214, 3042, 2112, 1997, 1996, 4965, 2067, 2565, 1010, 2256, 8013, 2326, 16360, 19721, 13995, 1996, 12598, 1012, 2023, 2419, 2000, 2149, 3403, 2178, 2382, 2781, 2005, 20

In [26]:
SEQ_LEN = 256  # nº de tokens de entrada do modelo
D_MODEL = 64  # nº de dimensões de embedding
N_HEADS = 4  # nº de cabeças utilizadas no multi-head attention
Nx = 2  # nº de vezes que é repassado no multi-head attention
N_OUTPUT = 5  # nº de classes de saida
VOCAB_SIZE = tokenizer.vocab_size  # vocab size
LR = 1e-5  # Learning Rate
BATCH_SIZE = 32  # Batch Size
EPOCHS = 5  # épocas de trainamento

In [14]:
class YelpReviewFullDataset(Dataset):
    def __init__(
        self,
        data, 
        tokenizer: Tokenizer = tokenizer, 
        seq_len: int = SEQ_LEN
    ) -> None:
        self.data = data
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self) -> int:
        return self.data.num_rows
        
    def __getitem__(self, id_i) -> dict[int, list[int]]:
        item = self.data[id_i]
        label, text = item['label'], item['text']
        tokens_list = tokenizer.encode(
            text,
            max_length=self.seq_len,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',  # Adiciona padding até max_length
            return_tensors='pt' 
        )
        
        return {
            'label': label, 
            'tokens': tokens_list.squeeze(0), 
            'text': text
        }

In [15]:
train_dataset = YelpReviewFullDataset(train_data)
test_dataset = YelpReviewFullDataset(test_data)

In [16]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE)

***

# MODEL

In [17]:
class PositionalEncoding(nn.Module):
    """
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """

    def __init__(self, d_model, seq_len, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)

In [18]:
class YepReviewModel(nn.Module):
    def __init__(
        self,
        seq_len: int = SEQ_LEN,
        d_model: int = D_MODEL,
        vocab_size: int = VOCAB_SIZE,
        num_heads: int = N_HEADS,
        n_x: int = Nx,
        dropout: float = 0.1,
        n_outputs: int = N_OUTPUT
    ):
        super().__init__()
        # configurações do modelo
        self.seq_len = seq_len
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.n_x = n_x
        self.dropout = dropout
        self.n_outputs = n_outputs

        # componentes
        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size, 
            embedding_dim=self.d_model, 
            padding_idx=0
        )
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.d_model, 
            nhead=self.num_heads,
            dropout=self.dropout, 
            norm_first=True, 
            batch_first=True,
            activation="gelu"
        )
        self.encoder_block = nn.TransformerEncoder(self.encoder_layer, num_layers=self.n_x)
        
        self.linear_layer = nn.Sequential(
            nn.Linear(self.d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8)
        )
        self.output_layer = nn.Linear(8, self.n_outputs)
        self.init_weights()
    
    def init_weights(self):
        for layer in self.linear_layer:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.encoder_block(x)
        # Pegando a representação vetorial do token <CLS>
        x = x[:, 0, :]
        x = self.linear_layer(x)
        # x = F.layer_norm(x, x.size()[1:])
        x = self.output_layer(x)
        return x

In [19]:
model = YepReviewModel().to(device)



In [20]:
try:
    model.load_state_dict(torch.load('models/yepreview_model_.pth', map_location=device))
    model.to(device)
except Exception as e:
    print(e)

In [21]:
summary(model)

Layer (type:depth-idx)                                            Param #
YepReviewModel                                                    --
├─Embedding: 1-1                                                  1,953,408
├─TransformerEncoderLayer: 1-2                                    --
│    └─MultiheadAttention: 2-1                                    12,480
│    │    └─NonDynamicallyQuantizableLinear: 3-1                  4,160
│    └─Linear: 2-2                                                133,120
│    └─Dropout: 2-3                                               --
│    └─Linear: 2-4                                                131,136
│    └─LayerNorm: 2-5                                             128
│    └─LayerNorm: 2-6                                             128
│    └─Dropout: 2-7                                               --
│    └─Dropout: 2-8                                               --
├─TransformerEncoder: 1-3                                         --
│  

In [24]:
from torch.optim.lr_scheduler import OneCycleLR

In [27]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()
scheduler = OneCycleLR(optimizer, max_lr=LR, 
                       steps_per_epoch=len(train_loader), 
                       epochs=EPOCHS)

In [28]:
loss_eval_list = train_model(
    model,
    train_loader,
    test_loader,
    epochs=EPOCHS,
    scheduler=scheduler,
    optimizer=optimizer,
    criterion=criterion
)

Epoch 1/5:  34%|███▎      | 6816/20313 [08:11<16:13, 13.86it/s, loss=0.9342, accuracy=0.5954]


KeyboardInterrupt: 

In [None]:
acc_test = eval_model(model, test_loader)
print(acc_test)

# TODO
- [ ] Olhar as métricas
- [ ] Olhar o notebook do chary
- [ ] Melhorar o modelo
- [ ] Arrumar o notebook
- [ ] Fazer uma visualização com o umap