In [43]:
from pathlib import Path
from enum import Enum
from tqdm import tqdm
import math

In [44]:
from datasets import load_dataset

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset 

In [46]:
from torchinfo import summary

In [47]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [48]:
dataset = load_dataset("yelp_review_full")

# Acessar os dados
train_data = dataset["train"]
test_data = dataset["test"]

# Exibir informações sobre o conjunto de dados
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


# Configs

In [49]:
class SpecialTokens(Enum):
    PAD = 0
    UNK = 1
    CLS = 2

special_tokens = SpecialTokens

In [74]:
SEQ_LEN = 256
D_MODEL = 64
N_HEADS = 8
Nx = 6
N_OUTPUT = 5
VOCAB_SIZE = 5_000
LR = 1e-5
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_TEST = 32
EPOCHS = 20

# Tokenizer

In [75]:
def text_iterator(data):
    for text in data['text']:
        yield text.lower()

In [76]:
def word_level_tokenizer(data, path_tokenizer: Path = Path('tokenizer/tokenizer.json')):
    if not Path.exists(path_tokenizer):
        tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
        tokenizer.pre_tokenizer = Whitespace()

        # Definindo o PAD token como o primeiro na lista de tokens especiais
        special_tokens = ["<PAD>", "<UNK>", "<CLS>"]

        trainer = BpeTrainer(special_tokens=special_tokens, min_frequency=5, vocab_size=VOCAB_SIZE)
        tokenizer.train_from_iterator(text_iterator(data), trainer=trainer)
        tokenizer.save(str(path_tokenizer))
    else:
        tokenizer = Tokenizer.from_file(str(path_tokenizer))

    return tokenizer

In [77]:
tokenizer = word_level_tokenizer(train_data)

In [78]:
print(f'VOCAB SIZE: {tokenizer.get_vocab_size()}')

VOCAB SIZE: 5000


In [79]:
tokenizer.encode('<PAD>').ids

[0]

In [80]:
tokenizer.encode('<CLS>').ids

[2]

# Dataset

In [81]:
from typing import Tuple, List

class YelpReviewFullDataset(Dataset):
    def __init__(self, data, tokenizer: Tokenizer = tokenizer, seq_len: int = SEQ_LEN) -> None:
        self.data = data
        self.tokenizer = tokenizer
        self.CLS_token_id = 2
        self.PAD_token_id = 0
        self.seq_len = seq_len

    def __len__(self) -> int:
        return self.data.num_rows
        
    def __getitem__(self, id_i) -> Tuple[int, List[int]]:
        item = self.data[id_i]
        label, text = item['label'], item['text'].lower()
        tokens_list = [self.CLS_token_id] + self.tokenizer.encode(text).ids
        tokens_list = self.truncate_seq(tokens_list)
        return {'label': label, 'tokens': torch.tensor(tokens_list), 'text': text}

    def truncate_seq(self, tokens_list: List[int]):
        len_token_list = len(tokens_list)
        if len_token_list > self.seq_len:
            return tokens_list[: self.seq_len]
        elif len_token_list < self.seq_len:
            return tokens_list + [self.PAD_token_id] * (self.seq_len - len_token_list)
        return tokens_list

In [82]:
train_dataset = YelpReviewFullDataset(train_data)
test_dataset = YelpReviewFullDataset(test_data)

In [83]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE_TRAIN)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE_TEST)

# Model

In [84]:
class PositionalEncoding(nn.Module):
    """
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """

    def __init__(self, d_model, seq_len, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)

In [85]:
class YepReviewModel(nn.Module):
    def __init__(
        self,
        seq_len: int = SEQ_LEN,
        d_model: int = D_MODEL,
        vocab_size: int = VOCAB_SIZE,
        num_heads: int = N_HEADS,
        n_x: int = Nx,
        dropout: float = 0.0,
        n_outputs: int = N_OUTPUT
    ):
        super().__init__()
        # configurações do modelo
        self.seq_len = seq_len
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.n_x = n_x
        self.dropout = dropout
        self.n_outputs = n_outputs

        # componentes
        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size, 
            embedding_dim=self.d_model, 
            padding_idx=SpecialTokens.PAD.value)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.d_model, 
            nhead=self.num_heads,
            dropout=self.dropout, 
            norm_first=True, 
            batch_first=True,
            activation="gelu"
        )
        self.encoder_block = nn.TransformerEncoder(self.encoder_layer, num_layers=self.n_x)
        self.linear_layer = nn.Sequential(
            nn.Linear(self.d_model, 124),
            nn.ReLU(),
            nn.Linear(124, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8)
        )
        self.output_layer = nn.Linear(8, self.n_outputs)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding_layer.weight.data.uniform_(-initrange, initrange)
        
        self.linear_layer[0].weight.data.uniform_(-initrange, initrange)
        self.linear_layer[0].bias.data.zero_()

        self.linear_layer[2].weight.data.uniform_(-initrange, initrange)
        self.linear_layer[2].bias.data.zero_()

        self.linear_layer[4].weight.data.uniform_(-initrange, initrange)
        self.linear_layer[4].bias.data.zero_()

        self.linear_layer[6].weight.data.uniform_(-initrange, initrange)
        self.linear_layer[6].bias.data.zero_()

        self.linear_layer[8].weight.data.uniform_(-initrange, initrange)
        self.linear_layer[8].bias.data.zero_()

    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.encoder_layer(x)
        # Pegando a representação vetorial do token <CLS>
        x = x[:, 0, :]
        x = self.linear_layer(x)
        x = self.output_layer(x)
        return x

In [86]:
model = YepReviewModel().to(device)

In [87]:
summary(model)

Layer (type:depth-idx)                                            Param #
YepReviewModel                                                    --
├─Embedding: 1-1                                                  320,000
├─TransformerEncoderLayer: 1-2                                    --
│    └─MultiheadAttention: 2-1                                    12,480
│    │    └─NonDynamicallyQuantizableLinear: 3-1                  4,160
│    └─Linear: 2-2                                                133,120
│    └─Dropout: 2-3                                               --
│    └─Linear: 2-4                                                131,136
│    └─LayerNorm: 2-5                                             128
│    └─LayerNorm: 2-6                                             128
│    └─Dropout: 2-7                                               --
│    └─Dropout: 2-8                                               --
├─TransformerEncoder: 1-3                                         --
│    

In [88]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [89]:
def acc_metric(label, output):
    output = torch.argmax(output, dim=1)
    return (label == output).float().mean().detach().item()

In [90]:
next(iter(train_loader))

{'label': tensor([0, 3, 0, 0, 3, 2, 0, 4, 4, 3, 2, 1, 4, 2, 0, 4, 2, 3, 2, 0, 4, 0, 0, 3,
         1, 4, 4, 1, 3, 4, 1, 1]),
 'tokens': tensor([[   2,   47, 1911,  ...,    0,    0,    0],
         [   2,  244,  466,  ...,    0,    0,    0],
         [   2, 3123,  650,  ...,    0,    0,    0],
         ...,
         [   2,  852,   39,  ...,    0,    0,    0],
         [   2,  453,   79,  ...,    0,    0,    0],
         [   2,   47, 2683,  ...,   47,  626,  415]]),
 'text': ["i hate this place. its always a long wait & every time i've came here dr.trafreshi  just sits on his laptop hardly looking up.. he seems to not care by any means! and anytime i voice my concerns about something he not only minimized them but acts as if i make it up!",
  'great german food, portions are huge. the beers are brewed on the premises noisy place, great environment to hang  out with friends.',
  "2nd visit. great food menu when i visited the 1st time! however, probably never ever going to sit at the bar e

In [91]:
batch = next(iter(train_loader))
batch_labels, batch_tokens, batch_texts = batch['label'], batch['tokens'], batch['text']

In [92]:
print('#### TEXTO ####')
print(batch_texts[0])
print(f'## Label: {batch_labels[0]} ##')

#### TEXTO ####
chompies is great, i love the corned beef & pastrami sandwich..the food is very tastey..also if you want some amazing pancakes try the wheat stuffed granola ones they are out of this world.. on shea the chompies on shea has one problem, its the service, now the servers are nice & all but they seem to maybe be understaffed, takes forever to get your order. another thing that caused me to give it 3 stars is the price, chompies is pretty darn expensive, expect to pay about $15-$20 for a basic breakfast, i only order 2 eggs, potatos & dry toast & sometimes pancakes..when i go there for lunch & get the corned beef sandwish, fries, & a drink its about $25..but the portions are large & u will definitley not leave there hungry!
## Label: 2 ##


In [93]:
model(batch_tokens[:1, :].to(device))

tensor([[ 0.2251, -0.4110, -2.2903,  4.9426,  1.9228]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [95]:
batch_tokens[:2, :].unsqueeze(0)

tensor([[[   2,  547,  465,  635,   86,  244,   14,   47,  441,   70, 4983,
           907,    8, 4853,  644,  122,   70,  176,   86,  167,  692,   63,
           122,  338,  175,  117,  325,  196,  668, 2344,  347,   70, 4522,
          2393,  309, 3283,  649, 1332,  135,  140,  137,   96,  136, 1745,
           122,   78,  281,   39,   70,  547,  465,  635,   78,  281,   39,
           352,  149, 1183,   14,  483,   70,  242,   14,  267,   70, 1609,
           140,  339,    8,  145,  128,  135, 1241,   79,  744,   99,  667,
          4086,   14, 1786, 2220,   79,  186,  288,  326,   16,  538,  188,
           119,  265,  701,   92,   79,  536,   82,   21,  659,   86,   70,
           509,   14,  547,  465,  635,   86,  413, 4654, 1105,   14,  603,
            79,  629,  262,    6,  891, 4678,  720,  106,   39, 2454,  836,
            14,   47,  298,  326,   20, 1479,   14,  958,   57,    8, 1056,
          1683,    8, 1643, 2344,  122,  234,   47,  177,  180,  106,  592,
            

# Treinando em uma Amostra
Isso ajuda a ver se o modelo está convergindo

In [96]:
model.train()
ITERACOES = 1_000_000
iterator = tqdm(range(ITERACOES))
X = batch_tokens[0, :].unsqueeze(0).to(device)
y = batch_labels[0].unsqueeze(0).to(device)
for _ in iterator:
    y_hat = model(X)
    loss = criterion(y_hat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    acc = acc_metric(y, y_hat)
    iterator.set_postfix({"loss": f"{loss.item():6.3f}", "accuracy": f"{acc:.3f}"})
    if loss.item() <= 0.5:
        break

  0%|                                             | 175/1000000 [00:01<2:07:01, 131.18it/s, loss=0.499, accuracy=1.000]


In [97]:
print(f'LABEL: {y.item()}')
print(f'PREDICT: {torch.argmax(y_hat)}')
print(f'PREDICT PROBA: {F.softmax(y_hat, dim=-1).tolist()}')

LABEL: 2
PREDICT: 2
PREDICT PROBA: [[0.059670496731996536, 0.07482034713029861, 0.6071296334266663, 0.15909965336322784, 0.09927991032600403]]


In [98]:
print(f'logit predict: {y_hat.tolist()} | true label: {y}')

logit predict: [[-0.5622339248657227, -0.3359818458557129, 1.7576706409454346, 0.4184591472148895, -0.05312836170196533]] | true label: tensor([2], device='cuda:0')


# Treinando em um Batch

In [99]:
model.train()
ITERACOES = 1_000_000
iterator = tqdm(range(ITERACOES))
X = batch_tokens.to(device)
y = batch_labels.to(device)
for _ in iterator:
    y_hat = model(X)
    loss = criterion(y_hat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    acc = acc_metric(y, y_hat)
    iterator.set_postfix({"loss": f"{loss.item():6.3f}", "accuracy": f"{acc:.3f}"})
    if loss.item() <= 0.1:
        break

  0%|                                             | 1727/1000000 [00:44<7:12:14, 38.49it/s, loss=0.100, accuracy=1.000]


In [100]:
torch.argmax(y_hat, dim=-1)

tensor([2, 2, 3, 0, 0, 2, 4, 3, 0, 3, 2, 4, 3, 3, 3, 2, 2, 2, 3, 1, 4, 3, 3, 3,
        1, 0, 3, 0, 1, 2, 2, 3], device='cuda:0')

In [101]:
y

tensor([2, 2, 3, 0, 0, 2, 4, 3, 0, 3, 2, 4, 3, 3, 3, 2, 2, 2, 3, 1, 4, 3, 3, 3,
        1, 0, 3, 0, 1, 2, 2, 3], device='cuda:0')

# Treinando para o conjunto de dados inteiro

In [102]:
@torch.no_grad()
def eval_model(model, data_eval) -> None:
    model.eval()
    acc_list = []
    for i, batch in enumerate(data_eval):
        labels, tokens, texts = batch['label'], batch['tokens'], batch['text']
        labels, tokens = labels.to(device), tokens.to(device)

        predict = model(tokens)
        predict = torch.argmax(predict, dim=1)
        acc = torch.mean((predict == labels).float()).item()
        acc_list.append(acc)
    acc_tensor = torch.tensor(acc_list)
    return round(torch.mean(acc_tensor).item(), 3)


def train_model(model, data_train, data_eval, epochs, optimizer, criterion) -> None:
    model.train()
    optimizer.zero_grad(set_to_none=True)
    loss_eval_list: list = []
    acc_eval = 0.0
    for epoch in range(epochs):
        torch.cuda.empty_cache()
        loss_epoch = 0.0
        batch_iterator = tqdm(data_train, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:
            labels, tokens, texts = batch['label'], batch['tokens'], batch['text']
            labels, tokens = labels.to(device), tokens.to(device)
            optimizer.zero_grad()
            outputs = model(tokens)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            acc = acc_metric(labels, outputs)
            
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}", "accuracy train": f"{acc:.3f}", "accuracy eval": f"{acc_eval:.3f}"})
            
        acc_eval = eval_model(model, data_eval)
        loss_eval_list.append(acc_eval)
    return loss_eval_list

In [None]:
loss_eval_list = train_model(model, train_loader, test_loader, epochs=EPOCHS, optimizer=optimizer, criterion=criterion)

Processing Epoch 04: 100%|█| 20313/20313 [12:47<00:00, 26.48it/s, loss=0.956, accuracy train=0.562, accuracy eval=0.559
Processing Epoch 07: 100%|█| 20313/20313 [12:46<00:00, 26.51it/s, loss=1.127, accuracy train=0.500, accuracy eval=0.572
Processing Epoch 10: 100%|█| 20313/20313 [12:47<00:00, 26.48it/s, loss=1.059, accuracy train=0.562, accuracy eval=0.576
Processing Epoch 15: 100%|█| 20313/20313 [2:24:55<00:00,  2.34it/s, loss=0.904, accuracy train=0.500, accuracy eval=0.5
Processing Epoch 16: 100%|█| 20313/20313 [13:11<00:00, 25.66it/s, loss=0.812, accuracy train=0.750, accuracy eval=0.583
Processing Epoch 18: 100%|█| 20313/20313 [13:22<00:00, 25.32it/s, loss=1.311, accuracy train=0.438, accuracy eval=0.585
Processing Epoch 19:  81%|▊| 16408/20313 [10:56<02:35, 25.05it/s, loss=0.916, accuracy train=0.625, accuracy eval=0.583

In [104]:
# Salvando o modelo
file_path = 'models/yepreview_model_.pth'
torch.save(model.state_dict(), file_path)

In [105]:
batch = next(iter(test_loader))
batch_labels, batch_tokens, batch_texts = batch['label'], batch['tokens'], batch['text']

In [109]:
pred = model(batch_tokens.to(device))

In [110]:
batch_labels

tensor([0, 4, 3, 3, 2, 0, 0, 3, 2, 4, 1, 4, 2, 0, 0, 3, 3, 0, 1, 1, 0, 2, 3, 1,
        0, 1, 3, 1, 2, 4, 0, 3])

In [111]:
torch.argmax(pred, dim=-1)

tensor([0, 4, 3, 3, 4, 0, 2, 4, 2, 4, 2, 4, 1, 0, 0, 3, 0, 0, 1, 1, 3, 2, 4, 2,
        1, 1, 3, 1, 2, 4, 0, 3], device='cuda:0')

In [114]:
acc_metric(batch_labels.to(device), pred)

0.6875