In [1]:
from pathlib import Path
from enum import Enum
from tqdm import tqdm
import math

In [2]:
from datasets import load_dataset

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset 

In [4]:
from torchinfo import summary

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
device

device(type='cuda')

In [7]:
dataset = load_dataset("yelp_review_full")

# Acessar os dados
train_data = dataset["train"]
test_data = dataset["test"]

# Exibir informações sobre o conjunto de dados
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


# Configs

In [8]:
class SpecialTokens(Enum):
    CLS = 2
    PAD = 0
    UNK = 1

special_tokens = SpecialTokens

In [9]:
SEQ_LEN = 124
D_MODEL = 16
N_HEADS = 4
Nx = 2
N_OUTPUT = 5
VOCAB_SIZE = 15_000
LR = 1e-5
BATCH_SIZE_TRAIN = 2
BATCH_SIZE_TEST = 32
EPOCHS = 15

# Tokenizer

In [10]:
def text_iterator(data):
    for text in data['text']:
        yield text.lower()

In [11]:
def word_level_tokenizer(data, path_tokenizer: Path = Path('tokenizer/tokenizer.json')):
    if not Path.exists(path_tokenizer):
        tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
        tokenizer.pre_tokenizer = Whitespace()

        # Definindo o PAD token como o primeiro na lista de tokens especiais
        special_tokens = ["<PAD>", "<UNK>", "<CLS>"]

        trainer = BpeTrainer(special_tokens=special_tokens, min_frequency=5, vocab_size=VOCAB_SIZE)
        tokenizer.train_from_iterator(text_iterator(data), trainer=trainer)
        tokenizer.save(str(path_tokenizer))
    else:
        tokenizer = Tokenizer.from_file(str(path_tokenizer))

    return tokenizer

In [12]:
tokenizer = word_level_tokenizer(train_data)

In [13]:
print(f'VOCAB SIZE: {tokenizer.get_vocab_size()}')

VOCAB SIZE: 15000


In [14]:
tokenizer.encode('<PAD>').ids

[0]

# Dataset

In [15]:
tokenizer.encode('<CLS>').ids

[2]

In [16]:
from typing import Tuple, List

class YelpReviewFullDataset(Dataset):
    def __init__(self, data, tokenizer: Tokenizer = tokenizer, seq_len: int = SEQ_LEN) -> None:
        self.data = data
        self.tokenizer = tokenizer
        self.CLS_token_id = 2
        self.PAD_token_id = 0
        self.seq_len = seq_len

    def __len__(self) -> int:
        return self.data.num_rows
        
    def __getitem__(self, id_i) -> Tuple[int, List[int]]:
        item = self.data[id_i]
        label, text = item['label'], item['text'].lower()
        tokens_list = [self.CLS_token_id] + self.tokenizer.encode(text).ids
        tokens_list = self.truncate_seq(tokens_list)
        return {'label': label, 'tokens': torch.tensor(tokens_list), 'text': text}

    def truncate_seq(self, tokens_list: List[int]):
        len_token_list = len(tokens_list)
        if len_token_list > self.seq_len:
            return tokens_list[: self.seq_len]
        elif len_token_list < self.seq_len:
            return tokens_list + [self.PAD_token_id] * (self.seq_len - len_token_list)
        return tokens_list

In [17]:
train_dataset = YelpReviewFullDataset(train_data)
test_dataset = YelpReviewFullDataset(test_data)

In [18]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE_TRAIN)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE_TEST)

# Model

In [19]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model))
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)
        return self.dropout(x)

In [20]:
class YepReviewModel(nn.Module):
    def __init__(
        self,
        seq_len: int = SEQ_LEN,
        d_model: int = D_MODEL,
        vocab_size: int = VOCAB_SIZE,
        num_heads: int = N_HEADS,
        n_x: int = Nx,
        dropout: float = 0.05,
        n_outputs: int = N_OUTPUT
    ):
        super().__init__()
        # configurações do modelo
        self.seq_len = seq_len
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.n_x = n_x
        self.dropout = dropout
        self.n_outputs = n_outputs

        # componentes
        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size, 
            embedding_dim=self.d_model, 
            padding_idx=SpecialTokens.PAD.value)
        self.pos_encodding = PositionalEncoding(self.d_model, self.seq_len, self.dropout)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=self.d_model, nhead=self.num_heads)
        self.encoder_block = nn.TransformerEncoder(self.encoder_layer, num_layers=self.n_x, )
        self.linear_layer = nn.Sequential(
            nn.Linear(self.d_model, 512),
            nn.ReLU(),
            nn.Linear(512, 124),
            nn.ReLU(),
            nn.Linear(124, 32),
            nn.ReLU(),
            nn.Linear(32, 10),
            nn.ReLU(),
            nn.Linear(10, 8)
        )
        self.output_layer = nn.Linear(8, self.n_outputs)
        # self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.pos_encodding(x)
        x = self.encoder_layer(x)
        # Pegando a representação vetorial do token <CLS>
        x = x[:, 0, :]
        x = self.linear_layer(x)
        x = self.output_layer(x)
        return x

In [21]:
model = YepReviewModel().to(device)

In [22]:
summary(model)

Layer (type:depth-idx)                                            Param #
YepReviewModel                                                    --
├─Embedding: 1-1                                                  240,000
├─PositionalEncoding: 1-2                                         --
│    └─Dropout: 2-1                                               --
├─TransformerEncoderLayer: 1-3                                    --
│    └─MultiheadAttention: 2-2                                    816
│    │    └─NonDynamicallyQuantizableLinear: 3-1                  272
│    └─Linear: 2-3                                                34,816
│    └─Dropout: 2-4                                               --
│    └─Linear: 2-5                                                32,784
│    └─LayerNorm: 2-6                                             32
│    └─LayerNorm: 2-7                                             32
│    └─Dropout: 2-8                                               --
│    └─Dropout

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [24]:
def acc_metric(label, output):
    output = torch.argmax(output, dim=1)
    return (label == output).float().mean().detach().item()

In [25]:
next(iter(train_loader))

{'label': tensor([3, 2]),
 'tokens': tensor([[    2,    70,   496,    93,   244,    14,   402,   230,  1319,   136,
           1244,    96,  1000,    16,    70,   298,  3106,   262,    82,   119,
            159,   468,    86,   443,   569,   117,  1484,   879,   616,    16,
             80,    70,   241,   893,    96,    82,   145,    16,    47,  1191,
            180,    93,   293,    79,  2534,   128,   336,    39,  6540,   833,
             16,    70,  2695,   159,   621,    14,  1096,    70,  2897,    16,
           1082,   123,   135,   159,   668,    16,   304,    70,  1604,  3842,
             93,   621,    16,   135,   151,    39,  1341,  1775,  1922,    16,
             70,   446,   545,    93,   274,   179,    14,   110,    93,    70,
           2080,    16,  1177,   564,    70,   402,   230,   146,   753,  1999,
             70,   311,   106,    70,   429,   346,   616,    16,  1115,    70,
            242,    93,   244,    14,    70,  1179,  1453,    93,   274,   782,
    

In [26]:
batch = next(iter(train_loader))
batch_labels, batch_tokens, batch_texts = batch['label'], batch['tokens'], batch['text']

In [27]:
print('#### TEXTO ####')
print(batch_texts[0])
print(f'## Label: {batch_labels[0]} ##')

#### TEXTO ####
eh cuz, i like make da kine public service announcement first for all da buggahs stay writin' reviews for anykine hawaiian place. no such thing as kahlua pig. we no put da kine coffee liqueur and shit on da pig, bra. also, no moa kailua pig either, bra. you buy pig or pork in kailua and bring em' back to honolulu and you goin' be takin' pig/pork on da kine, pali highway. first thing goin' happen is yo' car goin' stall right on da highway, bra. next da night marchers goin' carjack your car. then madam pele and da limu lady - two of da baddest tittas on da island goin' take you to morgan's corner and bang yo' head on the 13 steps, bra. by da time they done with you, you goin' forget about evah callin' it kailua pig. it's kalua, bra. ok bruddah. enough of the public service announcements. dis place stay in one mall next to one barnes and nobles. i went come here plenty times. one pake family stay run this place and they do one pretty good job of making island kine plate lu

In [28]:
batch_tokens[0, :]

tensor([    2,  3921,  4511,    14,    47,   199,   460,  1384,    49,   323,
         3498,   242, 11257,   374,   402,   106,   145,  1384,   108,   638,
         1194,    57,   582,   576,    82,    71,     9,   964,   106,   214,
           49,   323,  4131,   183,    16,   182,  1041,   188,    76, 13377,
          324,    39,  4226,    16,    94,   182,   715,  1384,    49,   323,
          862, 13859,   220,    80,  2607,    78,  1384,  4226,    14,  3859,
           16,   338,    14,   182,   154,    39,    49,  1057,  8904,  4226,
         1027,    14,  3859,    16,   117,  1088,  4226,    83,  1051,    71,
           49,  1057,  8904,    80,  1048,   834,     9,   257,    79,  3260,
         3736,  9318,    80,   117,   177,    71,     9,    99,   270,  2924,
            9,  4226,    17,  1051,    78,  1384,    49,   323,    14,  1593,
           47,  8700,    16,   402,   188,   177,    71,     9,  1134,    86,
         1714,     9,   457,   177])

In [29]:
batch_labels[0].unsqueeze(0)

tensor([3])

# Treinando em uma Amostra
Isso ajuda a ver se o modelo está convergindo

In [30]:
model.train()
ITERACOES = 1_000_000
iterator = tqdm(range(ITERACOES))
X = batch_tokens[0, :].unsqueeze(0).to(device)
y = batch_labels[0].unsqueeze(0).to(device)
for _ in iterator:
    y_hat = model(X)
    loss = criterion(y_hat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    acc = acc_metric(y, y_hat)
    iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
    if loss.item() <= 0.2:
        break

  0%|                                                             | 1189/1000000 [00:26<6:08:01, 45.23it/s, loss=0.197]


In [50]:
print(f'LABEL: {y.item()}')
print(f'PREDICT: {torch.argmax(y_hat)}')
print(f'PREDICT PROBA: {F.softmax(y_hat, dim=-1).tolist()}')

LABEL: 3
PREDICT: 3
PREDICT PROBA: [[0.07483470439910889, 0.0650244951248169, 0.006782560609281063, 0.8213327527046204, 0.0320255346596241]]


# Treinando em um Batch

In [None]:
model.train()
ITERACOES = 1_000_000
iterator = tqdm(range(ITERACOES))
X = batch_tokens.to(device)
y = batch_labels.to(device)
for _ in iterator:
    y_hat = model(X)
    loss = criterion(y_hat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    acc = acc_metric(y, y_hat)
    iterator.set_postfix({"loss": f"{loss.item():6.3f}", "accuracy": f"{acc:.3f}"})
    if loss.item() <= 0.35:
        break

  4%|█▋                                         | 40080/1000000 [06:15<2:16:01, 117.62it/s, loss=0.697, accuracy=0.000]

# Treinando para o conjunto de dados inteiro

In [39]:
@torch.no_grad()
def eval_model(model, data_eval) -> None:
    model.eval()
    acc_list = []
    for i, batch in enumerate(data_eval):
        labels, tokens, texts = batch['label'], batch['tokens'], batch['text']
        tokens = torch.stack(tokens, dim=1)
        labels, tokens = labels.to(device), tokens.to(device)

        predict = model(tokens)
        predict = torch.argmax(predict, dim=1)
        acc = torch.mean((predict == labels).float()).item()
        acc_list.append(acc)
    acc_tensor = torch.tensor(acc_list)
    return round(torch.mean(acc_tensor).item(), 3)


def train_model(model, data_train, data_eval, epochs, optimizer, criterion) -> None:
    model.train()
    optimizer.zero_grad(set_to_none=True)
    loss_eval_list: list = []
    for epoch in range(epochs):
        torch.cuda.empty_cache()
        loss_epoch = 0.0
        batch_iterator = tqdm(data_train, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:
            labels, tokens, texts = batch['label'], batch['tokens'], batch['text']
            tokens = torch.stack(tokens, dim=1)
            labels, tokens = labels.to(device), tokens.to(device)
            optimizer.zero_grad()
            outputs = model(tokens)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
            
        acc_eval = eval_model(model, data_eval)
        loss_eval_list.append(acc_eval)
    return loss_eval_list

In [None]:
train_model(model, train_loader, test_loader, epochs=EPOCHS, optimizer=optimizer, criterion=criterion)

Processing Epoch 00: 100%|███████████████████████████████████████████| 20313/20313 [11:30<00:00, 29.41it/s, loss=1.615]
Processing Epoch 01: 100%|███████████████████████████████████████████| 20313/20313 [10:50<00:00, 31.24it/s, loss=1.610]
Processing Epoch 02: 100%|███████████████████████████████████████████| 20313/20313 [10:54<00:00, 31.05it/s, loss=1.610]
Processing Epoch 03:  23%|█████████▉                                  | 4578/20313 [02:27<08:29, 30.90it/s, loss=1.612]

In [189]:
file_path = 'models/model_v0.pth'

# Salvando o modelo
torch.save(model.state_dict(), file_path)