In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import math

In [4]:
from datasets import load_dataset
from tqdm import tqdm

In [5]:
from transformers import AutoTokenizer, BertTokenizer

In [6]:
from torch.utils.data import Dataset, DataLoader

In [7]:
from torch import Tensor

In [8]:
import numpy as np
import torch.nn as nn

In [9]:
import torch
from torchinfo import summary

In [10]:
# download dataset
dataset = load_dataset("AiresPucrs/google-play-apps-review-pt")

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
device

'cpu'

# Tokenizer

In [13]:
tokenizer = AutoTokenizer.from_pretrained("pierreguillou/bert-base-cased-squad-v1.1-portuguese")



In [14]:
texto = 'Esse texto que será tokenizado.'

In [15]:
tokens = tokenizer.tokenize(texto)
print(f'TOKENS BERT BASE PT-BT: {tokens}')

TOKENS BERT BASE PT-BT: ['Esse', 'texto', 'que', 'será', 'to', '##ken', '##izado', '.']


In [16]:
VOCAB_SIZE = tokenizer.vocab_size
VOCAB_SIZE

29794

In [17]:
token_ids = tokenizer.encode(texto, add_special_tokens=True)
print(f'TOKENS IDS: {token_ids}')

TOKENS IDS: [101, 3758, 4054, 179, 2810, 374, 8110, 2303, 119, 102]


# Adicionando normalizer ao tokenizer

In [18]:
from tokenizers.normalizers import NFKC, Lowercase, StripAccents, Sequence

In [19]:
custom_normalizer = Sequence([NFKC(), Lowercase(), StripAccents()])

In [20]:
tokenizer.backend_tokenizer.normalizer = custom_normalizer

In [21]:
text = "Isso é um TESTE com ACENTOS e caracteres especiais como: ç, ã, é!"
encoded = tokenizer(text)
print(f'TOKENS BERT BASE PT-BT: {tokens}')
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))

TOKENS BERT BASE PT-BT: ['Esse', 'texto', 'que', 'será', 'to', '##ken', '##izado', '.']
['[CLS]', 'isso', 'é', 'um', 'teste', 'com', 'ace', '##ntos', 'e', 'caracteres', 'especiais', 'como', ':', '[UNK]', ',', '[UNK]', ',', 'é', '!', '[SEP]']


In [22]:
encoded = tokenizer(text, padding='max_length', max_length=124, truncation=True)
print(encoded['input_ids'])

[101, 1257, 253, 222, 3515, 170, 10049, 850, 122, 12962, 4797, 271, 131, 100, 117, 100, 117, 253, 106, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# MODEL CONFIG

In [23]:
1e-4

0.0001

In [24]:
VOCAB_SIZE = tokenizer.vocab_size
SEQ_LEN = 256
D_MODEL = 64
N_LAYERS = 6
N_HEADS = 4
N_OUTPUT = 1
HIDDEN_SIZE = 512
DROPOUT = 0.1
BATCH_SIZE = 32
EPOCHS = 50
DROPOUT = 0.10
LR = 1e-3

# Dataset

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 20000
    })
})

In [26]:
# Calculating the average of texts
mean_words_text = np.mean(
    list(map(lambda x: len(x.split()), dataset["train"]['review']))
)

std_words_text = np.std(
    list(map(lambda x: len(x.split()), dataset["train"]['review']))
)
print(f'MEAN WORDS BY TEXT: {mean_words_text}')
print(f'STD WORDS BY TEXT: {std_words_text}')

MEAN WORDS BY TEXT: 63.9267
STD WORDS BY TEXT: 16.53660264715821


In [27]:
dataset['train'][11_000]['review']

'voces precisam cobrar dos entregadores que se comuniquem nao faz sentido o restaurante colocar o pedido como finalizado o entregador estar la e demorar mais de 30minutos para sair alguem passou a informacao errada ai vc pede esclarecimentos ao entregador via chat ele nao responde liga ele nao atende ai na entrega chega com cara de banda falando que o restaurante atrasou se foi isso pq nao respondeu quando foi questionado isso e muito frequente alias raridade e quando nao acontece'

In [28]:
dataset['train'][11_000]['sentiment']

0

In [29]:
print('NUM TOKENS WITHOUT TRUNCATION:', len(tokenizer(dataset['train'][0]['review'])['input_ids']))
print('NUM TOKENS WITH TRUCATION', len(
    tokenizer(
        dataset['train'][0]['review'],
        padding='max_length',
        max_length=124,
        truncation=True)
    ['input_ids'])
)

NUM TOKENS WITHOUT TRUNCATION: 236
NUM TOKENS WITH TRUCATION 124


In [30]:
dataset['train'] = dataset['train'].shuffle(seed=42)

In [31]:
torch.tensor(dataset['train'][0]['sentiment']).view(1, 1).to(torch.float32)

tensor([[0.]])

In [32]:
class GooglePlayAppsReviewClassifier(Dataset):
    def __init__(
        self,
        dataset,
        tokenizer = tokenizer,
        seq_len: int = SEQ_LEN,
        vocab_size: int = VOCAB_SIZE
    ):
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.vocab_size = vocab_size

    def __len__(self):
        return self.dataset['train'].num_rows

    def __getitem__(self, index):
        text = dataset['train'][index]['review']
        X_token = self.tokenizer(
            text,
            padding='max_length',
            max_length=self.seq_len,
            truncation=True
        )['input_ids']
        y = torch.tensor(dataset['train'][index]['sentiment'])
        y = y.view(1)
        return {
            'text': text,
            'tokens': torch.tensor(X_token),
            'label': y.to(torch.float32)
        }        

In [33]:
train_dataset = GooglePlayAppsReviewClassifier(dataset)

In [34]:
train_dataset[5]

{'text': 'parece que a cada atualizacao a frequencia dos anuncios aumentam e invadem cada vez mais os videos horrivel isso ha poucos anos atras nao ocorria essas publicidades invasivas e desnecessarias desanima qualquer um que esta na plataforma apreciando conteudo principalmente pelo smartphone e tira o foco total do video execucao e opcoes de qualidade de video interessantes mas deviam rever essa politica de publicidade pq ta invasiva demais',
 'tokens': tensor([  101,  4048,   179,   123,  1078,  2233,  2446,   304, 22280,   123,
          1864,  3292,   298,  2043,   128, 20958,   122,  5808,   210,  1078,
           576,   325,   259, 12456, 22281,  3428, 15558, 22290,  1257,   607,
          3885,   481,  7521,   229, 22280,  3719,   151,  3867, 10834, 22281,
          3819,  4521,   122, 15310,  1347, 14348,  1950,  4029,   148,  1569,
           222,   179,   418,   229,  6326,  2533,   351,   214,  1519,  1350,
          1953,   423,   139, 14254, 19894,   514,   122,  4551,  

In [35]:
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=BATCH_SIZE)

# MODEL

In [36]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int = D_MODEL, dropout: float = 0.01, seq_len: int = SEQ_LEN):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(SEQ_LEN, D_MODEL)
        k = torch.arange(0, seq_len).unsqueeze(1) 
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10_000) / d_model))
        # sine for even indices
        pe[:, 0::2] = torch.sin(k * div_term)
        # cos for odd indices
        pe[:, 1::2] = torch.cos(k * div_term)
        # add batch dim
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)

    def forward(self, x: Tensor):
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [37]:
class GooglePlayAppsReviewModel(nn.Module):
    def __init__(
        self,
        seq_len: int = SEQ_LEN,
        d_model: int = D_MODEL,
        vocab_size: int = VOCAB_SIZE,
        num_heads: int = N_HEADS,
        nx: int = N_LAYERS,
        n_outputs: int = N_OUTPUT,
        dropout: float = DROPOUT,
    ):
        super().__init__()
        self.seq_len = seq_len
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.nx = nx
        self.dropout = dropout
        self.n_outputs = n_outputs

        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.d_model,
            padding_idx=0
        )

        self.positional_encoding = PositionalEncoding(
            d_model=self.d_model,
            dropout=self.dropout,
            seq_len=self.seq_len
        )

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.d_model,
            nhead=self.num_heads,
            dropout=self.dropout,
            norm_first=True,
            batch_first=True,
            activation='gelu'
        )
        self.encoder_block = nn.TransformerEncoder(
            self.encoder_layer,
            num_layers=self.nx
        )

        self.linear = nn.Sequential(
            nn.Linear(self.d_model, self.d_model * 2),
            nn.GELU(),
            nn.Linear(self.d_model * 2, self.d_model),
            nn.GELU(),
            nn.Linear(self.d_model, self.d_model),
            nn.GELU(),
            nn.Linear(self.d_model, self.d_model // 4),
            nn.GELU(),
            nn.Linear(self.d_model // 4, self.d_model // 8),
            nn.GELU(),
            nn.Linear(self.d_model // 8, self.d_model // 16)
        )

        self.output = nn.Sequential(
            nn.Linear(self.d_model // 16, self.d_model // 32),
            nn.GELU(),
            nn.Linear(self.d_model // 32, self.d_model // self.d_model),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.positional_encoding(x)
        x = self.encoder_block(x)
        # Pegando a representação vetorial do token <CLS>
        x = x[:, 0, :]

        x = self.linear(x)
        x = self.output(x)
        return x

In [52]:
model = GooglePlayAppsReviewModel().to(device)

In [53]:
summary(model)

Layer (type:depth-idx)                                            Param #
GooglePlayAppsReviewModel                                         --
├─Embedding: 1-1                                                  1,906,816
├─PositionalEncoding: 1-2                                         --
│    └─Dropout: 2-1                                               --
├─TransformerEncoderLayer: 1-3                                    --
│    └─MultiheadAttention: 2-2                                    12,480
│    │    └─NonDynamicallyQuantizableLinear: 3-1                  4,160
│    └─Linear: 2-3                                                133,120
│    └─Dropout: 2-4                                               --
│    └─Linear: 2-5                                                131,136
│    └─LayerNorm: 2-6                                             128
│    └─LayerNorm: 2-7                                             128
│    └─Dropout: 2-8                                               --
│  

In [54]:
batch = next(iter(train_loader))
batch_labels, batch_tokens, batch_texts = batch['label'], batch['tokens'], batch['text']

In [55]:
batch_tokens

tensor([[  101,   146,   305,  ...,     0,     0,     0],
        [  101,   146,   305,  ...,     0,     0,     0],
        [  101, 12044,   311,  ...,     0,     0,     0],
        ...,
        [  101, 20933,   678,  ...,     0,     0,     0],
        [  101,   146,   305,  ...,     0,     0,     0],
        [  101,  8766,   185,  ...,     0,     0,     0]])

In [56]:
batch_tokens[1, :]

tensor([  101,   146,   305, 22291,   495,  4062,  2535,   418,  7264,  1415,
         2394,   368, 17105,   348,   123,  7179,   202,  7343,   237, 22336,
         9570, 22281,   700,   180,   169,  8388,  2233,  2446,   304, 22280,
          229, 22280,  7674, 18691,   926,  8977,   538,  4395,   553,   122,
          229, 22280, 21174,  1941,  1114,   244,   125,   925,  2421, 22281,
         1176,   240,  1966,  3350,  1941,   418,   229,  5314,   125,  8416,
         1257,   229, 22280,  7093, 22287,   625, 13956, 19508,  5698,   244,
         7122, 11175,   304, 22280, 12044,  1004, 17497, 12966,   170, 14730,
         2394,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [57]:
batch_labels[1].shape

torch.Size([1])

In [58]:
X = batch_tokens[1, :].unsqueeze(0).to(device)
y = batch_labels[1].to(device)

In [59]:
model(X).shape

torch.Size([1, 1])

In [60]:
y.shape

torch.Size([1])

In [61]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.BCELoss()

In [64]:
criterion(model(X), y.unsqueeze(0))

tensor(0.5114, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)

In [66]:
model.train()
ITERACOES = 1_000_000
iterator = tqdm(range(ITERACOES))
for _ in iterator:
    y_hat = model(X)
    
    loss = criterion(y_hat, y.unsqueeze(0))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
    if loss.item() <= 0.01:
        break

  0%|          | 21/1000000 [00:00<5:58:32, 46.48it/s, loss=0.004]


# Trienando em um batch

In [67]:
X = batch_tokens
y = batch_labels

In [72]:
X, y = X.to(device), y.to(device)

In [73]:
model.train()
ITERACOES = 1_000_000
iterator = tqdm(range(ITERACOES))
for _ in iterator:
    y_hat = model(X)
    loss = criterion(y_hat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
    if loss.item() <= 0.1:
        break

  0%|          | 177/1000000 [00:11<18:09:49, 15.29it/s, loss=0.096]


In [75]:
torch.where(y_hat > 0.5, torch.tensor(1.0), torch.tensor(0.0)).squeeze()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1.],
       device='cuda:0')

In [76]:
y.squeeze()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1.],
       device='cuda:0')

In [80]:
model.train()
optimizer.zero_grad(set_to_none=True)
loss_eval_list: list = []
for epoch in range(EPOCHS):
    torch.cuda.empty_cache()
    loss_epoch = 0.0
    batch_iterator = tqdm(train_loader, desc=f"Processing Epoch {epoch:02d}")
    for batch in batch_iterator:
        labels, tokens, texts = batch['label'], batch['tokens'], batch['text']
        labels, tokens = labels.to(device), tokens.to(device)
        optimizer.zero_grad()
        outputs = model(tokens)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

Processing Epoch 00: 100%|██████████| 625/625 [00:52<00:00, 11.87it/s, loss=0.501]
Processing Epoch 01: 100%|██████████| 625/625 [00:52<00:00, 11.84it/s, loss=0.445]
Processing Epoch 02: 100%|██████████| 625/625 [00:52<00:00, 11.86it/s, loss=0.388]
Processing Epoch 03: 100%|██████████| 625/625 [00:52<00:00, 11.84it/s, loss=0.435]
Processing Epoch 04: 100%|██████████| 625/625 [00:52<00:00, 11.84it/s, loss=0.486]
Processing Epoch 05: 100%|██████████| 625/625 [00:52<00:00, 11.85it/s, loss=0.405]
Processing Epoch 06: 100%|██████████| 625/625 [00:52<00:00, 11.84it/s, loss=0.412]
Processing Epoch 07: 100%|██████████| 625/625 [00:52<00:00, 11.85it/s, loss=0.451]
Processing Epoch 08: 100%|██████████| 625/625 [00:52<00:00, 11.85it/s, loss=0.320]
Processing Epoch 09: 100%|██████████| 625/625 [00:52<00:00, 11.85it/s, loss=0.352]
Processing Epoch 10: 100%|██████████| 625/625 [00:52<00:00, 11.87it/s, loss=0.338]
Processing Epoch 11: 100%|██████████| 625/625 [00:52<00:00, 11.85it/s, loss=0.301]
Proc

In [81]:
from pathlib import Path
model_file_path = Path('model/google_play_apps_review_model.pth')
torch.save(model.state_dict(), model_file_path)

In [83]:
torch.where(y_hat > 0.5, torch.tensor(1.0), torch.tensor(0.0)).squeeze()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1.],
       device='cuda:0')

In [84]:
y.squeeze()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1.],
       device='cuda:0')

In [38]:
model_inference = GooglePlayAppsReviewModel()



In [46]:
from pathlib import Path
model_file_path = Path('model/google_play_apps_review_model.pth')
model_inference.load_state_dict(torch.load(model_file_path, map_location=torch.device(device)))
model_inference.eval()  # Coloca o modelo em modo de avaliação

GooglePlayAppsReviewModel(
  (embedding_layer): Embedding(29794, 64, padding_idx=0)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (linear1): Linear(in_features=64, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=64, bias=True)
    (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder_block): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=6

In [48]:
negative_review = """
depois de um problema com um pedido que veio incorreto, o distribuidor culpou o app por nao disponibilizar os itens oferecidos, 
e por isso acabei recebendo menos do que paguei. apos essa resposta, fiz uma avaliacao negativa e, logo depois, recebi uma
ligacao do distribuidor, que supostamente seria para resolver o problema, mas acabou afirmando que nao gostou da avaliacao e que eu 
estava agindo de ma fe. acabei ficando no prejuizo e ainda fui tratado com hostilidade. por isso, nao recomendo o app. tambem tentei 
contato por email, mas a mensagem foi rejeitada.
"""

In [49]:
positive_review = """
depois de um pedido que veio corretamente, o distribuidor elogiou o app por fornecer todos os itens oferecidos, e por isso recebi 
exatamente o que paguei. apos essa resposta, fiz uma avaliacao positiva e, logo depois, recebi uma ligacao do distribuidor, que foi
para agradecer pela avaliacao e reforcar que tudo estava certo. fiquei satisfeito com o atendimento e fui tratado com muita cordialidade.
por isso, recomendo o app. tambem tentei contato por email, e a mensagem foi recebida sem problemas."""

In [50]:
text_tokenized = tokenizer(
            negative_review,
            padding='max_length',
            max_length=SEQ_LEN,
            truncation=True
        )['input_ids']

In [51]:
model_inference(torch.tensor(text_tokenized).unsqueeze(0))

tensor([[0.0023]], grad_fn=<SigmoidBackward0>)

In [52]:
text_tokenized = tokenizer(
            positive_review,
            padding='max_length',
            max_length=SEQ_LEN,
            truncation=True
        )['input_ids']

In [53]:
model_inference(torch.tensor(text_tokenized).unsqueeze(0))

tensor([[0.9999]], grad_fn=<SigmoidBackward0>)