## Criação de um modelo que avalia se um texto é positivo, neutro ou negativo

In [None]:
!pip install -qq transformers

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW

Utilização do BERT, modelo de representação de linguagem desenvolvido por pesquisadores do Google AI

In [None]:
# Identifier of the pretrained checkpoint; the same name is used later to load the BertModel weights.
PRE_TRAINED_MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'

# Load the tokenizer that belongs to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
# Seed NumPy's and PyTorch's random number generators; RANDOM_SEED is also passed as random_state to the data splits below.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7bf5591fc450>

Criação de uma classe (`Dataset`) que faz a tokenização dos textos

In [None]:
class GPReviewDataset(Dataset):
  """Dataset pairing each review text with its label, tokenizing on access.

  Each item is encoded with the tokenizer given at construction time and
  padded or truncated to ``max_len`` tokens.
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    """Keep references to the data and the tokenization settings.

    Args:
      reviews: Indexable collection of review texts.
      targets: Indexable collection of labels, aligned with ``reviews``.
      tokenizer: Object exposing an ``encode_plus`` method.
      max_len: Length every encoded review is padded or truncated to.
    """
    self.reviews, self.targets = reviews, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Return the number of reviews held by the dataset."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Encode the review at position ``item``.

    Returns:
      A dict with the review as a string (``review_text``), the flattened
      ``input_ids`` and ``attention_mask`` tensors produced by the
      tokenizer, and the label as a long tensor (``targets``).
    """
    text = str(self.reviews[item])
    label = self.targets[item]

    # Options forwarded verbatim to the tokenizer.
    encode_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(text, **encode_options)

    sample = {'review_text': text}
    # The tokenizer returns tensors with a leading batch axis; flatten it away.
    for key in ('input_ids', 'attention_mask'):
      sample[key] = encoded[key].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

In [None]:
# Mount Google Drive at /content/drive; the CSV files read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Importando os dados que serão usados para treinar o modelo

In [None]:
import pandas as pd
# Labelled reviews used for training; the `content` and `sentiment` columns are read from this frame further down.
df = pd.read_csv("/content/drive/MyDrive/reviews.csv")

Divisão dos dados em conjuntos de treino, validação e teste

In [None]:
# Hold out 10% of the rows, then cut that hold-out in half: roughly 90% train, 5% validation, 5% test.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
# df_test is rebound here: the first hold-out is split into the validation set and the final test set.
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a reviews DataFrame in a GPReviewDataset and return a DataLoader over it.

  Args:
    df: DataFrame with a ``content`` column (review text) and a
      ``sentiment`` column (label).
    tokenizer: Tokenizer handed to the dataset.
    max_len: Length every encoded review is padded or truncated to.
    batch_size: Number of samples per batch.
    shuffle: Whether the loader reshuffles the samples every epoch. Defaults
      to False, which is what DataLoader does when the argument is omitted;
      pass True for a training loader.
    num_workers: Number of worker subprocesses used to load data (default 4).

  Returns:
    A ``torch.utils.data.DataLoader`` yielding the dicts produced by the dataset.
  """
  ds = GPReviewDataset(
    reviews=df.content.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# Samples per batch, and the token length every review is padded or truncated to.
BATCH_SIZE = 16
MAX_LEN = 210

# NOTE(review): the training loader is built without shuffling, so batches arrive in the same order every epoch — confirm this is intended.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)



Criação de uma classe para fazer a classificação em Positivo, Negativo ou Neutro

In [None]:
import torch.nn as nn

class SentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear scoring layer."""

  def __init__(self, n_classes):
    """Build the encoder and the classification head.

    Args:
      n_classes: Number of classes, i.e. the width of the output layer.
    """
    super().__init__()
    # return_dict=False makes the encoder return a tuple, which forward() unpacks.
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
    self.drop = nn.Dropout(p=0.3)
    # The head is fed the encoder's pooled output, whose width is the encoder's hidden size.
    encoder_width = self.bert.config.hidden_size
    self.out = nn.Linear(encoder_width, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the unnormalized class scores for a batch of encoded reviews.

    Args:
      input_ids: Token ids passed to the encoder.
      attention_mask: Attention mask passed to the encoder.
    """
    encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Only the second element (the pooled output) is used; the first is discarded.
    _, pooled_output = encoder_outputs
    return self.out(self.drop(pooled_output))

In [None]:
# Label names, listed in the index order used to interpret the classifier's outputs.
class_names = ['negative', 'neutral', 'positive']
# Use the GPU when one is available; otherwise fall back to the CPU so the notebook still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the classifier with one output per label and move it to the chosen device.
model = SentimentClassifier(len(class_names))
model = model.to(device)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Treinando e testando o Modelo

In [None]:
# Number of passes over the training loader.
EPOCHS = 10

optimizer = AdamW(model.parameters(), lr=3e-5)
# The scheduler is stepped once per training batch, so the schedule spans (batches per epoch) * EPOCHS steps.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Train ``model`` for one pass over ``data_loader``.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches; each batch is a dict holding the
      ``input_ids``, ``attention_mask`` and ``targets`` tensors.
    loss_fn: Callable computing the loss from (outputs, targets).
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Denominator used to turn the correct count into an accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss)``: accuracy is a double tensor, mean_loss
    is the NumPy mean of the per-batch loss values.
  """
  model.train()

  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    input_ids, attention_mask, targets = (
      batch[key].to(device) for key in ("input_ids", "attention_mask", "targets")
    )

    logits = model(input_ids=input_ids, attention_mask=attention_mask)

    # The predicted class is the position of the largest score in each row.
    predicted = torch.max(logits, dim=1).indices
    batch_loss = loss_fn(logits, targets)

    n_correct += (predicted == targets).sum()
    batch_losses.append(batch_loss.item())

    batch_loss.backward()
    # Cap the gradient norm at 1.0 before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Measure accuracy and mean loss of ``model`` over ``data_loader`` without training.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches; each batch is a dict holding the
      ``input_ids``, ``attention_mask`` and ``targets`` tensors.
    loss_fn: Callable computing the loss from (outputs, targets).
    device: Device the batch tensors are moved to.
    n_examples: Denominator used to turn the correct count into an accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss)``: accuracy is a double tensor, mean_loss
    is the NumPy mean of the per-batch loss values.
  """
  model.eval()

  batch_losses = []
  n_correct = 0

  # Gradients are not needed for evaluation.
  with torch.no_grad():
    for batch in data_loader:
      input_ids, attention_mask, targets = (
        batch[key].to(device) for key in ("input_ids", "attention_mask", "targets")
      )

      logits = model(input_ids=input_ids, attention_mask=attention_mask)
      # The predicted class is the position of the largest score in each row.
      predicted = torch.max(logits, dim=1).indices

      batch_loss = loss_fn(logits, targets)

      n_correct += (predicted == targets).sum()
      batch_losses.append(batch_loss.item())

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
%%time
from collections import defaultdict

# Per-epoch metrics, keyed by metric name; each value is a list with one entry per epoch.
history = defaultdict(list)
# Best validation accuracy seen so far; decides when the checkpoint below is rewritten.
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimization pass over the training data.
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the updated model on the validation split.
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Overwrite the checkpoint only when validation accuracy strictly improves.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/10
----------
Train loss 0.761186646145803 accuracy 0.6520833333333333
Val   loss 0.6798300926884016 accuracy 0.703125

Epoch 2/10
----------
Train loss 0.5777263700755106 accuracy 0.7613425925925926
Val   loss 0.7426426872611046 accuracy 0.6979166666666666

Epoch 3/10
----------
Train loss 0.3974250577873102 accuracy 0.8564236111111111
Val   loss 0.846735502841572 accuracy 0.7083333333333334

Epoch 4/10
----------
Train loss 0.2898157630633149 accuracy 0.905613425925926
Val   loss 1.0440448999094467 accuracy 0.7104166666666667

Epoch 5/10
----------
Train loss 0.22247669365692505 accuracy 0.9371527777777778
Val   loss 1.2270021175344785 accuracy 0.7104166666666667

Epoch 6/10
----------
Train loss 0.168484077099452 accuracy 0.9545717592592593
Val   loss 1.456731877864028 accuracy 0.7333333333333333

Epoch 7/10
----------
Train loss 0.12647533646323372 accuracy 0.968113425925926
Val   loss 1.6100307789398358 accuracy 0.7322916666666667

Epoch 8/10
----------
Train loss 0.089057

Salvando o modelo caso caia a sessão no Google Colab

In [None]:
# NOTE(review): this writes the weights currently held by `model`, not the best-validation checkpoint stored in 'best_model_state.bin' — confirm which one should be kept.
torch.save(model.state_dict(), '/content/drive/MyDrive/sentiment_model_weights.pth')

## Testando o modelo

In [None]:
# Hand-written sample reviews used for a quick manual check of the model.
# The first literal uses a backslash line continuation inside the string, so the leading spaces of its second line are part of the text.
tests = ["Eu gostaria de fazer uma avaliação sincera aqui e ver se o modelo vai identificar. Mas esse é só um teste, \
          então nao consigo dizer se gosto ou nao gosto. Pode ser bom ou pode ser ruim :):(:",

        "Esse aplicativo é muito louco, bem complicado de mexer",

         "Na moral, tá pra nascer app melhor que o Zé delivery!. Bebida sempre gelada e em minutos.",

         "Esse negócio é bem ruim!",

         "Odiei!"
         ]

In [None]:
import torch.nn.functional as F

# Inference only: switch dropout off explicitly instead of relying on the mode left by an earlier cell.
model.eval()

# No gradients are needed here, so skip autograd bookkeeping.
with torch.no_grad():
  for test in tests:
    encoded_review = tokenizer.encode_plus(
      test,
      max_length=MAX_LEN,
      add_special_tokens=True,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    output = model(input_ids, attention_mask)
    # Turn the raw scores into probabilities over the classes.
    probs = F.softmax(output, dim=1)

    print(f'Review text: {test}')
    # One probability per label, indexed by class name.
    print(pd.DataFrame(probs.tolist()[0], class_names)[0])
    print("========================\n")

Review text: Eu gostaria de fazer uma avaliação sincera aqui e ver se o modelo vai identificar. Mas esse é só um teste,           então nao consigo dizer se gosto ou nao gosto. Pode ser bom ou pode ser ruim :):(:
negative    0.000224
neutral     0.000622
positive    0.999154
Name: 0, dtype: float64

Review text: Esse aplicativo é muito louco, bem complicado de mexer
negative    0.000340
neutral     0.999542
positive    0.000118
Name: 0, dtype: float64

Review text: Na moral, tá pra nascer app melhor que o Zé delivery!. Bebida sempre gelada e em minutos.
negative    0.000098
neutral     0.000045
positive    0.999857
Name: 0, dtype: float64

Review text: Esse negócio é bem ruim!
negative    0.000480
neutral     0.999272
positive    0.000248
Name: 0, dtype: float64

Review text: Odiei!
negative    0.736540
neutral     0.005914
positive    0.257546
Name: 0, dtype: float64



## Usando o modelo no dataset do olist

In [None]:
# Olist order-reviews table that the trained classifier is applied to below.
main_df = pd.read_csv("/content/drive/MyDrive/olist_order_reviews_dataset.csv")

Retorna uma lista de rótulos ('negative', 'positive') para uma lista de textos.
Neutral será considerado negativo.

In [None]:
def predict_sentiment_batch(texts, model, tokenizer, device, max_len=210, batch_size=64):
    """Label each text as 'positive' or 'negative', running the model in mini-batches.

    The classifier's 'neutral' class is reported as 'negative'.

    Args:
        texts: Iterable of strings. A single string is treated as one text.
        model: Callable ``model(input_ids, attention_mask)`` returning one row
            of class scores per text.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors for a list of texts.
        device: Device the encoded tensors are moved to.
        max_len: Length every encoded text is padded or truncated to.
        batch_size: Number of texts sent through the model at once.

    Returns:
        list[str]: one label per input text, in input order (plain Python
        strings, not NumPy string scalars).
    """
    # Index of 'positive' in the notebook's class order ['negative', 'neutral', 'positive'].
    positive_index = 2

    # Materialize the input so it can be sliced; keep a lone string as a single text
    # instead of slicing it into chunks of characters.
    texts = [texts] if isinstance(texts, str) else list(texts)

    model.eval()
    all_preds = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded = tokenizer(batch_texts,
                                max_length=max_len,
                                padding='max_length',
                                truncation=True,
                                return_tensors='pt')
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            # Softmax is monotonic, so the argmax of the raw scores is already the predicted class.
            class_ids = torch.argmax(outputs, dim=1).tolist()

            # Treat neutral as negative: anything that is not 'positive' maps to 'negative'.
            all_preds.extend(
                'positive' if class_id == positive_index else 'negative'
                for class_id in class_ids
            )

    return all_preds

In [None]:
# Start with an empty label column; the cells below fill it in stages and use isna() to find rows that are still unlabelled.
main_df['sentiment'] = None

Utilizando batches para acelerar o processo de avaliação usando o modelo

In [None]:
import math

def predict_sentiment_in_batches(texts, model, tokenizer, device, batch_size=64, max_len=210):
    """Label every text as 'positive' or 'negative', processing them in batches.

    predict_sentiment_batch already walks its input in mini-batches, so this
    function delegates to it and forwards ``batch_size``. The value passed
    here is therefore the size actually sent through the model, rather than
    only the size of an outer slice.

    Args:
        texts: List of strings to classify.
        model: Classifier passed through to predict_sentiment_batch.
        tokenizer: Tokenizer passed through to predict_sentiment_batch.
        device: Device passed through to predict_sentiment_batch.
        batch_size: Number of texts sent through the model at once.
        max_len: Length every encoded text is padded or truncated to.

    Returns:
        List with one label per input text, in input order.
    """
    return predict_sentiment_batch(
        texts, model, tokenizer, device, max_len=max_len, batch_size=batch_size
    )

Método de Avaliação final:
1. Avalia a coluna 'review_comment_message', caso seja nulo vai pro próximo passo
2. Avalia a coluna 'review_comment_title', caso seja nulo vai pro próximo passo
3. Avalia a coluna 'review_score', sendo que se for < 3 é negativo, caso contrário será positivo

In [None]:
# Stage 1: rows whose review message is present and not blank are labelled by the model.
mask_msg = main_df['review_comment_message'].notna() & main_df['review_comment_message'].str.strip().ne("")
main_df.loc[mask_msg, 'sentiment'] = predict_sentiment_in_batches(
    main_df.loc[mask_msg, 'review_comment_message'].tolist(),
    model, tokenizer, device
)

# Stage 2: rows still unlabelled that have a non-blank review title are labelled from the title.
mask_title = main_df['sentiment'].isna() & main_df['review_comment_title'].notna() & main_df['review_comment_title'].str.strip().ne("")
main_df.loc[mask_title, 'sentiment'] = predict_sentiment_in_batches(
    main_df.loc[mask_title, 'review_comment_title'].tolist(),
    model, tokenizer, device
)

# Stage 3: remaining rows fall back to the numeric score; below 3 is negative, 3 and above is positive.
mask_score = main_df['sentiment'].isna() & main_df['review_score'].notna()
main_df.loc[mask_score, 'sentiment'] = main_df.loc[mask_score, 'review_score'].apply(
    lambda x: 'negative' if x < 3 else 'positive'
)

In [None]:
main_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,sentiment
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59,positive
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13,positive
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24,positive
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,positive
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,positive


## Salvando a tabela em um arquivo csv

In [None]:
# Write the labelled table without the DataFrame index; the 'utf-8-sig' codec prepends a byte-order mark to the file.
# NOTE(review): the path is relative, unlike the Drive paths used for the inputs — confirm the file ends up where it is expected.
main_df.to_csv('olist_order_reviews_avaliado.csv', index=False, encoding='utf-8-sig')

Alguns ajustes foram feitos no Power BI