In [1]:
!pip install transformers

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import numpy as np
import pandas as pd



# Configurações do modelo e tokenizador

---



In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
NUM_LABELS = 2        # number of logits produced by the classifier head
BATCH_SIZE = 8        # samples per DataLoader batch
NUM_EPOCHS = 10       # passes over the training loader
LEARNING_RATE = 5e-5  # AdamW learning rate

# Seed the global RNGs so that head initialisation, dropout masks and
# DataLoader shuffling are repeatable after "Restart & Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
# Hub checkpoint ids; each tokenizer is loaded from the same id as its encoder.
BERT_CHECKPOINT = 'bert-base-multilingual-cased'
ROBERTA_CHECKPOINT = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Tokenizers first, then the bare encoders (AutoModel, i.e. without a task head).
tokenizer_bert = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
tokenizer_roberta = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)

model_bert = AutoModel.from_pretrained(BERT_CHECKPOINT)
model_roberta = AutoModel.from_pretrained(ROBERTA_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

# Funções Classifier e Treino

In [4]:
class Classifier(nn.Module):
    """Two-layer classification head on top of an encoder.

    The encoder's first-position (CLS) hidden vector is projected to
    ``hidden_dim`` units, passed through GELU and dropout, and mapped to
    ``num_labels`` logits.

    Args:
        model: Encoder module. It must expose ``config.hidden_size`` and, when
            called with ``input_ids`` / ``attention_mask`` keyword arguments,
            return an object whose element ``[0]`` can be indexed as
            ``[:, 0, :]``.
        num_labels: Number of output logits.
        hidden_dim: Width of the intermediate linear layer (default 400).
        dropout: Dropout probability applied after the activation (default 0.5).
    """

    def __init__(self, model, num_labels, hidden_dim=400, dropout=0.5):
        super().__init__()
        self.base_model = model
        self.cls = nn.Linear(model.config.hidden_size, hidden_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.cls2 = nn.Linear(hidden_dim, num_labels)
        self.gelu = nn.GELU()

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the logits for a batch.

        ``token_type_ids`` is forwarded to the encoder only when it is given,
        so encoders whose ``forward`` has no such argument can be wrapped too.
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        outputs = self.base_model(**encoder_inputs)

        sequence_output = outputs[0][:, 0, :]  # hidden vector at position 0 (CLS)
        prediction = self.cls(sequence_output)
        prediction = self.gelu(prediction)
        prediction = self.dropout(prediction)
        prediction = self.cls2(prediction)
        return prediction

In [5]:
def train(model, train_loader, optimizer, loss_fct, device=None):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Module invoked as ``model(**batch)``; its output is handed to
            ``loss_fct`` together with the labels.
        train_loader: Iterable of ``(batch, labels)`` pairs, where ``batch`` is
            a dict whose values have a ``.to(device)`` method and ``labels``
            has one as well.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called once
            per batch.
        loss_fct: Callable ``loss_fct(outputs, labels)`` returning a scalar
            tensor.
        device: Device the batch and labels are moved to. Defaults to the
            module-level ``DEVICE``.

    Returns:
        Mean of the per-batch loss values, or NaN if the loader yields nothing.
    """
    if device is None:
        device = DEVICE

    model.train()
    epoch_losses = []
    for batch, labels in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = loss_fct(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    if not epoch_losses:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float("nan")
    return np.mean(epoch_losses)

In [6]:
def evaluate(model, val_loader, device=None):
    """Collect true and predicted class indices over ``val_loader``.

    The model is switched to eval mode and run without gradient tracking; the
    predicted class is the argmax over the last dimension of its output.

    Args:
        model: Module invoked as ``model(**batch)``.
        val_loader: Iterable of ``(batch, labels)`` pairs, where ``batch`` is a
            dict whose values have a ``.to(device)`` method and ``labels`` has
            a ``.tolist()`` method.
        device: Device the batch is moved to. Defaults to the module-level
            ``DEVICE``.

    Returns:
        Tuple ``(y_true, y_pred)`` of Python lists, in loader order.
    """
    if device is None:
        device = DEVICE

    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch, labels in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs, dim=-1)
            y_true.extend(labels.tolist())
            y_pred.extend(predictions.cpu().tolist())
    return y_true, y_pred

# Dataset

In [7]:
class MyDataset(Dataset):
    """Dataset that pairs per-sample slices of ``encodings`` with a label."""

    def __init__(self, encodings, labels):
        # encodings: mapping (needs ``.items()``) from field name to an indexable container
        # labels: indexable sequence with one entry per sample
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``; the label is a tensor."""
        features = {}
        for field, values in self.encodings.items():
            features[field] = values[idx]
        target = torch.tensor(self.labels[idx])
        return features, target

In [10]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Make the Drive root the working directory, so the relative paths used later
# in the notebook ("dataset.csv", the saved .pth files) resolve there.
import os
os.chdir('/content/drive/MyDrive/')

In [12]:
def load_dataset(csv_file, sample_size=None, random_state=42):
    """Read texts and labels from a CSV file with ``text`` and ``label`` columns.

    Args:
        csv_file: Path (or any other input ``pandas.read_csv`` accepts).
        sample_size: Optional cap on the number of rows. When given, that many
            rows (all of them if the file is smaller) are drawn at random
            without replacement. ``None`` keeps every row in file order.
        random_state: Seed for the row sampling; unused when ``sample_size``
            is ``None``.

    Returns:
        Tuple ``(texts, labels)`` of NumPy arrays of equal length, aligned row
        by row.

    Raises:
        KeyError: If the ``text`` or ``label`` column is missing.
    """
    data = pd.read_csv(csv_file)

    if sample_size is not None:
        # Previously this argument was accepted but silently ignored.
        data = data.sample(n=min(sample_size, len(data)), random_state=random_state)

    texts = data['text'].tolist()
    labels = data['label'].tolist()

    xtrain_global = np.array(texts)
    ytrain_global = np.array(labels)

    return xtrain_global, ytrain_global

# Treinar Modelos

In [13]:
    # Read every row of the CSV as (texts, labels) arrays.
    xtrain_global, ytrain_global = load_dataset("dataset.csv") # load the dataset

    # Shuffle with a fixed seed and hold out 30% of the rows for validation.
    xtrain, xval, ytrain, yval = train_test_split(
        xtrain_global, ytrain_global, test_size=0.30, random_state=42, shuffle=True
    )

In [14]:
    # Both tokenizers receive the same texts and the same truncation/padding options.
    train_texts = xtrain.tolist()
    val_texts = xval.tolist()
    tokenize_kwargs = dict(truncation=True, padding=True, max_length=512, return_tensors="pt")

    # BERT encodings
    train_encodings_bert = tokenizer_bert(train_texts, **tokenize_kwargs)
    val_encodings_bert = tokenizer_bert(val_texts, **tokenize_kwargs)

    # RoBERTa encodings
    train_encodings_roberta = tokenizer_roberta(train_texts, **tokenize_kwargs)
    val_encodings_roberta = tokenizer_roberta(val_texts, **tokenize_kwargs)


In [15]:
    # BERT: wrap each tokenized split in a dataset and batch it.
    # Only the training loader shuffles.
    train_dataset_bert = MyDataset(encodings=train_encodings_bert, labels=ytrain)
    val_dataset_bert = MyDataset(encodings=val_encodings_bert, labels=yval)
    train_loader_bert = DataLoader(dataset=train_dataset_bert, shuffle=True, batch_size=BATCH_SIZE)
    val_loader_bert = DataLoader(dataset=val_dataset_bert, batch_size=BATCH_SIZE)

    # RoBERTa: same labels, encodings produced by the RoBERTa tokenizer.
    train_dataset_roberta = MyDataset(encodings=train_encodings_roberta, labels=ytrain)
    val_dataset_roberta = MyDataset(encodings=val_encodings_roberta, labels=yval)
    train_loader_roberta = DataLoader(dataset=train_dataset_roberta, shuffle=True, batch_size=BATCH_SIZE)
    val_loader_roberta = DataLoader(dataset=val_dataset_roberta, batch_size=BATCH_SIZE)

In [17]:
    # Wrap each loaded encoder in the Classifier head and move it to DEVICE.
    # NOTE(review): these two lines rebind model_bert / model_roberta from the
    # bare encoder to the wrapped Classifier. Re-running this cell without
    # re-running the loading cell would therefore pass a Classifier (which
    # defines no `.config`) into Classifier.__init__ -- confirm, and consider
    # giving the wrapped models distinct names.
    model_bert = Classifier(model_bert, NUM_LABELS).to(DEVICE)
    model_roberta = Classifier(model_roberta, NUM_LABELS).to(DEVICE)

    # One AdamW optimizer per model, over all of its parameters (encoder and head).
    optimizer_bert = AdamW(model_bert.parameters(), lr=LEARNING_RATE)
    optimizer_roberta = AdamW(model_roberta.parameters(), lr=LEARNING_RATE)

In [18]:
    # Loss applied to the logits returned by the classifiers; shared by both training loops.
    loss_fct = nn.CrossEntropyLoss()

In [37]:
    # Train BERT for NUM_EPOCHS epochs, printing the mean training loss of each one.
    # NOTE(review): nothing in this cell re-initialises the model or optimizer,
    # so every re-run continues from the current weights -- confirm that the
    # reported losses come from a fresh model.
    for epoch in range(NUM_EPOCHS):
        train_loss_bert = train(model_bert, train_loader_bert, optimizer_bert, loss_fct)
        print(f"[BERT] Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {train_loss_bert:.4f}")

    # Predict on the validation loader once, after the last epoch.
    y_true_bert, y_pred_bert = evaluate(model_bert, val_loader_bert)

[BERT] Epoch 1/10, Loss: 0.0029
[BERT] Epoch 2/10, Loss: 0.0026
[BERT] Epoch 3/10, Loss: 0.0022
[BERT] Epoch 4/10, Loss: 0.0023
[BERT] Epoch 5/10, Loss: 0.0018
[BERT] Epoch 6/10, Loss: 0.0015
[BERT] Epoch 7/10, Loss: 0.0016
[BERT] Epoch 8/10, Loss: 0.0014
[BERT] Epoch 9/10, Loss: 0.0011
[BERT] Epoch 10/10, Loss: 0.0010


In [38]:
    # Train RoBERTa for NUM_EPOCHS epochs, printing the mean training loss of each one.
    # NOTE(review): nothing in this cell re-initialises the model or optimizer,
    # so every re-run continues from the current weights -- confirm that the
    # reported losses come from a fresh model.
    for epoch in range(NUM_EPOCHS):
        train_loss_roberta = train(model_roberta, train_loader_roberta, optimizer_roberta, loss_fct)
        print(f"[Roberta] Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {train_loss_roberta:.4f}")

    # Predict on the validation loader once, after the last epoch.
    y_true_roberta, y_pred_roberta = evaluate(model_roberta, val_loader_roberta)

[Roberta] Epoch 1/10, Loss: 0.0024
[Roberta] Epoch 2/10, Loss: 0.0024
[Roberta] Epoch 3/10, Loss: 0.0017
[Roberta] Epoch 4/10, Loss: 0.0018
[Roberta] Epoch 5/10, Loss: 0.0017
[Roberta] Epoch 6/10, Loss: 0.0013
[Roberta] Epoch 7/10, Loss: 0.0014
[Roberta] Epoch 8/10, Loss: 0.0011
[Roberta] Epoch 9/10, Loss: 0.0012
[Roberta] Epoch 10/10, Loss: 0.0010


# Salvar modelos

In [None]:
    # Save only the weights (state_dict) of each classifier. The file names are
    # relative, so they are written to the current working directory.
    torch.save(model_bert.state_dict(), "model-bert.pth")
    torch.save(model_roberta.state_dict(), "model-roberta.pth")

# Testar modelo roberta

In [22]:
# Confusion matrix for RoBERTa on the validation loader (arguments: true labels, then predictions).
metrics.confusion_matrix(y_true_roberta,y_pred_roberta)

array([[17,  0],
       [ 0, 20]])

In [28]:
# Classify a single sentence with the RoBERTa classifier.
# eval() disables dropout and no_grad() skips gradient tracking, so the
# prediction does not depend on the mode an earlier cell left the model in.
token = tokenizer_roberta(["leve dor de barriga"], return_tensors="pt").to(DEVICE)
model_roberta.eval()
with torch.no_grad():
    out = model_roberta(
        input_ids=token["input_ids"],
        attention_mask=token["attention_mask"]
    )

In [35]:
# Per-class precision / recall / F1 for RoBERTa on the validation predictions.
print(metrics.classification_report(y_true_roberta, y_pred_roberta))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      1.00      1.00        20

    accuracy                           1.00        37
   macro avg       1.00      1.00      1.00        37
weighted avg       1.00      1.00      1.00        37



In [29]:
# Index of the highest logit for the first (only) sentence in `out`:
# 0 is reported as 'nao urgente', anything else as 'urgente'.
predicted_class = torch.argmax(out, dim=-1)[0].cpu().item()
print('nao urgente' if predicted_class == 0 else 'urgente')

nao urgente


# Testar modelo bert

In [30]:
# Confusion matrix for BERT on the validation loader (arguments: true labels, then predictions).
metrics.confusion_matrix(y_true_bert,y_pred_bert)

array([[17,  0],
       [ 1, 19]])

In [33]:
# Classify a single sentence with the BERT classifier.
# eval() disables dropout and no_grad() skips gradient tracking, so the
# prediction does not depend on the mode an earlier cell left the model in.
token = tokenizer_bert(["leve dor de barriga"], return_tensors="pt").to(DEVICE)
model_bert.eval()
with torch.no_grad():
    out = model_bert(
        input_ids=token["input_ids"],
        attention_mask=token["attention_mask"]
    )

In [36]:
# Per-class precision / recall / F1 for BERT on the validation predictions.
print(metrics.classification_report(y_true_bert, y_pred_bert))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        17
           1       1.00      0.95      0.97        20

    accuracy                           0.97        37
   macro avg       0.97      0.97      0.97        37
weighted avg       0.97      0.97      0.97        37



In [34]:
# Index of the highest logit for the first (only) sentence in `out`:
# 0 is reported as 'nao urgente', anything else as 'urgente'.
predicted_class = torch.argmax(out, dim=-1)[0].cpu().item()
print('nao urgente' if predicted_class == 0 else 'urgente')

nao urgente
