In [1]:
import torch
import wandb
from sklearn.utils import resample
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score
)
from torch.nn.functional import softmax

In [2]:
# Start a dedicated W&B run for the evaluation of the first model
run = wandb.init(project="Pedidos_Respostas_LLM_Completo", name="avaliacao_modelo_sem_pesos")

# Download the test-embeddings artifact through the run
artifact = run.use_artifact("test_embeddings:latest")  # make sure this matches the logged artifact name
artifact_dir = artifact.download()
artifact_path = f"{artifact_dir}/test_embeddings.pt"

# Load the test embeddings and labels.
# map_location="cpu" keeps the load working on machines without a GPU even if
# the tensors were saved from a CUDA device; batches are moved to `device`
# later, inside the evaluation loop.
data = torch.load(artifact_path, map_location="cpu")
test_embeddings = data['embeddings']
test_labels = data['labels']  # labels corresponding to the embeddings above

[34m[1mwandb[0m: Currently logged in as: [33mloureirolucas98[0m ([33mloureirolucas98-ufrn[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Downloading large artifact test_embeddings:latest, 197.23MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.7


In [3]:
# Balance the test set by random undersampling: every class is reduced to the
# size of the least frequent class.
#
# The subsample is drawn from a dedicated, seeded generator so that
# "Restart & Run All" evaluates exactly the same balanced subset every time
# (and the metrics logged for the different models stay comparable across runs).
BALANCE_SEED = 42
balance_generator = torch.Generator().manual_seed(BALANCE_SEED)

unique_classes, counts = torch.unique(test_labels, return_counts=True)
min_class_count = counts.min().item()

balanced_embeddings = []
balanced_labels = []

for cls in unique_classes:
    # Select the embeddings and labels that belong to the current class
    cls_mask = (test_labels == cls)
    cls_embeddings = test_embeddings[cls_mask]
    cls_labels = test_labels[cls_mask]

    # Random (but reproducible) choice of `min_class_count` rows of this class
    indices = torch.randperm(len(cls_embeddings), generator=balance_generator)[:min_class_count]

    cls_embeddings_balanced = cls_embeddings[indices]
    cls_labels_balanced = cls_labels[indices]

    balanced_embeddings.append(cls_embeddings_balanced)
    balanced_labels.append(cls_labels_balanced)

# Concatenate the per-class samples into single tensors
balanced_embeddings = torch.cat(balanced_embeddings, dim=0)
balanced_labels = torch.cat(balanced_labels, dim=0)

# Report the dataset size before and after balancing
print(f"Tamanho antes do balanceamento: {len(test_labels)}")
print(f"Tamanho após o balanceamento: {len(balanced_labels)}")


# shuffle=False: the iteration order does not affect the evaluation metrics
balanced_dataset = TensorDataset(balanced_embeddings, balanced_labels)
balanced_dataloader = DataLoader(balanced_dataset, batch_size=32, shuffle=False)

Tamanho antes do balanceamento: 67147
Tamanho após o balanceamento: 34550


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Hugging Face Hub repository where the first model was saved
model_name = "Lorero/bert-treinado-pedidos-completo"

# Fetch the tokenizer and the sequence-classification model from the Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [5]:
# Evaluate the model on the balanced set, log the metrics to W&B and close the run.
model.eval()  # inference mode for the evaluation pass
val_loss = 0
val_true_labels = []
val_predicted_labels = []
val_probabilities = []

with torch.no_grad():
    progress_bar = tqdm(balanced_dataloader, desc="Validando", leave=False)
    for batch in progress_bar:
        # unsqueeze(1) inserts a singleton axis at position 1 before the tensor
        # is passed as `inputs_embeds` (presumably the sequence axis -- confirm
        # against how the stored embeddings were produced).
        embeddings = batch[0].to(device).unsqueeze(1)
        labels = batch[1].to(device)

        outputs = model(inputs_embeds=embeddings, labels=labels)
        logits = outputs.logits
        val_loss += outputs.loss.item()

        # Class probabilities and hard predictions
        probabilities = softmax(logits, dim=-1).cpu().numpy()
        predictions = torch.argmax(logits, dim=-1)

        val_true_labels.extend(labels.cpu().numpy())
        val_predicted_labels.extend(predictions.cpu().numpy())
        val_probabilities.extend(probabilities)

# Average of the per-batch losses, computed once and reused below
val_mean_loss = val_loss / len(balanced_dataloader)

# Validation metrics. zero_division=0 makes the behaviour explicit when a class
# is never predicted: the score for that class is 0 (same value as the default),
# without emitting an undefined-metric warning.
val_accuracy = accuracy_score(val_true_labels, val_predicted_labels)
val_f1 = f1_score(val_true_labels, val_predicted_labels, average="weighted", zero_division=0)
val_precision = precision_score(val_true_labels, val_predicted_labels, average="weighted", zero_division=0)
val_recall = recall_score(val_true_labels, val_predicted_labels, average="weighted", zero_division=0)
val_mcc = matthews_corrcoef(val_true_labels, val_predicted_labels)

# ROC-AUC is only computed for binary problems, using the probability of class 1
if len(set(val_true_labels)) == 2:
    val_roc_auc = roc_auc_score(val_true_labels, [prob[1] for prob in val_probabilities])
else:
    val_roc_auc = None  # not computed for multiclass problems

# Log the metrics to W&B. val_roc_auc is passed through unchanged: it is either
# a float (0.0 included) or None, so no truthiness test is applied to it.
wandb.log({
    "val_loss": val_mean_loss,
    "val_accuracy": val_accuracy,
    "val_f1": val_f1,
    "val_precision": val_precision,
    "val_recall": val_recall,
    "val_mcc": val_mcc,
    "val_roc_auc": val_roc_auc,
})

# `is not None` (rather than truthiness) so that a ROC-AUC of exactly 0.0 is
# still reported instead of being shown as 'N/A'.
print(f"Modelo sem pesos - Validation - Loss: {val_mean_loss:.4f} - "
      f"Accuracy: {val_accuracy:.4f} - F1: {val_f1:.4f} - "
      f"Precision: {val_precision:.4f} - Recall: {val_recall:.4f} - "
      f"MCC: {val_mcc:.4f} - ROC-AUC: {val_roc_auc if val_roc_auc is not None else 'N/A'}")

# Close the W&B run
run.finish()

Validando:   0%|          | 0/1080 [00:00<?, ?it/s]

Modelo sem pesos - Validation - Loss: 0.6850 - Accuracy: 0.7032 - F1: 0.6866 - Precision: 0.7576 - Recall: 0.7032 - MCC: 0.4575 - ROC-AUC: 0.7992824208711969


0,1
val_accuracy,▁
val_f1,▁
val_loss,▁
val_mcc,▁
val_precision,▁
val_recall,▁
val_roc_auc,▁

0,1
val_accuracy,0.70318
val_f1,0.68664
val_loss,0.68496
val_mcc,0.45755
val_precision,0.75759
val_recall,0.70318
val_roc_auc,0.79928


In [6]:
# Open a separate W&B run for the evaluation of the second model
run = wandb.init(project="Pedidos_Respostas_LLM_Completo", name="avaliacao_modelo_com_pesos")

# Hugging Face Hub repository where the second model was saved
model_name = "Lorero/bert-treinado-pedidos-completo-v2"

# Fetch the tokenizer and the sequence-classification model from the Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Evaluate the model on the balanced set, print the metrics, log them to W&B
# and close the run.
model.eval()  # inference mode for the evaluation pass
val_loss = 0
val_true_labels = []
val_predicted_labels = []
val_probabilities = []

with torch.no_grad():
    progress_bar = tqdm(balanced_dataloader, desc="Validando", leave=False)
    for batch in progress_bar:
        # unsqueeze(1) inserts a singleton axis at position 1 before the tensor
        # is passed as `inputs_embeds` (presumably the sequence axis -- confirm
        # against how the stored embeddings were produced).
        embeddings = batch[0].to(device).unsqueeze(1)
        labels = batch[1].to(device)

        outputs = model(inputs_embeds=embeddings, labels=labels)
        logits = outputs.logits
        val_loss += outputs.loss.item()

        # Class probabilities and hard predictions
        probabilities = softmax(logits, dim=-1).cpu().numpy()
        predictions = torch.argmax(logits, dim=-1)

        val_true_labels.extend(labels.cpu().numpy())
        val_predicted_labels.extend(predictions.cpu().numpy())
        val_probabilities.extend(probabilities)

# Average of the per-batch losses, computed once and reused below
val_mean_loss = val_loss / len(balanced_dataloader)

# Validation metrics. zero_division=0 makes the behaviour explicit when a class
# is never predicted: the score for that class is 0 (same value as the default),
# without emitting an undefined-metric warning.
val_accuracy = accuracy_score(val_true_labels, val_predicted_labels)
val_f1 = f1_score(val_true_labels, val_predicted_labels, average="weighted", zero_division=0)
val_precision = precision_score(val_true_labels, val_predicted_labels, average="weighted", zero_division=0)
val_recall = recall_score(val_true_labels, val_predicted_labels, average="weighted", zero_division=0)
val_mcc = matthews_corrcoef(val_true_labels, val_predicted_labels)

# ROC-AUC is only computed for binary problems, using the probability of class 1
if len(set(val_true_labels)) == 2:
    val_roc_auc = roc_auc_score(val_true_labels, [prob[1] for prob in val_probabilities])
else:
    val_roc_auc = None  # not computed for multiclass problems

# Show the metrics. `is not None` (rather than truthiness) so that a ROC-AUC of
# exactly 0.0 is still reported instead of being shown as 'N/A'.
print(f"Validation - Loss: {val_mean_loss:.4f} - "
      f"Accuracy: {val_accuracy:.4f} - F1: {val_f1:.4f} - "
      f"Precision: {val_precision:.4f} - Recall: {val_recall:.4f} - "
      f"MCC: {val_mcc:.4f} - ROC-AUC: {val_roc_auc if val_roc_auc is not None else 'N/A'}")

# Log the metrics to W&B. val_roc_auc is passed through unchanged: it is either
# a float (0.0 included) or None, so no truthiness test is applied to it.
wandb.log({
    "model_name": "bert-modelo-com-pesos",
    "val_loss": val_mean_loss,
    "val_accuracy": val_accuracy,
    "val_f1": val_f1,
    "val_precision": val_precision,
    "val_recall": val_recall,
    "val_mcc": val_mcc,
    "val_roc_auc": val_roc_auc,
})

# Close the W&B run
run.finish()

Validando:   0%|          | 0/1080 [00:00<?, ?it/s]

Validation - Loss: 0.6934 - Accuracy: 0.5000 - F1: 0.3333 - Precision: 0.2500 - Recall: 0.5000 - MCC: 0.0000 - ROC-AUC: 0.549753979739508


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
val_accuracy,▁
val_f1,▁
val_loss,▁
val_mcc,▁
val_precision,▁
val_recall,▁
val_roc_auc,▁

0,1
model_name,bert-modelo-com-peso...
val_accuracy,0.5
val_f1,0.33333
val_loss,0.69336
val_mcc,0
val_precision,0.25
val_recall,0.5
val_roc_auc,0.54975
