In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from joblib import load
import pandas as pd

In [2]:
def load_corpus_factbr(path="factckbr/corpus.tsv"):
    """Load the fact-checking corpus and return its true/false claims with labels.

    Only rows whose ``alternativeName`` verdict is exactly "Verdadeiro" or
    "verdadeiro" (label 1) or exactly "Falso" or "falso" (label 0) are kept;
    rows with any other verdict are ignored.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the tab-separated corpus file. It must have a header row
        with ``claimReviewed`` and ``alternativeName`` columns. Defaults to the
        path this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``text`` (the reviewed claim) and ``label`` (1 = true,
        0 = false). All true claims come first, followed by all false claims,
        on a fresh 0..n-1 index. Rows with a missing claim are NOT removed here.
    """
    corpus = pd.read_csv(path, sep='\t', header=0)
    verdict = corpus["alternativeName"]

    # Select with a single .loc call instead of chained boolean indexing.
    true_claims = corpus.loc[verdict.isin(["Verdadeiro", "verdadeiro"]), "claimReviewed"]
    false_claims = corpus.loc[verdict.isin(["Falso", "falso"]), "claimReviewed"]

    # Keep the historical row order: every true claim, then every false claim.
    records = [(claim, 1) for claim in true_claims]
    records += [(claim, 0) for claim in false_claims]

    return pd.DataFrame(records, columns=['text', 'label'])

In [3]:
# Build the labelled evaluation frame (columns: text, label) from the corpus file.
df = load_corpus_factbr()

In [4]:
# Preview the first rows to sanity-check the text/label columns.
df.head()

Unnamed: 0,text,label
0,"Papa envia terço a Lula, preso político há 67 ...",1
1,Hoje a classificação do Brasil é superior à da...,1
2,Haddad é réu por improbidade em ação que apura...,1
3,'Haddad é acusado de enriquecimento ilícito po...,1
4,'addad é acusado de improbidade em ação que in...,1


In [5]:
# Evaluate the fine-tuned BERTimbau model: load its tokenizer and its
# sequence-classification weights from the local directories below.
tokenizer_path = "../tokenizer_BERTimbau"
model_path = "../model_BERTimbau"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [14]:
# List the rows whose claim text is missing (NaN); the following dropna step
# removes them.
print(df[df['text'].isna()])

     text  label
118   NaN      1
313   NaN      0
320   NaN      0
329   NaN      0
353   NaN      0
372   NaN      0
379   NaN      0
386   NaN      0
402   NaN      0
958   NaN      0
1028  NaN      0


In [15]:
# Discard rows without claim text. dropna keeps the original index labels, so
# the index has gaps after this step.
# NOTE(review): this rebinds `df`, so the unfiltered frame is no longer
# reachable without re-running the loading cell.
df = df.dropna(subset=['text'])

In [16]:
def tokenize(data):
    """Encode the ``text`` field of ``data`` with the notebook-level ``tokenizer``.

    The text is truncated to at most 512 tokens and padded up to exactly that
    length, so every encoded example has the same size.

    Parameters
    ----------
    data : mapping
        Must provide a ``'text'`` entry in a form the tokenizer accepts.

    Returns
    -------
    The tokenizer's output for ``data['text']``.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(data['text'], **encoding_options)

In [17]:
from datasets import Dataset

In [18]:
# Convert the cleaned frame into a Hugging Face Dataset. After dropna the frame
# has a gapped index, which from_pandas would otherwise store as an extra
# `__index_level_0__` column; preserve_index=False keeps only text and label.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Tokenize in batches: the tokenizer accepts a list of texts, and because each
# example is padded to a fixed length the encodings are the same as with
# row-by-row mapping, with far fewer tokenizer calls.
tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/1052 [00:00<?, ? examples/s]

In [19]:
# The raw strings are no longer needed once encoded; drop them, then have the
# dataset return its columns as torch tensors.
tokenized_dataset = tokenized_dataset.remove_columns(['text'])
tokenized_dataset.set_format("torch")

In [20]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to that device. As the cell's last expression, the returned
# module is also displayed below.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [21]:
from torch.utils.data import DataLoader

# Inference batch size; adjust as needed.
batch_size = 16
# shuffle=False keeps predictions in the same order as the dataset rows.
test_dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=False)

# Collect the predicted class ids and the true labels.
all_preds, all_labels = [], []

# Evaluation mode: the dropout layers are inactive during inference.
model.eval()

with torch.no_grad():
    for batch in test_dataloader:
        labels = batch["label"].to(device)

        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # The predicted class is the index of the largest logit.
        preds = outputs.logits.argmax(dim=-1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
# Score the fine-tuned model's predictions against the true labels.
accuracy = accuracy_score(all_labels, all_preds)
# average='binary' reports the score of the positive class only; with
# scikit-learn's default pos_label that is label 1 (the "true" claims).
precision = precision_score(all_labels, all_preds, average='binary')  # binary classification
recall = recall_score(all_labels, all_preds, average='binary')  # binary classification
f1 = f1_score(all_labels, all_preds, average='binary')  # binary classification
# Macro F1: unweighted mean of the per-class F1 scores.
macro_f1 = f1_score(all_labels, all_preds, average='macro')

In [26]:
# Gather the fine-tuned model's scores as single-element columns so they form
# a one-row table.
results = dict(
    accuracy=[accuracy],
    precision=[precision],
    recall=[recall],
    f1=[f1],
    macro_f1=[macro_f1],
)
print(results)

# Save to CSV (without the index column).
df_metrics = pd.DataFrame(results)
df_metrics.to_csv("metrics_test_finetuning.csv", index=False)

{'accuracy': [0.4714828897338403], 'precision': [0.1264957264957265], 'recall': [0.6218487394957983], 'f1': [0.21022727272727273], 'macro_f1': [0.4065422077922078]}


In [27]:
from transformers import AutoModel
# Load the BERTimbau model and tokenizer from the Hugging Face hub; they are
# used below to compute embeddings.
# NOTE: this rebinds `tokenizer` and `model`, replacing the fine-tuned pair
# loaded earlier. Re-running the earlier evaluation cells after this one would
# silently use these objects instead.
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
model = AutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased")

In [28]:
# Load the saved classifier that is applied to the embeddings below
# (presumably a logistic-regression model, judging by the file name).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts from a trusted source.
loaded_model = load('logistic_model.joblib')

In [29]:
def _model_device(model):
    """Return the device holding ``model``'s weights, or None if it cannot be determined."""
    device = getattr(model, "device", None)
    if device is not None:
        return device
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    return None if first_param is None else first_param.device


def getEmbeddings(text, tokenizer, model):
    """Return the first-token embedding of ``text`` as a NumPy array.

    Parameters
    ----------
    text : str or list of str
        Input passed to ``tokenizer``; truncated to at most 512 tokens.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors="pt", truncation=True,
        padding=True, max_length=512)``; must return a mapping of tensors.
    model : callable (typically a ``torch.nn.Module``)
        Called with the encoded tensors as keyword arguments; its output must
        expose a ``last_hidden_state`` tensor.

    Returns
    -------
    numpy.ndarray
        ``last_hidden_state`` at sequence position 0, with size-1 dimensions
        squeezed out (a 1-D vector when a single text is encoded).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Place the inputs on the model's device; otherwise a model that was moved
    # to the GPU fails with a device-mismatch error.
    device = _model_device(model)
    if device is not None:
        inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

    with torch.no_grad():
        outputs = model(**inputs)

    # .cpu() is a no-op on CPU tensors and is required before .numpy() on CUDA ones.
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

In [32]:
import numpy as np
# Embed every claim (one getEmbeddings call per row) and stack the resulting
# vectors into a single array with one row per claim.
news_embeddings = np.array([getEmbeddings(text, tokenizer, model) for text in df['text']])
# True labels, in the same row order as the embeddings.
labels = np.array(df['label'])

In [33]:

# Classify the embeddings with the loaded model and score the predictions.
pred = loaded_model.predict(news_embeddings)
accuracy = accuracy_score(labels, pred)
# average='binary' reports the score of the positive class only; with
# scikit-learn's default pos_label that is label 1 (the "true" claims).
precision = precision_score(labels, pred, average='binary')  # binary classification
recall = recall_score(labels, pred, average='binary')  # binary classification
f1 = f1_score(labels, pred, average='binary')  # binary classification
# Macro F1: unweighted mean of the per-class F1 scores.
macro_f1 = f1_score(labels, pred, average='macro')

In [34]:
# Gather the embedding-classifier scores as single-element columns so they
# form a one-row table.
results = dict(
    accuracy=[accuracy],
    precision=[precision],
    recall=[recall],
    f1=[f1],
    macro_f1=[macro_f1],
)
print(results)

# Save to CSV (without the index column).
df_metrics = pd.DataFrame(results)
df_metrics.to_csv("metrics_test_log.csv", index=False)

{'accuracy': [0.6730038022813688], 'precision': [0.14511041009463724], 'recall': [0.3865546218487395], 'f1': [0.21100917431192662], 'macro_f1': [0.5023870811607595]}
