# Utilizando embeddings do BERTimbau como entrada em um modelo de regressão logística

In [8]:
from transformers import AutoModel, AutoTokenizer
import torch
import pandas as pd
from sklearn.linear_model import LogisticRegression
import os

In [9]:
#Importando o Modelo BERTimbau
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
model = AutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased")

In [10]:
def getEmbeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)    
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

In [11]:
def load_corpus_fakeBR():
    base_path = "fakebr/size_normalized_texts"

    fake_news = []
    true_news = []
    for root, dirs, files in os.walk(base_path + "/fake"):
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    text = f.read()
                    fake_news.append([text, 0])

    for root, dirs, files in os.walk(base_path + "/true"):
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    text = f.read()
                    true_news.append([text, 1])


    all_news = fake_news + true_news
    return pd.DataFrame(all_news, columns=['text', 'label'])

In [12]:
df = load_corpus_fakeBR()
df.head()

Unnamed: 0,text,label
0,Kátia Abreu diz que vai colocar sua expulsão e...,0
1,"Dr. Ray peita Bolsonaro, chama-o de “conservad...",0
2,Reinaldo Azevedo desmascarado pela Polícia Fed...,0
3,Relatório assustador do BNDES mostra dinheiro ...,0
4,"Radialista americano fala sobre o PT: ""Eles ve...",0


In [13]:
embeddings = [getEmbeddings(text, tokenizer, model) for text in df['text']]

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, cross_val_predict
import numpy as np

In [15]:
kfolds = KFold(n_splits=5, shuffle=True, random_state=42)

logModel = LogisticRegression(max_iter=1000)

In [17]:
accuracies, precisions, recalls, f1_scores, macro_f1s = [], [], [], [], []

for train_index, test_index in kfolds.split(embeddings):
    x_train, x_test = np.array(embeddings)[train_index], np.array(embeddings)[test_index]
    y_train, y_test = np.array(df['label'])[train_index], np.array(df['label'])[test_index]
    logModel.fit(x_train, y_train)
    y_pred = logModel.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')  # Para classificação binária
    recall = recall_score(y_test, y_pred, average='binary')  # Para classificação binária
    f1 = f1_score(y_test, y_pred, average='binary')  # Para classificação binária
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    macro_f1s.append(macro_f1)

In [21]:
# Criar um DataFrame com os resultados
results = pd.DataFrame({
    'accuracy': [np.mean(accuracies)],
    'precision': [np.mean(precisions)],
    'recall': [np.mean(recalls)],
    'f1': [np.mean(f1_scores)],
    'macro_f1': [np.mean(macro_f1s)]
})

# Salvar o DataFrame em um arquivo CSV
results.to_csv("logistic_results.csv", index=False)


In [22]:
results

Unnamed: 0,accuracy,precision,recall,f1,macro_f1
0,0.968333,0.967285,0.969503,0.968386,0.968326


# Testing

In [40]:
news = ["China é um veneno mundial!", "Vacinas salvam vidas", "Bolsonarismo representa a volta do terror", "Lula é ladrão"]
news_embeddings = np.array([getEmbeddings(text, tokenizer, model) for text in news])
pred = logModel.predict(news_embeddings)
print(pred)

[0 1 1 0]


In [41]:
from joblib import dump

dump(logModel,  'logistic_model.joblib') #Salva o modelo

['logistic_model.joblib']

In [42]:
from joblib import load

loaded_model = load('logistic_model.joblib')
pred_load = loaded_model.predict(news_embeddings)
print(pred_load)

[0 1 1 0]
