In [None]:
%pip install transformers

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the preprocessed training CSV directly from the project's GitHub repository
# and preview its first rows.
train_file_path = (
    'https://raw.githubusercontent.com/EduardoCavValenca/Automatic-Detection-of-Fake-News-in-Portuguese/main/data/csvs/train.csv'
)
df = pd.read_csv(filepath_or_buffer=train_file_path)
df.head()

Unnamed: 0,content,label,rating
0,﻿juiz determina soltura de 4 presos por pensão...,1,0.031619
1,"pivô do mensalão, jefferson não assistiu à tra...",1,0.033124
2,oficial da reserva avisa: se a lei do impeachm...,0,0.008723
3,bolsonaro vai para o psl e liberais abandonam ...,1,0.016806
4,"por maioria de votos, os ministros da 2.a tur...",1,0.021402


In [4]:
# Pull the documents and their labels out of the frame as plain Python lists
texts = df['content'].to_list()
labels = df['label'].to_list()

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Function to evaluate a model
def evaluate_model(clf, X_train, y_train, X_test, y_test,
                   n_splits=5, n_repeats=30, random_state=42):
    """Cross-validate ``clf`` on the training split, then score it on the test split.

    The estimator is scored with repeated K-fold cross-validation on
    ``(X_train, y_train)``, then fitted on the full training split and used to
    predict ``X_test``. All results are printed; nothing is returned.

    Parameters
    ----------
    clf : estimator
        Object accepted by ``cross_val_score`` that also provides ``fit`` and
        ``predict``. It is fitted in place on the full training split.
    X_train, y_train
        Training features and labels (used for cross-validation and the final fit).
    X_test, y_test
        Held-out features and labels used for the final report.
    n_splits : int, default=5
        Number of folds per cross-validation repetition.
    n_repeats : int, default=30
        Number of times the K-fold procedure is repeated.
    random_state : int or None, default=42
        Seed passed to ``RepeatedKFold`` so the fold assignment is repeatable.

    Returns
    -------
    None
    """
    # Repeated K-fold configuration (defaults reproduce the original 5x30, seed 42)
    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    # Cross-validate on the training split only; the test split stays untouched here
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=rkf, scoring='accuracy')

    # Fit on the complete training split before the final evaluation
    clf.fit(X_train, y_train)

    # Predict the held-out test split
    y_pred = clf.predict(X_test)

    print(f"Cross-validation Accuracy (mean ± std): {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

## BERT

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Use a pre-trained English model (e.g., BERT)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# NOTE(review): this model is loaded but not used by any statement in this cell;
# the features built below are the tokenizer's raw `input_ids`, not model outputs.
# Confirm whether model embeddings were intended as features.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize and encode sequences in the training and test sets.
# padding='max_length' pads every sequence to exactly `max_length` tokens, so the
# train and test matrices are guaranteed to have the same number of columns.
# (padding=True pads only to the longest sequence of each call, which can give the
# two splits different widths and make prediction on the test set fail.)
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=512)

# Transform tokenized encodings into the format suitable for scikit-learn models
X_train = np.array(train_encodings['input_ids'])
X_test = np.array(test_encodings['input_ids'])
y_train = np.array(train_labels)
y_test = np.array(test_labels)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Evaluate RandomForest
# random_state is fixed (same seed as the data split and the CV folds) so the
# forest's bootstrap/feature sampling, and therefore the scores, are repeatable.
print("RandomForest Classifier:")
rf_clf = RandomForestClassifier(random_state=42)
evaluate_model(rf_clf, X_train, y_train, X_test, y_test)

RandomForest Classifier:
Cross-validation Accuracy (mean ± std): 0.8915 ± 0.0090
Test Accuracy: 0.9071
              precision    recall  f1-score   support

           0       0.98      0.84      0.90       594
           1       0.85      0.98      0.91       558

    accuracy                           0.91      1152
   macro avg       0.91      0.91      0.91      1152
weighted avg       0.92      0.91      0.91      1152



In [9]:
# Evaluate MLP
# random_state is fixed so weight initialisation and batch shuffling are
# repeatable across runs (same seed as the data split and the CV folds).
print("\nMLP Classifier:")
mlp_clf = MLPClassifier(random_state=42)
evaluate_model(mlp_clf, X_train, y_train, X_test, y_test)


MLP Classifier:
Cross-validation Accuracy (mean ± std): 0.7858 ± 0.0165
Test Accuracy: 0.8290
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       594
           1       0.83      0.81      0.82       558

    accuracy                           0.83      1152
   macro avg       0.83      0.83      0.83      1152
weighted avg       0.83      0.83      0.83      1152



In [10]:
# Support-vector classifier with scikit-learn's default hyper-parameters,
# scored on the features produced by the tokenization cell above.
print("\nSVM Classifier:")
svm_clf = SVC()
evaluate_model(
    clf=svm_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


SVM Classifier:
Cross-validation Accuracy (mean ± std): 0.8804 ± 0.0099
Test Accuracy: 0.8941
              precision    recall  f1-score   support

           0       0.98      0.81      0.89       594
           1       0.83      0.98      0.90       558

    accuracy                           0.89      1152
   macro avg       0.90      0.90      0.89      1152
weighted avg       0.91      0.89      0.89      1152



In [11]:
# Evaluate DecisionTree
# random_state is fixed so tie-breaking between equally good splits is
# repeatable across runs (same seed as the data split and the CV folds).
print("\nDecisionTree Classifier:")
dt_clf = DecisionTreeClassifier(random_state=42)
evaluate_model(dt_clf, X_train, y_train, X_test, y_test)


DecisionTree Classifier:
Cross-validation Accuracy (mean ± std): 0.8240 ± 0.0128
Test Accuracy: 0.8481
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       594
           1       0.84      0.85      0.84       558

    accuracy                           0.85      1152
   macro avg       0.85      0.85      0.85      1152
weighted avg       0.85      0.85      0.85      1152



## BERTimbau

In [5]:
from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel  # or BertModel, for BERT without pretraining heads

# NOTE(review): this model is loaded but not used by any statement in this cell;
# the features built below are the tokenizer's raw `input_ids`, not model outputs.
# Confirm whether model embeddings were intended as features.
model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)


# Tokenize and encode sequences in the training and test sets.
# padding='max_length' pads every sequence to exactly `max_length` tokens, so the
# train and test matrices are guaranteed to have the same number of columns.
# (padding=True pads only to the longest sequence of each call, which can give the
# two splits different widths and make prediction on the test set fail.)
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=512)

# Transform tokenized encodings into the format suitable for scikit-learn models
X_train = np.array(train_encodings['input_ids'])
X_test = np.array(test_encodings['input_ids'])
y_train = np.array(train_labels)
y_test = np.array(test_labels)

### Sem ajuste

In [9]:
# Evaluate RandomForest
# random_state is fixed (same seed as the data split and the CV folds) so the
# forest's bootstrap/feature sampling, and therefore the scores, are repeatable.
print("RandomForest Classifier:")
rf_clf = RandomForestClassifier(random_state=42)
evaluate_model(rf_clf, X_train, y_train, X_test, y_test)

RandomForest Classifier:


In [9]:
# Evaluate MLP
# random_state is fixed so weight initialisation and batch shuffling are
# repeatable across runs (same seed as the data split and the CV folds).
print("\nMLP Classifier:")
mlp_clf = MLPClassifier(random_state=42)
evaluate_model(mlp_clf, X_train, y_train, X_test, y_test)


MLP Classifier:
Cross-validation Accuracy (mean ± std): 0.8737 ± 0.0118
Test Accuracy: 0.8767
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       594
           1       0.87      0.88      0.87       558

    accuracy                           0.88      1152
   macro avg       0.88      0.88      0.88      1152
weighted avg       0.88      0.88      0.88      1152



In [10]:
# Support-vector classifier with scikit-learn's default hyper-parameters,
# scored on the features currently held in X_train / X_test.
print("\nSVM Classifier:")
svm_clf = SVC()
evaluate_model(
    clf=svm_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


SVM Classifier:
Cross-validation Accuracy (mean ± std): 0.9317 ± 0.0078
Test Accuracy: 0.9384
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       594
           1       0.92      0.96      0.94       558

    accuracy                           0.94      1152
   macro avg       0.94      0.94      0.94      1152
weighted avg       0.94      0.94      0.94      1152



In [11]:
# Evaluate DecisionTree
# random_state is fixed so tie-breaking between equally good splits is
# repeatable across runs (same seed as the data split and the CV folds).
print("\nDecisionTree Classifier:")
dt_clf = DecisionTreeClassifier(random_state=42)
evaluate_model(dt_clf, X_train, y_train, X_test, y_test)


DecisionTree Classifier:
Cross-validation Accuracy (mean ± std): 0.8934 ± 0.0089
Test Accuracy: 0.8993
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       594
           1       0.91      0.88      0.89       558

    accuracy                           0.90      1152
   macro avg       0.90      0.90      0.90      1152
weighted avg       0.90      0.90      0.90      1152



### Ajuste de parâmetros

#### RF:
bootstrap: [True, False]  
max_depth: [5, 10, 20, 30, 40, 50]  
max_features: ['auto', 'sqrt', 'log2']  
min_samples_leaf: [1, 2, 4]  
min_samples_split: [2, 5, 10]  
n_estimators: [200, 400, 600, 800, 1000]  
criterion: ['gini', 'entropy']  


In [None]:
# Evaluate RandomForest with explicitly chosen hyper-parameters.
# random_state is fixed (same seed as the data split and the CV folds) so the
# forest's bootstrap/feature sampling, and therefore the scores, are repeatable.
print("RandomForest Classifier:")
rf_clf = RandomForestClassifier(
    bootstrap=True,
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    criterion='gini',
    random_state=42,
)
evaluate_model(rf_clf, X_train, y_train, X_test, y_test)

RandomForest Classifier:


NameError: name 'evaluate_model' is not defined

## mBERT

In [12]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# NOTE(review): this model is loaded but not used by any statement in this cell;
# the features built below are the tokenizer's raw `input_ids`, not model outputs.
# Confirm whether model embeddings were intended as features.
model = BertModel.from_pretrained('bert-base-multilingual-cased')


# Tokenize and encode sequences in the training and test sets.
# padding='max_length' pads every sequence to exactly `max_length` tokens, so the
# train and test matrices are guaranteed to have the same number of columns.
# (padding=True pads only to the longest sequence of each call, which can give the
# two splits different widths and make prediction on the test set fail.)
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=512)

# Transform tokenized encodings into the format suitable for scikit-learn models
X_train = np.array(train_encodings['input_ids'])
X_test = np.array(test_encodings['input_ids'])
y_train = np.array(train_labels)
y_test = np.array(test_labels)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
# Evaluate RandomForest
# random_state is fixed (same seed as the data split and the CV folds) so the
# forest's bootstrap/feature sampling, and therefore the scores, are repeatable.
print("RandomForest Classifier:")
rf_clf = RandomForestClassifier(random_state=42)
evaluate_model(rf_clf, X_train, y_train, X_test, y_test)

RandomForest Classifier:
Cross-validation Accuracy (mean ± std): 0.9372 ± 0.0075
Test Accuracy: 0.9384
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       594
           1       0.92      0.96      0.94       558

    accuracy                           0.94      1152
   macro avg       0.94      0.94      0.94      1152
weighted avg       0.94      0.94      0.94      1152



In [14]:
# Evaluate MLP
# random_state is fixed so weight initialisation and batch shuffling are
# repeatable across runs (same seed as the data split and the CV folds).
print("\nMLP Classifier:")
mlp_clf = MLPClassifier(random_state=42)
evaluate_model(mlp_clf, X_train, y_train, X_test, y_test)


MLP Classifier:
Cross-validation Accuracy (mean ± std): 0.8566 ± 0.0147
Test Accuracy: 0.8620
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       594
           1       0.86      0.86      0.86       558

    accuracy                           0.86      1152
   macro avg       0.86      0.86      0.86      1152
weighted avg       0.86      0.86      0.86      1152



In [15]:
# Support-vector classifier with scikit-learn's default hyper-parameters,
# scored on the features currently held in X_train / X_test.
print("\nSVM Classifier:")
svm_clf = SVC()
evaluate_model(
    clf=svm_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


SVM Classifier:
Cross-validation Accuracy (mean ± std): 0.9248 ± 0.0083
Test Accuracy: 0.9366
              precision    recall  f1-score   support

           0       0.97      0.90      0.94       594
           1       0.90      0.97      0.94       558

    accuracy                           0.94      1152
   macro avg       0.94      0.94      0.94      1152
weighted avg       0.94      0.94      0.94      1152



In [16]:
# Evaluate DecisionTree
# random_state is fixed so tie-breaking between equally good splits is
# repeatable across runs (same seed as the data split and the CV folds).
print("\nDecisionTree Classifier:")
dt_clf = DecisionTreeClassifier(random_state=42)
evaluate_model(dt_clf, X_train, y_train, X_test, y_test)


DecisionTree Classifier:
Cross-validation Accuracy (mean ± std): 0.8771 ± 0.0101
Test Accuracy: 0.8776
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       594
           1       0.87      0.88      0.87       558

    accuracy                           0.88      1152
   macro avg       0.88      0.88      0.88      1152
weighted avg       0.88      0.88      0.88      1152

