In [None]:
!pip uninstall -y transformers
!pip install transformers==4.46.3 datasets==3.0.2 accelerate==0.34.2



In [None]:
from google.colab import files  # upload the final train and test datasets

# Opens the Colab upload dialog; the result is kept in `uploaded`.
uploaded = files.upload()


In [None]:
import pandas as pd
from datasets import Dataset

# Read the semicolon-separated train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("/content/train_dataset.csv", "/content/test_dataset.csv")
)

# Wrap each DataFrame in a Hugging Face Dataset.
train_dataset, test_dataset = map(Dataset.from_pandas, (train_df, test_df))


In [None]:
# Load the BERT model and its tokenizer.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pretrained checkpoint name shared by the tokenizer and the model.
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# num_labels=2: final layer with two outputs (binary classification).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


In [None]:
def tokenize_function(examples):
    """Tokenize the "texto" field of ``examples`` with the notebook's tokenizer.

    Args:
        examples: mapping that provides the texts under the "texto" key.

    Returns:
        Whatever the global ``tokenizer`` returns for those texts.
    """
    tokenizer_options = {
        "padding": "max_length",  # pad every text up to max_length
        "truncation": True,       # cut texts that are longer than max_length
        "max_length": 256,        # length limit, in tokens
    }
    return tokenizer(examples["texto"], **tokenizer_options)


# Apply tokenize_function in batched mode to the train and test datasets.
train_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)


In [None]:
#hugging face para configurar e treinar
from transformers import TrainingArguments, Trainer

# scikit-learn p calcular métricas de desempenho
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import numpy as np
# função do Trainer para calcular as métricas
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (array reduced with argmax over axis 1 to get the
            predicted class of each example).

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    # Weighted average across classes; the fourth returned value is unused.
    prec, rec, f_score, _unused = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f_score,
        "precision": prec,
        "recall": rec,
    }


In [None]:

import os

# Turn off the Weights & Biases integration through its environment switch.
os.environ["WANDB_DISABLED"] = "true"

# NOTE(review): eval_dataset below is the tokenized test split and
# load_best_model_at_end is enabled, so the checkpoint restored after training
# is chosen using test data — consider holding out a separate validation split.
training_args = TrainingArguments(
    output_dir="/content/bertimbau_fake_news",   # checkpoints are written here
    eval_strategy="epoch",                       # evaluate every epoch (current name of the deprecated `evaluation_strategy`)
    save_strategy="epoch",                       # save a checkpoint every epoch
    learning_rate=2e-5,                          # learning rate
    per_device_train_batch_size=8,               # training batch size
    per_device_eval_batch_size=8,                # evaluation batch size
    num_train_epochs=5,
    weight_decay=0.01,                           # regularization against overfitting
    logging_dir="/content/logs",                 # log directory
    load_best_model_at_end=True,                 # reload the best model when training ends
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,    # tokenized training data
    eval_dataset=test_tokenized,      # tokenized test data
    tokenizer=tokenizer,              # tokenizer used for the text
    compute_metrics=compute_metrics,  # metric function defined earlier in the notebook
)

trainer.train()


In [None]:
# Run the trainer's evaluation (its eval_dataset is the tokenized test split)
# and print what it returns.
results = trainer.evaluate()
print(results)


In [None]:
# Store the model and its tokenizer side by side in the same directory.
final_model_dir = "/content/bertimbau_fakenews_final"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


In [None]:
from google.colab import drive   # used to save the results to Google Drive
drive.mount('/content/drive')

# Copies the Trainer output directory (the per-epoch checkpoints) to Drive.
# NOTE(review): the directory written by save_pretrained
# (/content/bertimbau_fakenews_final) is a different path and is not copied here.
!cp -r /content/bertimbau_fake_news /content/drive/MyDrive/


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): an earlier cell already mounts Drive; presumably this cell is
# for resuming in a new session — confirm whether it is still needed.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# List the contents of the checkpoint directory copied to Drive.
!ls /content/drive/MyDrive/bertimbau_fake_news


In [None]:
!zip -r /content/modelo_fakenews_final.zip "/content/drive/MyDrive/bertimbau_fake_news/checkpoint-7450"
from google.colab import files
files.download("/content/modelo_fakenews_final.zip")  #zipar o ultimo checkpoint e baixar
