# 02 - Fact Checking

This notebook performs the fact-checking task on the claims that were extracted and normalized in the previous notebook. It loads the previously generated train and test datasets, tokenizes them, and fine-tunes a Portuguese BERT sequence classifier (`neuralmind/bert-large-portuguese-cased`) to classify each claim as true or fake. Per-epoch evaluation metrics are written to disk and, optionally, the trained model is saved.

### Imports

In [1]:
# Native
import os
import json
import shutil
import logging

# Third-party
import torch
import sklearn
import evaluate
import numpy as np
import pandas as pd
from tqdm import tqdm
from emoji import demojize
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
		Trainer,
    EarlyStoppingCallback,
)

  from .autonotebook import tqdm as notebook_tqdm


### Setup

In [2]:
# Configure logging so that re-running this cell never stacks extra handlers
root_logger = logging.getLogger()

if root_logger.handlers:
    # Handlers are already registered (cell re-run or pre-configured kernel):
    # only align the levels instead of adding another handler.
    root_logger.setLevel(logging.INFO)
    for handler in root_logger.handlers:
        handler.setLevel(logging.INFO)
    # Kept from the original setup to limit duplicated output
    root_logger.propagate = False
else:
    # First run: install the default handler with the notebook's log format
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

### Constants

In [3]:
# Execution Constants
# Timestamp of this run; it is appended to OUTPUT_PATH so each run writes to its own directory.
TIMESTAMP = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")

# Dataset Constants
DATASET_NAME = "fakebr"
# Task directory the input data is read from (e.g. original, claim_extraction, claim_normalization).
DATASET_TASK = "claim_normalization"
# Optional sub-directory of the task directory; leave empty to read from the task directory itself.
DATASET_PROCESS_ID = "gpt-5-nano_2025-11-06_18-00-24"

# Paths Constants
# The optional sub-directory is built outside the path f-strings: reusing double
# quotes inside a double-quoted f-string only parses on Python >= 3.12.
_PROCESS_SUBDIR = f"{DATASET_PROCESS_ID}/" if DATASET_PROCESS_ID else ""
DATA_PATH = f"../data/{DATASET_NAME}/{DATASET_TASK}/{_PROCESS_SUBDIR}"
OUTPUT_PATH = f"../data/{DATASET_NAME}/results/{DATASET_TASK}/{_PROCESS_SUBDIR}{TIMESTAMP}"
MODEL_PATH = f"{OUTPUT_PATH}/model/"
METRICS_PATH = f"{OUTPUT_PATH}/metrics/"

# Model Constants
MODEL_NAME = "neuralmind/bert-large-portuguese-cased"
# When False, no checkpoints or final model are written (see the training and save cells).
SAVE_MODEL = False

### Verify GPU Availability and Info

In [4]:
# Report which device this run will train on
if not torch.cuda.is_available():
    logging.info("No GPU found, training on CPU")
else:
    cuda_version = torch.version.cuda
    gpu_name = torch.cuda.get_device_name(0)
    logging.info(f"Torch CUDA version: {cuda_version}; GPU: {gpu_name}")

2025-11-13 01:28:51,326 - INFO - Torch CUDA version: 12.4; GPU: NVIDIA GeForce RTX 4060 Ti


### Load and Setup Tokenizer

In [5]:
# Load and Setup Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    do_lower_case=False,
    normalization=True,
)


def _demojize_pt(text):
    """Apply `demojize` to `text` with `language="pt"`."""
    return demojize(text, language="pt")


# Both attribute spellings are set, as in the original setup.
# NOTE(review): whether this tokenizer class reads either attribute cannot be
# confirmed from this notebook - verify against the tokenizer implementation.
tokenizer.demoizer = _demojize_pt
tokenizer.demojizer = _demojize_pt

# Preprocessing Function
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the notebook's tokenizer.

    Inputs are truncated to at most 512 tokens. The tokenizer's return value
    is passed back unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)

### Load Dataset

In [6]:
# Map Label Function
def map_label(example):
    """Convert a string label to its integer id using `label2id`.

    Non-string labels are left untouched. Strings that are not a key of
    `label2id` become None so they can be filtered out afterwards.
    The example is modified in place and returned.
    """
    raw_label = example["label"]

    # Only string labels need translating
    if not isinstance(raw_label, str):
        return example

    example["label"] = label2id.get(raw_label, None)
    return example

# Filter Function
def filter_missing_labels(example):
    """Return True when the example's label is present (i.e. not None)."""
    label = example["label"]
    return label is not None

# Define label mappings (string label <-> integer id)
label2id = {"true": 0, "fake": 1}
id2label = {label_id: label_name for label_name, label_id in label2id.items()}

# Define dataset files (train/test CSVs located under DATA_PATH)
train_file = f"{DATA_PATH}train.csv"
test_file = f"{DATA_PATH}test.csv"

# Load both splits, rename the class column to "label" and map string labels to ids
dataset = (
    load_dataset("csv", data_files={"train": train_file, "test": test_file})
    .rename_column("classificacao", "label")
    .map(map_label, batched=False)
)

# Tokenize dataset, dropping every column that is not explicitly kept
kept_columns = ("custom_id", "text", "label")
remove_cols = [name for name in dataset["train"].column_names if name not in kept_columns]
tokenized = dataset.map(preprocess_function, batched=True, remove_columns=remove_cols)

# Filter out examples whose label is None after mapping
tokenized = tokenized.filter(filter_missing_labels)

### Load Model

In [7]:
# Load Model
# Prefer the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="single_label_classification",
    num_labels=2,
    label2id=label2id,
    id2label=id2label,
)
model = model.to(device)

# Check if model is using GPU or CPU
model_device = next(model.parameters()).device
logging.info(f"Model device: {model_device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-13 01:28:57,696 - INFO - Model device: cuda:0


### Define Metrics Computation Function

In [8]:
# Metrics Computation Function
_METRIC_NAMES = ["accuracy", "f1", "precision", "recall"]
_clf_metrics = None  # Combined metric, loaded on first use and reused afterwards


def _get_clf_metrics():
    """Return the combined metric, loading it only on the first call."""
    global _clf_metrics
    if _clf_metrics is None:
        _clf_metrics = evaluate.combine(_METRIC_NAMES)
    return _clf_metrics


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores; if it is a tuple, its first element is used.

    Returns:
        The dictionary returned by the combined metric's ``compute``.
    """
    # Unpack predictions and labels
    preds, labels = eval_pred

    # Some models return extra outputs next to the class scores; keep the scores only
    if isinstance(preds, tuple):
        preds = preds[0]

    # Predicted class = index of the highest score along the last axis
    predictions = np.argmax(preds, axis=-1)

    # Reuse the cached metric instead of reloading it on every evaluation
    return _get_clf_metrics().compute(predictions=predictions, references=labels)

### Define Training Arguments

In [9]:
# Define EarlyStoppingCallback with patience of 2 epochs
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,  # Stop training if no improvement for 2 epochs
)

# Enable bf16 mixed precision only where the hardware supports it, so the CPU
# fallback announced earlier in the notebook keeps working.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Training Arguments
training_args = TrainingArguments(
    # Checkpoints are only written when the model is going to be saved
    output_dir=os.path.join(MODEL_PATH, "checkpoints") if SAVE_MODEL else None,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=10,
    logging_strategy="epoch",
    weight_decay=0.01,
    eval_strategy="epoch",
    do_eval=True,
    save_strategy="epoch" if SAVE_MODEL else "no",
    save_total_limit=5,
    # Restoring the best checkpoint needs saved checkpoints, hence tied to SAVE_MODEL
    load_best_model_at_end=SAVE_MODEL,
    metric_for_best_model="eval_loss",
    fp16=False,
    bf16=use_bf16,
)

# Trainer
# NOTE(review): the "test" split is used as eval_dataset, so early stopping and
# best-epoch selection are driven by the test data - confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

  trainer = Trainer(


### Train Model

In [10]:
# Run training; the result is left as the cell's last expression so it is displayed
train_output = trainer.train()
train_output

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5207,0.39219,0.815716,0.796311,0.888508,0.721448
2,0.3088,0.443571,0.825452,0.806178,0.904679,0.727019
3,0.1524,0.59549,0.82128,0.801544,0.89948,0.722841


TrainOutput(global_step=720, training_loss=0.32728298505147296, metrics={'train_runtime': 286.7226, 'train_samples_per_second': 200.786, 'train_steps_per_second': 8.37, 'total_flos': 3951292511053188.0, 'train_loss': 0.32728298505147296, 'epoch': 3.0})

### Evaluate Model

In [11]:
# Metric columns every exported per-epoch row must provide
_EPOCH_METRIC_KEYS = ["training_loss", "eval_loss", "accuracy", "f1", "precision", "recall"]
_EVAL_PREFIX = "eval_"


def _collect_epoch_rows(log_history):
    """Pair each evaluation log entry with the most recent training loss.

    Entries without an "epoch" key are ignored. An entry containing "loss" only
    updates the remembered training loss. An entry containing "eval_loss"
    produces a row once a training loss has been seen; its other "eval_*" keys
    are stored without the prefix. Rows missing any required metric are dropped.
    """
    rows = []
    latest_training_loss = None

    for entry in log_history:
        # Only consider entries that belong to an epoch (skip step-only logs)
        if "epoch" not in entry:
            continue

        # Training log entry: remember its loss for the next evaluation entry
        if "loss" in entry:
            latest_training_loss = entry["loss"]
            continue

        # Anything else must be an evaluation entry preceded by a training loss
        if "eval_loss" not in entry or latest_training_loss is None:
            continue

        row = {
            "epoch": entry["epoch"],
            "training_loss": latest_training_loss,
            "eval_loss": entry["eval_loss"],
        }
        for key, value in entry.items():
            if key.startswith(_EVAL_PREFIX) and key != "eval_loss":
                # Strip only the leading prefix, not later occurrences of "eval_"
                row[key[len(_EVAL_PREFIX):]] = value

        if all(key in row for key in _EPOCH_METRIC_KEYS):
            rows.append(row)

    return rows


def _best_epoch_metrics(epoch_df):
    """Return the metrics of the row with the lowest eval_loss as a plain dict.

    "training_loss" is removed and every key other than "epoch" and "eval_loss"
    is prefixed with "eval_".
    """
    # Round-trip through JSON to obtain plain Python values
    best = json.loads(epoch_df.loc[epoch_df["eval_loss"].idxmin()].to_json())
    best.pop("training_loss", None)
    return {
        (key if key in ("epoch", "eval_loss") else _EVAL_PREFIX + key): value
        for key, value in best.items()
    }


# Check if path exists
os.makedirs(METRICS_PATH, exist_ok=True)

# Extract per-epoch metrics from log_history
log_history = getattr(trainer.state, "log_history", [])
rows = _collect_epoch_rows(log_history)

if rows:
    # Build DataFrame, sort by epoch (numerical) and keep one row per epoch
    required_columns = ["epoch"] + _EPOCH_METRIC_KEYS
    df = pd.DataFrame(rows)[required_columns].sort_values("epoch")
    df = df.drop_duplicates(subset=["epoch"], keep="first")

    # Get best epoch based on lowest eval_loss
    best_epoch_row = _best_epoch_metrics(df)

    # Save best model metrics to JSON
    best_model_metrics_path = os.path.join(METRICS_PATH, "best_model_metrics.json")
    with open(best_model_metrics_path, "w", encoding="utf-8") as f:
        json.dump(best_epoch_row, f)

    # Save per-epoch metrics to CSV
    csv_path = os.path.join(METRICS_PATH, "epoch_metrics.csv")
    df.to_csv(csv_path, index=False)

    # Log saved path
    logging.info(f"Saved best-model and per-epoch metrics to {csv_path}")
else:
    logging.warning(
        "No epoch-level entries found in trainer.state.log_history."
    )

2025-11-13 01:33:44,671 - INFO - Saved best-model and per-epoch metrics to ../data/fakebr/results/claim_normalization/gpt-5-nano_2025-11-06_18-00-24/2025-11-13_01-28-51/metrics/epoch_metrics.csv


### Save Model

In [12]:
if SAVE_MODEL:
    # Make sure the target directory exists
    best_model_dir = os.path.join(MODEL_PATH, "best_model")
    os.makedirs(best_model_dir, exist_ok=True)

    # Save model (Trainer save)
    trainer.save_model(best_model_dir)

    # Save Safetensors-safe Weights
    try:
        # Attempt to save weights using safetensors (no torch>=2.6 requirement to load safetensors files)
        model.save_pretrained(best_model_dir, safe_serialization=True)
        logging.info(f"Saved model weights using safetensors at {MODEL_PATH}")
    except Exception as e:
        # Best effort: keep the notebook running, but report the actual failure
        # instead of assuming the package is missing.
        logging.warning(
            f"Could not save using safetensors ({type(e).__name__}: {e}). "
            "If you want safetensors output, install the 'safetensors' package: pip install safetensors."
        )
elif os.path.exists(MODEL_PATH):
    # Model saving is disabled: delete the model directory if it exists
    shutil.rmtree(MODEL_PATH)
    logging.info(f"Model saving disabled. Deleted model directory at {MODEL_PATH}.")

### Clean GPU VRAM

In [13]:
# Ask PyTorch to empty its CUDA cache; skipped entirely when no GPU is present
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.empty_cache()