# 02 - Fact Checking

This notebook performs the fact-checking task on the texts produced by the previous notebook. It loads the train/test CSV datasets generated previously (for the original, claim-extraction, or claim-normalization task), tokenizes them, fine-tunes a pretrained Portuguese BERT sequence classifier to label each text as true or fake, and then saves the evaluation metrics and the trained model.

### Imports

In [None]:
# Native
import os
import json
import logging

# Third-party
import torch
import sklearn
import evaluate
import numpy as np
import pandas as pd
from tqdm import tqdm
from emoji import demojize
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
		Trainer,
)

### Setup

In [None]:
# Configure logging
# Root logger emits INFO and above as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

### Constants

In [None]:
# Execution Constants
# Timestamp of this run; used to give every run its own results directory.
TIMESTAMP = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")

# Dataset Constants
DATASET_NAME = "faketweetbr"
DATASET_TASK = "original"
DATASET_PROCESS_ID = ""

# Paths Constants
# Optional "<process id>/" path component; empty when no process id is set.
# Computed once outside the f-strings: reusing the same quote character inside
# an f-string replacement field is a SyntaxError on Python versions before 3.12.
_PROCESS_DIR = f"{DATASET_PROCESS_ID}/" if DATASET_PROCESS_ID else ""

# The {DATASET_TASK} component names the task that produced the data
# (i.e., original, claim_extraction, claim_normalization).
DATA_PATH = f"../data/{DATASET_NAME}/{DATASET_TASK}/{_PROCESS_DIR}"
OUTPUT_PATH = f"../data/{DATASET_NAME}/results/{DATASET_TASK}/{_PROCESS_DIR}{TIMESTAMP}"
MODEL_PATH = f"{OUTPUT_PATH}/model/"
METRICS_PATH = f"{OUTPUT_PATH}/metrics/"

# Model Constants
MODEL_NAME = "neuralmind/bert-large-portuguese-cased"

### Verify GPU Availability and Info

In [None]:
# Log GPU info
# Report which accelerator (if any) is visible to PyTorch before training.
if not torch.cuda.is_available():
    logging.info("No GPU found, training on CPU")
else:
    cuda_version = torch.version.cuda
    gpu_name = torch.cuda.get_device_name(0)
    logging.info(f"Torch CUDA version: {cuda_version}; GPU: {gpu_name}")

### Load and Setup Tokenizer

In [None]:
# Load and Setup Tokenizer
# Loads the tokenizer published with MODEL_NAME, passing do_lower_case=False.
# NOTE(review): `normalization=True` and the `demojizer` attribute look like
# BERTweet-tokenizer conventions — confirm that the tokenizer class resolved
# for MODEL_NAME actually honors them.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, do_lower_case=False, normalization=True
)
# Attach a callable that runs `demojize` with language="pt" under both
# attribute names.
# NOTE(review): `demoizer` looks like a misspelling of `demojizer` — confirm
# whether anything reads it.
tokenizer.demoizer = tokenizer.demojizer = lambda x: demojize(x, language="pt")

# Preprocessing Function
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of the given examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text(s) to
            tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        truncation enabled.
    """
    # NOTE(review): no explicit max_length is passed, so truncation relies on
    # the tokenizer's own configured maximum — confirm it is set for this model.
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

### Load Dataset

In [None]:
# Map Label Function
def map_label(example):
    """Replace a string label with its integer id from ``label2id``.

    Args:
        example: Mapping with a ``"label"`` entry. It is modified in place.

    Returns:
        The same ``example``. Non-string labels are left untouched; string
        labels missing from ``label2id`` become ``None`` so they can be
        filtered out later.
    """
    raw_label = example["label"]

    # Anything that is not a string is passed through unchanged.
    if not isinstance(raw_label, str):
        return example

    example["label"] = label2id.get(raw_label)
    return example

# Filter Function
def filter_missing_labels(example):
    """Return ``True`` when the example's ``"label"`` entry is not ``None``."""
    label = example["label"]
    return label is not None

# Define label mappings (string label <-> integer class id)
label2id = {"true": 0, "fake": 1}
id2label = {idx: name for name, idx in label2id.items()}

# Define dataset files (DATA_PATH already ends with a path separator)
train_file = f"{DATA_PATH}train.csv"
test_file = f"{DATA_PATH}test.csv"

# Load both CSV splits into a single dataset dictionary
dataset = load_dataset("csv", data_files={"train": train_file, "test": test_file})

# Rename the source label column, then convert string labels to ids
dataset = dataset.rename_column("classificacao", "label").map(map_label, batched=False)

# Tokenize dataset, dropping every column except the ones listed below
remove_cols = [
    column
    for column in dataset["train"].column_names
    if column not in ("custom_id", "text", "label")
]
tokenized = dataset.map(preprocess_function, batched=True, remove_columns=remove_cols)

# Filter out examples whose label could not be mapped
tokenized = tokenized.filter(filter_missing_labels)

### Load Model

In [None]:
# Load Model
# Binary sequence classifier initialised from the pretrained checkpoint, with
# the label mappings stored in the model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="single_label_classification",
    num_labels=2,
    label2id=label2id,
    id2label=id2label,
)

# Move the model to the GPU when one is available, otherwise keep it on CPU
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Check if model is using GPU or CPU
model_device = next(model.parameters()).device
logging.info(f"Model device: {model_device}")

### Define Metrics Computation Function

In [None]:
# Metrics Computation Function
# Combined metric object, created on the first call and reused afterwards so
# the metrics are not reloaded on every evaluation pass.
_clf_metrics = None


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        eval_pred: Pair ``(preds, labels)`` where ``preds`` holds per-class
            scores along the last axis and ``labels`` the reference class ids.

    Returns:
        The result of the combined metrics' ``compute`` call for the
        predicted class ids against ``labels``.
    """
    global _clf_metrics

    # Unpack predictions and labels
    preds, labels = eval_pred

    # Predicted class = index of the highest score along the last axis
    predictions = np.argmax(preds, axis=-1)

    # Load the metrics only once; later calls reuse the same object
    if _clf_metrics is None:
        _clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

    # Compute and return metrics
    return _clf_metrics.compute(predictions=predictions, references=labels)

### Define Training Arguments

In [None]:
# Training Arguments
# Enable bf16 mixed precision only when a CUDA device that supports it is
# present, so the configuration also works on CPU or on GPUs without bf16
# support instead of being rejected.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=5,
    logging_strategy="epoch",
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies are kept equal
    # because load_best_model_at_end is enabled.
    eval_strategy="epoch",
    do_eval=True,
    save_strategy="epoch",
    save_total_limit=5,
    load_best_model_at_end=True,
    fp16=False,
    bf16=use_bf16,  # Automatic mixed precision (e.g. Ada Lovelace Architecture) when supported.
)

# Trainer
# Collator that pads the tokenized examples when they are batched.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The "test" split is used as the evaluation set during training.
# NOTE(review): newer transformers releases deprecate Trainer's `tokenizer=`
# argument in favor of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
)

### Train Model

In [None]:
# Train model
# Runs the training loop of the `trainer` object; as the last expression of
# the cell, its return value is displayed as the cell output.
trainer.train()

### Evaluate Model

In [None]:
# Make sure the metrics directory exists
os.makedirs(METRICS_PATH, exist_ok=True)

# Evaluate model on the trainer's evaluation dataset
metrics = trainer.evaluate()

# Log metrics, one line per entry
logging.info("Evaluation Metrics:")
for name, score in metrics.items():
    logging.info(f"  {name}: {score}")

# Save metrics as JSON next to the other run outputs
metrics_file = os.path.join(METRICS_PATH, "metrics.json")
with open(metrics_file, "w") as f:
    json.dump(metrics, f)

### Save Model

In [None]:
# Make sure the model directory exists
os.makedirs(MODEL_PATH, exist_ok=True)

# Save model (Trainer save)
trainer.save_model(MODEL_PATH)

# Save Safetensors-safe Weights
# Best-effort step: a failure here must not abort the notebook, but the real
# cause is logged (with traceback) instead of being silently discarded.
try:
    # Attempt to save weights using safetensors (no torch>=2.6 requirement to load safetensors files)
    model.save_pretrained(MODEL_PATH, safe_serialization=True)
except Exception:
    logging.warning(
        "Could not save using safetensors. If you want safetensors output, install the 'safetensors' package: pip install safetensors.",
        exc_info=True,
    )
else:
    logging.info(f"Saved model weights using safetensors at {MODEL_PATH}")