# 02 - Fact Checking

This notebook is responsible for performing the fact-checking task on the claims that were extracted and normalized in the previous notebook. It loads the datasets generated previously, creates batches of jobs for fact-checking, and processes these jobs so that the LLM can classify the claims as true or false.

### Imports

In [9]:
# Native
import logging

# Third-party
import torch
import sklearn
import evaluate
import numpy as np
from tqdm import tqdm
from emoji import demojize
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
		Trainer,
)

### Setup

In [10]:
# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

### Constants

In [11]:
# Dataset Constants
DATASET_NAME = "faketweetbr"
DATASET_TASK = "original"
DATASET_PROCESS_ID = ""

# Paths Constants
DATA_PATH = f"../data/{DATASET_NAME}/{DATASET_TASK}/{DATASET_PROCESS_ID + "/" if DATASET_PROCESS_ID else ""}" # Last path corresponds to the task that original data (i.e., original, claim_extraction, claim_normalization).
OUTPUT_PATH = f"../data/{DATASET_NAME}/results/{DATASET_TASK}/{DATASET_PROCESS_ID + "/" if DATASET_PROCESS_ID else ""}"

# Model Constants
MODEL_NAME = "neuralmind/bert-large-portuguese-cased"

### Verify GPU Availability and Info

In [20]:
logging.info(f"Torch CUDA available: {torch.cuda.is_available()}")

# Log GPU info
if torch.cuda.is_available():
    logging.info(f"Torch CUDA version: {torch.version.cuda}")
    logging.info(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    logging.info("No GPU found, training on CPU")

2025-11-10 10:47:54,320 - INFO - Torch CUDA available: True
2025-11-10 10:47:54,321 - INFO - Torch CUDA version: 12.1
2025-11-10 10:47:54,322 - INFO - GPU: NVIDIA GeForce RTX 4060 Ti


### Load and Setup Tokenizer

In [None]:
# Load and Setup Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, do_lower_case=False, normalization=True
)
tokenizer.demoizer = tokenizer.demojizer = lambda x: demojize(x, language="pt")

# Preprocessing Function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

NameError: name 'AutoTokenizer' is not defined

### Load Dataset

In [None]:
# Define dataset files
train_file = DATA_PATH + 'train.csv'
test_file = DATA_PATH +  'test.csv'

# Load dataset
dataset = load_dataset('csv', data_files={'train': train_file, 'test': test_file})

# Rename columns
dataset = dataset.rename_column("classificacao", "label")

# Tokenize dataset
tokenized = dataset.map(preprocess_function, batched=True)

### Load Model

In [None]:
# Load Model
model = AutoModelForSequenceClassification.from_pretrained(
  MODEL_NAME, 
  problem_type="multi_label_classification", 
  num_labels=2, 
  id2label={0: 'true', 1: 'fake'}, 
  label2id={'true': 0, 'fake': 1}
)

# Check if model is using GPU
if torch.cuda.is_available():
		logging.info(f"Model device: {next(model.parameters()).device}")

### Fine-tune Model

In [None]:
# Metrics Function
def compute_metrics(eval_pred):
    """Compute metrics for the evaluation"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    print(predictions, labels)

    clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    return clf_metrics.compute(predictions=predictions, references=labels)

# Training Arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_PATH,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    do_eval=True,
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,  # Enable automatic mixed precision.
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

# Evaluate model
trainer.evaluate()

# Save model
trainer.save_model(OUTPUT_PATH)

IndentationError: unindent does not match any outer indentation level (<string>, line 10)