# 02 - Fact Checking

This notebook performs the fact-checking task on the claims that were extracted and normalized in the previous notebook. It loads the previously generated datasets, builds batches of fact-checking jobs, and processes those jobs so that the LLM can classify each claim as true or false.

### Imports

In [None]:
# Native
import os
import json
import logging

# Third-party
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

# Local
from shared.prompts import (
    FACT_CHECKING_SYSTEM_MESSAGE,
)
from shared.utils import move_file_to_directory

### Setup

In [None]:
# Logging: show INFO and above, each record prefixed with timestamp and level
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Read variables from a .env file so the API key lookup below can find it
load_dotenv()

# OpenAI client, authenticated with the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

### Constants

In [None]:
# Dataset configuration
DATASET = "faketweetbr"  # Dataset name; also the folder name under ../data
MODEL = "gpt-5"  # Model selected for this run; also used as the PROCESS_ID prefix
# All important columns in the dataset except index/id columns
# (a custom id will be added automatically)
DATASET_IMPORTANT_COLUMNS = ["text", "classificacao"]
# Set this to the timestamp of a previous execution to resume its processing.
# When left as None, the current timestamp is used instead.
PROCESS_TIME = None
PROCESS_ID = f"{MODEL}_{PROCESS_TIME or pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')}"

# Directory paths (one batches sub-folder per processing stage, scoped by PROCESS_ID)
DATASET_PATH = "../data/" + DATASET
ORIGINAL_DATASET_PATH = f"{DATASET_PATH}/original"
UNPROCESSED_BATCHES_DIR = DATASET_PATH + "/batches/unprocessed/" + PROCESS_ID
UPLOADED_BATCHES_DIR = DATASET_PATH + "/batches/uploaded/" + PROCESS_ID
PROCESSING_BATCHES_DIR = DATASET_PATH + "/batches/processing/" + PROCESS_ID
PROCESSED_BATCHES_DIR = DATASET_PATH + "/batches/processed/" + PROCESS_ID
RESULTS_BATCHES_DIR = DATASET_PATH + "/batches/results/" + PROCESS_ID
FAILED_BATCHES_DIR = DATASET_PATH + "/batches/failed/" + PROCESS_ID

# OpenAI batch processing parameters
COMPLETION_ENDPOINT = "/v1/chat/completions"
MAX_COMPLETION_TOKENS = None
TEMPERATURE = 1  # Note: gpt-5-nano does not support temperature=0
VERBOSITY = "low"  # One of: "low", "medium", "high"
REASONING_EFFORT = "high"  # One of: "low", "medium", "high"
# Number of dataset rows placed in each batch.
# NOTE(review): the earlier comment said each row produces two calls (claim
# extraction + claim normalization); confirm whether that still applies to
# the fact-checking task in this notebook.
ROWS_PER_BATCH = 5000

### Create Auxiliary Directories for Processing

In [None]:
# Create every batch-stage directory up front; directories that already
# exist are left untouched (exist_ok=True).
for _batch_dir in (
    UNPROCESSED_BATCHES_DIR,
    UPLOADED_BATCHES_DIR,
    PROCESSING_BATCHES_DIR,
    PROCESSED_BATCHES_DIR,
    RESULTS_BATCHES_DIR,
    FAILED_BATCHES_DIR,
):
    os.makedirs(_batch_dir, exist_ok=True)

### Load Claim Extraction and Normalization Tasks Datasets

### Build Fact Checking Jobs Batches

### Create Fact Checking Jobs

### Check Fact Checking Jobs Progression

### Save Fact Checking Jobs Results