# 01 - Generate Datasets

This notebook is responsible for generating two new datasets from an original dataset. The process consists of passing the original dataset through two tasks: claim extraction and claim normalization. The output of each task is then saved to be used in the next steps of the project.

### Imports

In [1]:
# Native
import os
import json
import logging

# Third-party
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

# Local
from shared.prompts import (
    CLAIM_EXTRACTION_SYSTEM_MESSAGE,
    CLAIM_NORMALIZATION_SYSTEM_MESSAGE,
    CLAIM_EXTRACTION_NORMALIZATION_SYSTEM_MESSAGE,
)
from shared.utils import move_file_to_directory

### Setup

In [2]:
# Logging: INFO level, each record rendered as "<time> - <level> - <message>"
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Load environment variables from a .env file (must run before the key is read below)
load_dotenv()

# OpenAI client, configured with the value of the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

### Constants

In [None]:
# --- Run configuration ---
DATASET = "fakebr"  # Dataset from which the new datasets are generated (claim extraction / claim normalization).
MODEL = "gpt-5-nano"  # Model used to generate the datasets.
DATASET_IMPORTANT_COLUMNS = ["text", "classificacao"]  # Columns to keep, excluding index/id columns (custom_id is added automatically).
PROCESS_TIME = "2025-11-06_17-58-17"  # Timestamp of a previous execution to resume. Set to None to start a new execution stamped with the current time.
_process_stamp = PROCESS_TIME or pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
PROCESS_ID = f"{MODEL}_{_process_stamp}"  # Identifies this execution; used as the leaf folder of every batch directory.

# Tasks executed by the dataset generation job (comment a line out to skip that task).
TASKS = [
    "claim_extraction",  # Extract claims from the original text.
    "claim_normalization",  # Normalize the claims.
    # "claim_extraction_normalization"  # Task to normalize previously extracted claims.
]

# --- Directory paths and files ---
DATASET_PATH = f"../data/{DATASET}"
ORIGINAL_DATASET_PATH = f"{DATASET_PATH}/original"
ORIGINAL_DATASET_FILES = [
    f"{ORIGINAL_DATASET_PATH}/{split_name}.csv" for split_name in ("train", "test")
]

# Every pipeline stage gets its own folder under a common root, namespaced by PROCESS_ID.
_BATCHES_ROOT = f"{DATASET_PATH}/batches/datasets_generation_jobs"
UNPROCESSED_BATCHES_DIR = f"{_BATCHES_ROOT}/unprocessed/{PROCESS_ID}"
UPLOADED_BATCHES_DIR = f"{_BATCHES_ROOT}/uploaded/{PROCESS_ID}"
PROCESSING_BATCHES_DIR = f"{_BATCHES_ROOT}/processing/{PROCESS_ID}"
PROCESSED_BATCHES_DIR = f"{_BATCHES_ROOT}/processed/{PROCESS_ID}"
RESULTS_BATCHES_DIR = f"{_BATCHES_ROOT}/results/{PROCESS_ID}"
FAILED_BATCHES_DIR = f"{_BATCHES_ROOT}/failed/{PROCESS_ID}"

# --- OpenAI batch processing parameters ---
COMPLETION_ENDPOINT = "/v1/chat/completions"
MAX_COMPLETION_TOKENS = None
TEMPERATURE = 1  # Obs: gpt-5-nano does not support temperature=0
VERBOSITY = "low"  # Options: "low", "medium", "high"
REASONING_EFFORT = "high"  # Options: "low", "medium", "high"
ROWS_PER_BATCH = 5000  # Dataset rows per batch file; each row yields one request per task in TASKS.

### Create Auxiliary Directories for Processing

In [12]:
# Create every stage directory of the batch pipeline up front (no-op for those that already exist)
for stage_dir in (
    UNPROCESSED_BATCHES_DIR,
    UPLOADED_BATCHES_DIR,
    PROCESSING_BATCHES_DIR,
    PROCESSED_BATCHES_DIR,
    RESULTS_BATCHES_DIR,
    FAILED_BATCHES_DIR,
):
    os.makedirs(stage_dir, exist_ok=True)

### Load Original Data

In [13]:
# Collect one DataFrame per original CSV and concatenate them once at the end
# (avoids repeatedly concatenating onto an initially empty DataFrame inside the loop).
split_frames = []

for file in ORIGINAL_DATASET_FILES:
    if not file.endswith(".csv"):
        continue

    df = pd.read_csv(file)

    # Row source is the file name without its extension, e.g. "train" or "test"
    row_source = os.path.splitext(os.path.basename(file))[0]

    # If the custom_id column does not exist, create it and persist it in the CSV so that
    # later executions (and the results-merging step) see the same identifiers.
    if "custom_id" not in df.columns:
        # <dataset>_<source>_<row position>, e.g. "fakebr_train_0"
        df["custom_id"] = DATASET + "_" + row_source + "_" + df.index.astype(str)

        # Put the custom_id column at the front
        df = df[["custom_id"] + [col for col in df.columns if col != "custom_id"]]

        # Save the updated dataframe back to the csv file (overwrites the original file)
        df.to_csv(file, index=False)

    # Add a source column to identify the origin of each row (not written back to the CSV)
    df["source"] = row_source
    split_frames.append(df)

dataset_df = (
    pd.concat(split_frames, ignore_index=True) if split_frames else pd.DataFrame()
)

# Keep only relevant columns, plus custom_id and source to keep track of the original rows
dataset_df = dataset_df[DATASET_IMPORTANT_COLUMNS + ["custom_id", "source"]]

# custom_id is later used to match results back to rows, so duplicates would be ambiguous
if not dataset_df["custom_id"].is_unique:
    logging.warning("Duplicate custom_id values found in the original dataset.")

# Display the first few rows of the concatenated dataset
logging.info(f"Total records: {len(dataset_df)}")
dataset_df.head()

2025-11-06 18:00:24,796 - INFO - Total records: 7200


Unnamed: 0,text,classificacao,custom_id,source
0,dia plateia domingao precisou atendimento medi...,fake,fakebr_train_0,train
1,unica ong df top epoca abrace ajuda mil pacien...,true,fakebr_train_1,train
2,renan parte pra guerra juizeco primeira instan...,fake,fakebr_train_2,train
3,presidente afastada dilma rousseff tentou cond...,true,fakebr_train_3,train
4,coreia norte critica sancoes disposta dialogar...,true,fakebr_train_4,train


### Create Tasks Jobs Batches

In [14]:
# Function to generate JSONL rows for a batch
def generate_batch_jsonl_rows(batch_df):
    """Build the Batch API request dicts for every row of ``batch_df``.

    For each row, one request is produced per task enabled in ``TASKS``, always in
    the order extraction -> normalization -> extraction+normalization.

    Args:
        batch_df: DataFrame slice with the columns ``custom_id``, ``text``,
            ``source`` and ``classificacao``.

    Returns:
        list[dict]: request dicts ready to be serialized as JSONL lines.
    """
    # (task name, custom_id suffix, developer prompt, cue that ends the user message)
    task_specs = [
        ("claim_extraction", "extr", CLAIM_EXTRACTION_SYSTEM_MESSAGE, "Declaração extraída:"),
        ("claim_normalization", "norm", CLAIM_NORMALIZATION_SYSTEM_MESSAGE, "Declaração normalizada:"),
        (
            "claim_extraction_normalization",
            "extrnorm",
            CLAIM_EXTRACTION_NORMALIZATION_SYSTEM_MESSAGE,
            "Declaração normalizada:",
        ),
    ]
    enabled_specs = [spec for spec in task_specs if spec[0] in TASKS]

    return [
        _build_batch_request(row, *spec)
        for _, row in batch_df.iterrows()
        for spec in enabled_specs
    ]


def _build_batch_request(row, task, id_suffix, system_message, answer_cue):
    """Build a single chat-completions request dict for ``task`` from one dataset row."""
    return {
        # The suffix lets the results step tell which task produced each response
        "custom_id": f"{row['custom_id']}_{id_suffix}",
        "method": "POST",
        "url": COMPLETION_ENDPOINT,
        "body": {
            "model": MODEL,
            "messages": [
                {"role": "developer", "content": system_message},
                {
                    "role": "user",
                    "content": f"Postagem: {row['text']}\n{answer_cue}",
                },
            ],
            "max_completion_tokens": MAX_COMPLETION_TOKENS,
            "metadata": {
                "custom_id": row["custom_id"],
                "source": row["source"],
                "classificacao": row["classificacao"],
                "task": task,
            },
            "verbosity": VERBOSITY,
            "reasoning_effort": REASONING_EFFORT,
            "temperature": TEMPERATURE,
            "store": True,
        },
    }


# Function to save JSONL file
def save_batch_jsonl_file(batch_jsonl_rows, batch_file_path):
    """Write ``batch_jsonl_rows`` to ``batch_file_path`` as JSONL (one JSON object per line).

    Any failure is logged and swallowed; the function always returns None.
    """
    try:
        with open(batch_file_path, "w") as out_file:
            # json.dumps emits valid JSON (double-quoted strings), one object per line
            out_file.writelines(json.dumps(entry) + "\n" for entry in batch_jsonl_rows)
    except Exception as e:
        logging.error(f"Error saving batch to {batch_file_path}: {e}")
    else:
        logging.info(f"Saved batch to {batch_file_path}")


# Function to upload file to OpenAI
def upload_batch_file_to_openai(batch_file_path):
    """Upload a batch JSONL file to OpenAI with purpose ``"batch"``.

    Args:
        batch_file_path: path of the JSONL file to upload.

    Returns:
        The uploaded file ID, or None if the file could not be opened or uploaded
        (the error is logged).
    """
    try:
        # Close the handle as soon as the upload finishes: the caller moves the
        # file right afterwards, and an open handle would otherwise be leaked.
        with open(batch_file_path, "rb") as batch_file:
            batch_uploaded_file = client.files.create(file=batch_file, purpose="batch")
        logging.info(f"Uploaded batch to OpenAI successfully! File ID: {batch_uploaded_file.id}")
        return batch_uploaded_file.id
    except Exception as e:
        logging.error(f"Error uploading batch to OpenAI: {e}")
        return None

# Function to create a batch in OpenAI
def create_openai_batch(batch_input_file_id):
    """Create an OpenAI batch job from a previously uploaded input file.

    Args:
        batch_input_file_id: ID of the uploaded JSONL file.

    Returns:
        The batch ID, or None if the batch could not be created (the error is logged).
    """
    try:
        batch_info = client.batches.create(
            input_file_id=batch_input_file_id,
            # Same endpoint constant that is written into every request line of the file
            endpoint=COMPLETION_ENDPOINT,
            completion_window="24h",
            metadata={"description": f"Batch created from file ID {batch_input_file_id}"},
        )
        logging.info(f"Created batch successfully! Batch ID: {batch_info.id}")
        return batch_info.id
    except Exception as e:
        logging.error(f"Error creating batch: {e}")
        return None

# Main batch processing loop: one JSONL file, one upload and one OpenAI batch per slice
# of ROWS_PER_BATCH dataset rows. The file is moved unprocessed -> uploaded -> processing
# as each step succeeds, so its directory shows how far it got.
# NOTE(review): every execution of this cell uploads all slices and creates new batches,
# also when PROCESS_TIME points at a previous run - confirm this is intended when resuming.
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")  # NOTE(review): not referenced in the visible cells (file names use PROCESS_ID)
current_batch = 1

for i in range(0, len(dataset_df), ROWS_PER_BATCH):
    # Generate JSONL rows for the current batch
    batch_df = dataset_df.iloc[i:i + ROWS_PER_BATCH]
    batch_jsonl_rows = generate_batch_jsonl_rows(batch_df)

    # Save the batch to a JSONL file
    batch_file_name = f"batch_{current_batch}_{PROCESS_ID}.jsonl"
    batch_file_path = f"{UNPROCESSED_BATCHES_DIR}/{batch_file_name}"
    save_batch_jsonl_file(batch_jsonl_rows, batch_file_path)

    # Upload the batch file to OpenAI
    batch_input_file_id = upload_batch_file_to_openai(batch_file_path)

    if batch_input_file_id:
        # Move the batch file to the uploaded directory
        uploaded_file_path = f"{UPLOADED_BATCHES_DIR}/batch_{current_batch}_file-id_{batch_input_file_id}.jsonl"
        move_file_to_directory(batch_file_path, uploaded_file_path)

        # Create the batch in OpenAI
        batch_id = create_openai_batch(batch_input_file_id)

        if batch_id:
            # Move the batch file to the processing directory.
            # The ID is stripped outside the f-string: reusing double quotes inside a
            # double-quoted f-string is a SyntaxError before Python 3.12.
            batch_id_suffix = batch_id.replace("batch_", "")
            processing_file_path = f"{PROCESSING_BATCHES_DIR}/batch_{current_batch}_id_{batch_id_suffix}.jsonl"
            move_file_to_directory(uploaded_file_path, processing_file_path)
        else:
            logging.warning(
                f"Batch {current_batch} was uploaded but no batch job was created; file left at {uploaded_file_path}"
            )
    else:
        logging.warning(
            f"Batch {current_batch} was not uploaded; file left at {batch_file_path}"
        )

    # Increment the batch counter
    current_batch += 1

2025-11-06 18:00:25,254 - INFO - Saved batch to ../data/fakebr/batches/datasets_generation_jobs/unprocessed/gpt-5-nano_2025-11-06_18-00-24/batch_1_gpt-5-nano_2025-11-06_18-00-24.jsonl
2025-11-06 18:01:23,882 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
2025-11-06 18:01:23,885 - INFO - Uploaded batch to OpenAI successfully! File ID: file-F1EudvyJsmhkyMKmKVrw3M
2025-11-06 18:01:23,886 - INFO - Moved file to ../data/fakebr/batches/datasets_generation_jobs/uploaded/gpt-5-nano_2025-11-06_18-00-24/batch_1_file-id_file-F1EudvyJsmhkyMKmKVrw3M.jsonl
2025-11-06 18:01:24,499 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
2025-11-06 18:01:24,500 - INFO - Created batch successfully! Batch ID: batch_690d0ca441388190a5ccf76c01a7a0bc
2025-11-06 18:01:24,501 - INFO - Moved file to ../data/fakebr/batches/datasets_generation_jobs/processing/gpt-5-nano_2025-11-06_18-00-24/batch_1_id_690d0ca441388190a5ccf76c01a7a0bc.jsonl
2025-11-06 18:01:24

### Check Batches Results

In [25]:
# Function to retrieve batch status from OpenAI
def retrieve_batch_status(batch_id):
    """Fetch a batch from OpenAI and return it once it has reached a final status.

    Args:
        batch_id: batch identifier without the ``batch_`` prefix (the prefix is
            added back before the API call).

    Returns:
        The batch object when its status is final (``completed``, ``failed``,
        ``expired`` or ``cancelled``); None while it is still in flight, when the
        status is not recognised, or when the request fails (the error is logged).
    """
    # "expired" and "cancelled" are final too: without them such a batch would be
    # polled forever and its file would never leave the processing directory.
    final_statuses = {"completed", "failed", "expired", "cancelled"}
    in_flight_statuses = {"created", "validating", "in_progress", "finalizing", "cancelling"}

    try:
        batch_object = client.batches.retrieve(f"batch_{batch_id}")
        batch_status = batch_object.status

        logging.info(f"Batch {batch_id} status: {batch_status}")

        if batch_status in final_statuses:
            return batch_object
        if batch_status not in in_flight_statuses:
            logging.warning(f"Batch {batch_id} has unexpected status: {batch_status}")
        return None
    except Exception as e:
        logging.error(f"Error retrieving batch {batch_id}: {e}")
        return None

# Function to process completed batches
def process_completed_batch(batch_id, batch_info, batch_processing_file):
    """Download the output/error files of a finished batch and archive its request file.

    Args:
        batch_id: batch identifier, used only in log messages.
        batch_info: batch object; ``output_file_id`` and ``error_file_id`` are read.
        batch_processing_file: name of the batch's request file inside
            ``PROCESSING_BATCHES_DIR``.

    The output file is saved under ``RESULTS_BATCHES_DIR`` and the error file under
    ``FAILED_BATCHES_DIR``. The request file is moved to ``PROCESSED_BATCHES_DIR``
    only when no download failed, so a failed download is retried on the next check.
    """
    error_occurred = False
    file_stem = os.path.splitext(batch_processing_file)[0]

    if batch_info.output_file_id:
        completed_file_path = f"{RESULTS_BATCHES_DIR}/{file_stem}_results.jsonl"
        try:
            _download_batch_file(batch_info.output_file_id, completed_file_path)
            logging.info(f"Saved results for batch {batch_id} to {completed_file_path}")
        except Exception as e:
            logging.error(f"Error processing completed batch {batch_id}: {e}")
            error_occurred = True

    if batch_info.error_file_id:
        failed_file_path = f"{FAILED_BATCHES_DIR}/{file_stem}_errors.jsonl"
        try:
            _download_batch_file(batch_info.error_file_id, failed_file_path)
            logging.info(f"Saved errors for batch {batch_id} to {failed_file_path}")
        except Exception as e:
            logging.error(f"Error processing failed {batch_id}: {e}")
            error_occurred = True

    if not batch_info.output_file_id and not batch_info.error_file_id:
        # Nothing to download: make it visible instead of archiving the batch silently
        logging.warning(
            f"Batch {batch_id} has neither an output file nor an error file "
            f"(status: {getattr(batch_info, 'status', 'unknown')})"
        )

    if not error_occurred:
        # Move the processing file to processed directory
        processed_file_path = f"{PROCESSED_BATCHES_DIR}/{batch_processing_file}"
        move_file_to_directory(
            f"{PROCESSING_BATCHES_DIR}/{batch_processing_file}", processed_file_path
        )
        logging.info(
            f"Moved processing file for batch {batch_id} to processed directory"
        )


def _download_batch_file(file_id, destination_path):
    """Download the content of an OpenAI file and write it, as bytes, to ``destination_path``."""
    file_response = client.files.content(file_id)
    with open(destination_path, "wb") as destination_file:  # binary write: content is stored untouched
        destination_file.write(file_response.read())

# Function to get processing batches
def get_processing_batches():
    """Return the file names currently in ``PROCESSING_BATCHES_DIR``.

    Returns an empty list (after logging why) when the directory is missing or empty.
    """
    if os.path.exists(PROCESSING_BATCHES_DIR):
        pending_files = os.listdir(PROCESSING_BATCHES_DIR)
        if pending_files:
            return pending_files
        logging.info("No batches are currently being processed.")
    else:
        logging.warning(f"Processing directory {PROCESSING_BATCHES_DIR} does not exist.")

    return []

# Main function to check on batches being processed
def check_batches_processing():
    """Poll every batch in the processing directory and collect those that finished.

    The batch ID is read from the file name (``batch_<n>_id_<batch id>.jsonl``).
    Files that do not follow that pattern are skipped with a warning. A failure on
    one batch is logged and does not stop the remaining ones.
    """
    # get_processing_batches() already logs when the directory is missing or empty,
    # so an empty list simply results in no iterations here.
    for batch_file in get_processing_batches():
        file_stem, extension = os.path.splitext(batch_file)
        if extension != ".jsonl" or "_id_" not in file_stem:
            logging.warning(f"Skipping unexpected file in processing directory: {batch_file}")
            continue

        try:
            batch_id = file_stem.split("_id_")[1]
            batch_info = retrieve_batch_status(batch_id)

            # None means the batch is still running (or could not be retrieved)
            if batch_info:
                process_completed_batch(batch_id, batch_info, batch_file)

        except Exception as e:
            logging.error(f"Error processing batch file {batch_file}: {e}")


# Call the batches processing check function
check_batches_processing()

2025-11-06 22:23:15,296 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_690d0c280b7c819090dee079ec9d69fd "HTTP/1.1 401 Unauthorized"
2025-11-06 22:23:15,297 - ERROR - Error retrieving batch 690d0c280b7c819090dee079ec9d69fd: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************kQcA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'code': 'invalid_api_key', 'param': None}}
2025-11-06 22:23:15,460 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_690d0c440c2081909b4438086270337d "HTTP/1.1 401 Unauthorized"
2025-11-06 22:23:15,461 - ERROR - Error retrieving batch 690d0c440c2081909b4438086270337d: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-*************************************************

### Save Tasks Datasets

In [None]:
# Suffix appended to custom_id when the requests were built -> task that produced the response
TASK_BY_SUFFIX = {
    "_extrnorm": "claim_extraction_normalization",
    "_extr": "claim_extraction",
    "_norm": "claim_normalization",
}
RESULT_COLUMNS = ["custom_id", "text", "classificacao", "source", "task"]


def _parse_result_line(line, classificacao_by_id):
    """Convert one line of a batch results file into a dataset row.

    Args:
        line: a single JSON document taken from a ``*_results.jsonl`` file.
        classificacao_by_id: mapping of original custom_id -> classificacao.

    Returns:
        A dict with the keys in ``RESULT_COLUMNS``, or None when the line has an
        unknown task suffix, no message content, or no matching original row.

    Raises:
        ValueError / AttributeError / TypeError: when the line is not a JSON
        object of the expected shape (handled by the caller, per line).
    """
    json_data = json.loads(line)
    request_custom_id = json_data.get("custom_id") or ""

    # Match on the suffix only, so the dataset name itself can never be mistaken for a task
    matched_suffix = next(
        (suffix for suffix in TASK_BY_SUFFIX if request_custom_id.endswith(suffix)), None
    )
    if matched_suffix is None:
        logging.warning(f"Unknown task for custom_id: {json_data.get('custom_id')}")
        return None

    original_custom_id = request_custom_id[: -len(matched_suffix)]

    # "response", "body", "choices" and "message" may each be missing or null
    response_body = (json_data.get("response") or {}).get("body") or {}
    choices = response_body.get("choices") or [{}]
    text = (choices[0].get("message") or {}).get("content")
    classificacao = classificacao_by_id.get(original_custom_id)

    # Skip rows without generated content or without a matching original row
    if text is None or classificacao is None:
        return None

    return {
        "custom_id": original_custom_id,
        "text": text,
        "classificacao": classificacao,
        "source": "train" if "train" in original_custom_id else "test",
        "task": TASK_BY_SUFFIX[matched_suffix],
    }


# Get all files from errors directory and log them
error_files = os.listdir(FAILED_BATCHES_DIR)
if error_files:
    logging.warning(
        f"Some batches failed. Check the {FAILED_BATCHES_DIR} directory for details."
    )
else:
    logging.info(f"All batches in process {PROCESS_ID} completed successfully.")

# Load original dataset for reference (single concat instead of growing a frame in a loop)
original_frames = [
    pd.read_csv(file) for file in ORIGINAL_DATASET_FILES if file.endswith(".csv")
]
original_dataset_df = (
    pd.concat(original_frames, ignore_index=True) if original_frames else pd.DataFrame()
)

# custom_id -> classificacao lookup, built once; the first occurrence of an id wins
classificacao_by_id = {}
if {"custom_id", "classificacao"} <= set(original_dataset_df.columns):
    for known_id, label in zip(
        original_dataset_df["custom_id"], original_dataset_df["classificacao"]
    ):
        classificacao_by_id.setdefault(known_id, label)
else:
    logging.error("Original dataset has no custom_id/classificacao columns; no result can be matched.")

# Get all result files (sorted so the row order of the output is reproducible)
result_files = sorted(
    name for name in os.listdir(RESULTS_BATCHES_DIR) if name.endswith(".jsonl")
)

if not result_files:
    logging.error(f"No result files found in {RESULTS_BATCHES_DIR}.")
else:
    # Load all batches into a single list of rows
    rows = []
    for result_file in result_files:
        result_file_path = os.path.join(RESULTS_BATCHES_DIR, result_file)
        logging.info(f"Reading results file: {result_file_path}")

        try:
            # The results were saved as raw bytes from the API, so decode them explicitly as UTF-8
            with open(result_file_path, "r", encoding="utf-8") as results_handle:
                for line_number, line in enumerate(results_handle, start=1):
                    if not line.strip():
                        continue
                    try:
                        row = _parse_result_line(line, classificacao_by_id)
                    except (ValueError, TypeError, AttributeError, IndexError, KeyError) as e:
                        # One bad line must not discard the rest of the file
                        logging.error(f"Skipping line {line_number} of {result_file_path}: {e}")
                        continue
                    if row is not None:
                        rows.append(row)
        except (OSError, UnicodeDecodeError) as e:
            logging.error(f"Error reading results file {result_file_path}: {e}")

    # Explicit columns keep the frame usable even when no row was collected
    all_results_df = pd.DataFrame(rows, columns=RESULT_COLUMNS)

    # Separate by task, then save train/test CSVs under <dataset>/<task>/<PROCESS_ID>/
    task_dfs = {}
    for task in TASK_BY_SUFFIX.values():
        task_df = all_results_df[all_results_df["task"] == task].drop(columns=["task"])
        task_dfs[task] = task_df

        task_output_dir = f"{DATASET_PATH}/{task}/{PROCESS_ID}"
        os.makedirs(task_output_dir, exist_ok=True)

        for split in ("train", "test"):
            split_df = task_df[task_df["source"] == split].drop(columns=["source"])
            split_df.to_csv(f"{task_output_dir}/{split}.csv", index=False)
            logging.info(f"Saved {len(split_df)} rows to {task_output_dir}/{split}.csv")

    # Per-task frames under the same names the cell previously defined
    extraction_df = task_dfs["claim_extraction"]
    normalization_df = task_dfs["claim_normalization"]
    extraction_normalization_df = task_dfs["claim_extraction_normalization"]



Reading results file: ../data/fakebr/batches/datasets_generation_jobs/results/gpt-5-nano_2025-11-06_18-00-24/batch_1_id_690d0ca441388190a5ccf76c01a7a0bc_results.jsonl
Reading results file: ../data/fakebr/batches/datasets_generation_jobs/results/gpt-5-nano_2025-11-06_18-00-24/batch_2_id_690d0cc1bf748190b986c065565d686b_results.jsonl
