In [15]:
import os
import re
import json
import time

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

## Load Dataset

In [None]:
def load_data(file_path: str) -> list:
    """
    Read a JSON dataset from disk and return the parsed content.

    Args:
        file_path (str): Location of the JSON file holding the dataset.

    Returns:
        list: The value produced by ``json.load`` for the file. The
            annotation indicates that a list of records is expected, but
            the parsed value is returned as-is without validation.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        dataset = json.load(handle)
    return dataset

# Path to the dataset JSON file.
# NOTE(review): " " is a blank placeholder; set it to a real path before
# running this cell, otherwise load_data() will try to open a file named " ".
DATA_FILE = " "

# Parse the dataset once at module level; later cells read `data` directly.
data = load_data(DATA_FILE)
print(f"Dataset Loaded: {len(data)} samples.")

## Load API Key & Initialize Client

In [None]:
# Load environment variables via python-dotenv, then read the API key from
# the OPENAI_API_KEY environment variable.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
# Shared client used by the batch submission and retrieval cells below.
client = OpenAI(api_key=api_key)
# NOTE(review): this message is printed unconditionally; it does not verify
# that a key was actually found or that it is valid.
print("API Key Loaded and Client Initialized.")

## Load Prompts

In [None]:
def load_prompts(file_path: str, category: str) -> dict:
    """
    Fetch the prompt templates stored under one category of a JSON file.

    Args:
        file_path (str): Location of the JSON file containing all prompts.
        category (str): Top-level key to select
            (e.g., 'eng_abbreviations', 'kor_abbreviations').

    Returns:
        dict: The value stored under ``category``, or an empty dict when the
            file has no such key.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        all_prompts = json.load(handle)

    # A missing category is not an error: the caller simply gets no prompts.
    selected = all_prompts.get(category, {})
    return selected

# Location of the prompts JSON file and the top-level category key to select.
# NOTE(review): both values are blank placeholders and must be filled in
# before running this cell.
PROMPT_FILE = " "  # path to the prompts JSON file
CATEGORY = " "  # e.g. 'eng_abbreviations' or 'kor_abbreviations' (see load_prompts docstring)

# Load the prompt templates for the chosen category; load_prompts returns an
# empty dict when the category key is absent, so the list printed below may
# be empty.
prompts = load_prompts(PROMPT_FILE, CATEGORY)
print(f"Loaded prompts for category '{CATEGORY}': {list(prompts.keys())}")

## Generate JSONL Files

In [None]:
def generate_jsonl_files(data: list, prompts: dict, output_dir: str, model: str = "gpt-4o"):
    """
    Generate one batch-request JSONL file per prompt type.

    For every entry in ``prompts`` a file ``<prompt_type>.jsonl`` is written
    to ``output_dir``. Each line is a single chat-completions request whose
    system message is the prompt text and whose user message is the
    ``"transformed"`` value of one dataset item.

    Args:
        data (list): Dataset items; each must provide a "transformed" key.
        prompts (dict): Mapping of prompt type to system prompt text.
        output_dir (str): Directory to save JSONL files (created if missing).
        model (str): Model name placed in every request body. Defaults to
            "gpt-4o", the value that was previously hard-coded.

    Raises:
        KeyError: If a dataset item has no "transformed" key.
    """
    os.makedirs(output_dir, exist_ok=True)

    for prompt_type, prompt_text in prompts.items():
        jsonl_path = os.path.join(output_dir, f"{prompt_type}.jsonl")
        # The system message is identical for every request of this prompt type.
        system_message = {"role": "system", "content": prompt_text}

        with open(jsonl_path, "w", encoding="utf-8") as outfile:
            # custom_id numbering is 1-based and follows the dataset order.
            for request_number, item in enumerate(data, start=1):
                request_data = {
                    "custom_id": f"{prompt_type}-request-{request_number}",
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": model,
                        "messages": [
                            system_message,
                            {"role": "user", "content": item["transformed"]},
                        ],
                    },
                }
                # ensure_ascii=False keeps non-ASCII input readable in the file.
                outfile.write(json.dumps(request_data, ensure_ascii=False) + "\n")
        print(f"JSONL file created: {jsonl_path}")

# Directory (relative to the working directory) that receives one
# <prompt_type>.jsonl request file per loaded prompt.
OUTPUT_DIR = "jsonl"

# Write the request files for every prompt type in `prompts`, using the
# dataset loaded earlier.
generate_jsonl_files(data, prompts, OUTPUT_DIR)
print("JSONL Files Generated.")

## Send Requests to API

In [None]:
def send_batch_requests(client, jsonl_files: list) -> dict:
    """
    Upload each JSONL file and create one batch job per file.

    Args:
        client: OpenAI API client.
        jsonl_files (list): List of JSONL file paths.

    Returns:
        dict: Mapping of JSONL file path to the ID of the batch created for it.
    """
    batch_ids = {}

    for jsonl_file in jsonl_files:
        # Open the file in a context manager so the handle is closed as soon
        # as the upload call returns, even if the upload raises.
        with open(jsonl_file, "rb") as upload_handle:
            batch_input_file = client.files.create(
                file=upload_handle,
                purpose="batch",
            )

        batch_response = client.batches.create(
            input_file_id=batch_input_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"description": f"Batch request for {os.path.basename(jsonl_file)}"},
        )

        batch_ids[jsonl_file] = batch_response.id
        print(f"Batch request sent for {jsonl_file}. Batch ID: {batch_response.id}")

    return batch_ids

# Rebuild the request file paths using the same <prompt_type>.jsonl naming
# that generate_jsonl_files writes into OUTPUT_DIR.
# NOTE: despite its name, the loop variable `category` iterates over the
# prompt-type keys of `prompts`, not over the CATEGORY constant.
jsonl_files = [os.path.join(OUTPUT_DIR, f"{category}.jsonl") for category in prompts.keys()]

# Upload every file and create one batch per file; `batch_ids` maps each
# JSONL path to the ID of its batch.
batch_ids = send_batch_requests(client, jsonl_files)
print(f"Batch Requests Sent: {batch_ids}")

## Retrieve Batch Results

In [None]:
def fetch_batch_results(client, batch_ids: dict):
    """
    Download the output of every batch that has finished.

    Each batch is looked up once. Batches that are not in the "completed"
    state, or that have no output file, are reported and skipped. Download
    errors are reported and skipped as well, so the call is best-effort.

    Args:
        client: OpenAI API client.
        batch_ids (dict): Mapping of JSONL filenames to batch IDs.

    Returns:
        dict: Mapping of JSONL filenames to the raw text of the batch output
            file, containing only the batches that were downloaded.
    """
    collected = {}

    for source_file, bid in batch_ids.items():
        batch = client.batches.retrieve(bid)
        state = batch.status
        out_file_id = batch.output_file_id

        print(f"Checking batch status for {source_file}: {state}")

        # Guard clause: nothing to download unless the batch finished and
        # produced an output file.
        if state != "completed" or not out_file_id:
            print(f"Batch {bid} is not completed or has no output file.")
            continue

        try:
            payload = client.files.content(out_file_id)
            collected[source_file] = payload.text
            print(f"Results retrieved for {source_file}")
        except Exception as err:
            # Deliberately best-effort: one failed download must not stop the
            # remaining batches from being fetched.
            print(f"Error fetching results for {source_file}: {err}")

    return collected

# Check every submitted batch once and download the output of those that are
# completed; batches still in progress are skipped, not waited for.
# NOTE(review): the message below is printed even when `results` is empty
# because no batch had finished yet — re-run this cell later in that case.
results = fetch_batch_results(client, batch_ids)
print("Batch Results Retrieved.")

## Save Results to JSON

In [None]:
# Extract Answer Based on Category
def extract_answer(response_text, category):
    """
    Pull the answer portion out of a model response.

    Categories starting with "eng_" are searched for an "Answer:" marker;
    every other category is searched for the Korean "답변:" marker. The text
    after the first marker, up to the end of that line, is the answer.

    Args:
        response_text (str): Full response text from GPT4o API.
        category (str): Prompt category (e.g., "eng_abbreviations" or "kor_abbreviations").

    Returns:
        str: The stripped answer, or the unchanged response text when the
            marker does not occur.
    """
    pattern = r"Answer:\s*(.*)" if category.startswith("eng_") else r"답변:\s*(.*)"
    found = re.search(pattern, response_text)

    if found is None:
        # No marker present: fall back to the whole response.
        return response_text
    return found.group(1).strip()

# Map Custom ID to Original Input
def map_custom_id_to_input(data):
    """
    Build a lookup from request identifier to the input text it carried.

    Keys have the form "request-<n>", where <n> is the 1-based position of
    the item in ``data``.

    Args:
        data (list): Input items, each providing a "transformed" value.

    Returns:
        dict: Mapping of "request-<n>" to the item's "transformed" value.
    """
    lookup = {}
    for position, record in enumerate(data, start=1):
        lookup[f"request-{position}"] = record["transformed"]
    return lookup

# Save Processed Results
def _extract_message_content(entry: dict) -> str:
    """Return the assistant message text of one batch output record, or "" if absent."""
    # Failed requests may carry a null response or a body without "choices",
    # and a message may have null content; all of these map to "".
    response = entry.get("response") or {}
    body = response.get("body") or {}
    choices = body.get("choices") or []
    if not choices:
        return ""
    message = choices[0].get("message") or {}
    return message.get("content") or ""


def save_results(results: dict, output_dir: str, data: list, category: str):
    """
    Post-process raw batch outputs and save one JSON file per batch.

    Every non-blank line of a batch output is parsed as one JSON record. For
    each record the original input is looked up from its custom_id, the
    answer is extracted from the response text, and a dictionary with the
    keys "word", "response" and "answer" is collected. The collected list is
    written to ``<output_dir>/<jsonl file name>.json``.

    Args:
        results (dict): Mapping of JSONL file path to the raw text of the
            corresponding batch output file (one JSON record per line).
        output_dir (str): Directory to write the JSON files to (created if
            missing).
        data (list): Dataset used to build the requests; each item must
            provide a "transformed" key.
        category (str): Prompt category, forwarded to ``extract_answer`` to
            choose the answer marker.

    Raises:
        json.JSONDecodeError: If a non-blank output line is not valid JSON.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Create mapping from "request-<n>" to the original input word
    id_to_input_map = map_custom_id_to_input(data)

    for jsonl_file, response_text in results.items():
        processed_data = []

        for line in response_text.split("\n"):
            # Skip blank lines (e.g. an empty output file or a trailing
            # newline) instead of failing in json.loads.
            if not line.strip():
                continue
            entry = json.loads(line)

            response_content = _extract_message_content(entry)

            # custom_id has the form "<prompt_type>-request-<n>". Splitting at
            # the LAST "-request-" stays correct even when the prompt type
            # itself contains "-request" or is empty.
            custom_id = entry.get("custom_id") or ""
            _, separator, request_number = custom_id.rpartition("-request-")
            if separator:
                input_text = id_to_input_map.get(f"request-{request_number}", "Unknown")
            else:
                input_text = "Unknown"

            # Extract answer using appropriate format
            extracted_answer = extract_answer(response_content, category)

            processed_data.append({
                "word": input_text,  # Store original transformed word
                "response": response_content,  # Store full response
                "answer": extracted_answer,  # Store extracted answer
            })

        # Save results as JSON; the name keeps the ".jsonl" part of the input
        # file name (e.g. "zero_shot.jsonl.json").
        json_output_path = os.path.join(output_dir, f"{os.path.basename(jsonl_file)}.json")
        with open(json_output_path, "w", encoding="utf-8") as file:
            json.dump(processed_data, file, ensure_ascii=False, indent=4)

        print(f"Results saved to {json_output_path}")

# Save results
# Directory (relative to the working directory) that receives one JSON file
# per retrieved batch.
RESULTS_DIR = "results"
# Post-process every retrieved batch output; CATEGORY is passed through so
# the answer extraction can choose its marker.
save_results(results, RESULTS_DIR, data, CATEGORY)
print("Results Saved Successfully!")