In [1]:
from openai import OpenAI
from dotenv import load_dotenv
from generated_prompt import prompt_template
import json
import time
import os

# Populate the process environment from a local .env file before the client is
# built; the .env file is expected to supply the OpenAI key
# (presumably OPENAI_API_KEY -- verify against the .env in use).
load_dotenv()
# Module-level OpenAI client used by generate_batch(); no credentials are passed explicitly here.
client = OpenAI()

In [24]:
def ensure_data_folder(folder="data"):
    """Create ``folder`` (and any missing parent directories) and return it.

    Args:
        folder: Directory path to create. An empty path is treated as "the
            current working directory" -- this is what ``os.path.dirname``
            yields for a bare filename -- so nothing is created for it.

    Returns:
        The ``folder`` argument, unchanged.
    """
    # os.makedirs("") raises FileNotFoundError, so an empty path must be skipped.
    if folder:
        os.makedirs(folder, exist_ok=True)
    return folder

def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if len(stripped) < 6 or not (stripped.startswith("```") and stripped.endswith("```")):
        return text
    inner = stripped[3:-3]
    first_line, newline, rest = inner.partition("\n")
    # Drop the opening fence line when it is empty or only a language tag such as "json".
    if newline and (not first_line.strip() or first_line.strip().isalnum()):
        inner = rest
    return inner.strip()


def _as_record_list(parsed):
    """Normalize a decoded JSON value into a list of records."""
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict) and "data" in parsed:
        data = parsed["data"]
        # Wrap a non-list payload so callers can always extend() with the result.
        return data if isinstance(data, list) else [data]
    return [parsed]


def generate_batch(start: int, end: int, schema_prompt_template: str):
    """Request one batch from the model and return its records as a list.

    The prompt is sent as a single user message through the module-level
    ``client``. No ``response_format`` is passed, so the reply is handled as
    free-form text that should contain JSON.

    Args:
        start: First item number of the batch; used only in warning messages.
        end: Last item number of the batch; used only in warning messages.
        schema_prompt_template: Full prompt text sent to the model.

    Returns:
        Always a list:
        * a JSON array reply is returned as-is;
        * a JSON object with a ``"data"`` key yields that value (wrapped in a
          list when it is not one already);
        * any other JSON value is wrapped in a one-element list;
        * a reply wrapped in a Markdown code fence is parsed after removing
          the fence;
        * text that still cannot be parsed, or a non-string reply, is wrapped
          unchanged in a one-element list after printing a warning.
    """
    completion = client.chat.completions.create(
        model="gpt-5",
        messages=[{"role": "user", "content": schema_prompt_template}],
    )
    batch_answers = completion.choices[0].message.content

    # Already a list (rare)
    if isinstance(batch_answers, list):
        return batch_answers

    # Other unexpected format (e.g. no text content at all)
    if not isinstance(batch_answers, str):
        print(f"Warning: batch {start}-{end} returned unexpected format, wrapping in list")
        return [batch_answers]

    # Try the raw text first so replies that parsed before behave exactly the
    # same; fall back to the text with a surrounding code fence removed.
    for candidate in (batch_answers, _strip_code_fence(batch_answers)):
        try:
            return _as_record_list(json.loads(candidate))
        except json.JSONDecodeError:
            continue

    print(f"Warning: batch {start}-{end} could not be parsed as JSON, wrapping in list")
    return [batch_answers]

def generate_all_answers(total_answers: int, batch_size: int, schema_prompt_template: str):
    """Collect answers batch by batch and return them as one flat list.

    Items are numbered from 1 to ``total_answers`` and requested in windows of
    ``batch_size``; the final window is clipped to ``total_answers``. The same
    prompt text is passed to ``generate_batch`` for every window.
    """
    collected = []
    for first in range(1, total_answers + 1, batch_size):
        # Clip the last window so it never runs past total_answers.
        last = first + batch_size - 1
        if last > total_answers:
            last = total_answers
        collected += generate_batch(first, last, schema_prompt_template)
        print(f"Batch {first}-{last} done, total collected: {len(collected)}")
    return collected



def save_answers_to_file(answers, filename="data/answers.json"):
    """Append ``answers`` to the JSON list stored in ``filename``.

    The file is created (together with its parent folder) when missing. If the
    existing file does not hold a JSON list, or is not valid JSON, a warning is
    printed and its content is replaced by ``answers`` alone.

    Args:
        answers: List of JSON-serializable records to append.
        filename: Target JSON file; may be a bare filename with no directory.

    Raises:
        TypeError: If an answer is not JSON-serializable. The existing file is
            left untouched in that case.
    """
    # A bare filename has an empty dirname; there is no folder to create then.
    folder = os.path.dirname(filename)
    if folder:
        ensure_data_folder(folder)

    # Load existing answers if file exists
    existing_answers = []
    if os.path.exists(filename):
        try:
            with open(filename, "r", encoding="utf-8") as f:
                existing_answers = json.load(f)
            if not isinstance(existing_answers, list):
                print(f"Warning: {filename} is not a list. Overwriting file.")
                existing_answers = []
        except json.JSONDecodeError:
            print(f"Warning: {filename} is invalid JSON. Overwriting file.")
            existing_answers = []

    # Append new answers
    combined_answers = existing_answers + answers

    # Serialize BEFORE opening the file for writing: opening with "w" truncates
    # it immediately, so a serialization error would otherwise destroy the
    # answers that were already saved.
    payload = json.dumps(combined_answers, ensure_ascii=False, indent=2)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(payload)

    print(f"Saved {len(answers)} new answers. Total answers now: {len(combined_answers)} to {filename}")


def is_valid_json_file(filename: str) -> bool:
    """
    Check whether ``filename`` can be read as UTF-8 and parsed as JSON.

    Returns True if the file parses, False otherwise. A missing file, a
    directory, an unreadable path, bytes that are not valid UTF-8, and
    malformed JSON all count as "not valid": the reason is printed and
    False is returned instead of raising.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            json.load(f)
    # OSError covers FileNotFoundError as well as directories and permission
    # problems; UnicodeDecodeError covers files that are not UTF-8 text.
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
        print(f"Invalid JSON or file error: {e}")
        return False
    return True



if __name__ == "__main__":
    # Run configuration: how many answers to collect, how many per request,
    # and the JSON file the results are appended to.
    batch_size = 2
    total_answers = 2
    filename = "data/synthetic_database.json"

    # Build the prompt for one batch (prompt_template comes from generated_prompt),
    # generate every batch, then append the results to the dataset file.
    schema_prompt_template = prompt_template(batch_size)
    all_answers = generate_all_answers(total_answers, batch_size, schema_prompt_template)
    save_answers_to_file(all_answers, filename)

    # Sanity check: confirm the file that was just written still parses as JSON.
    if not is_valid_json_file(filename):
        print(f"{filename} is NOT valid JSON")
    else:
        print(f"{filename} is valid JSON")
    


Batch 1-2 done, total collected: 2
Saved 2 new answers. Total answers now: 4 to data/synthetic_database.json
data/synthetic_database.json is valid JSON


In [2]:
# Report how many top-level elements the generated dataset file contains.
filename = "data/synthetic_database.json"

try:
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"{filename} does not exist.")
except json.JSONDecodeError:
    print(f"{filename} is not valid JSON.")
else:
    # Only reached when the file was read and parsed successfully.
    if not isinstance(data, list):
        print("Warning: JSON is not a list. Counting as 1 element.")
        print(f"The file {filename} has 1 element.")
    else:
        print(f"The file {filename} has {len(data)} elements.")

The file data/synthetic_database.json has 792 elements.
