In [None]:
import openai
import json
import os
import pandas as pd
import time
import random # Added for random sampling if needed, though pandas.sample handles it

# --------------------
# 0) Configuration & Constants
# --------------------
# Read the key from its dedicated environment variable and stop immediately
# when it is unset or empty, so no request is attempted without credentials.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY_POPSYNTH')
if not OPENAI_API_KEY:
    raise ValueError("Please set the environment variable OPENAI_API_KEY_POPSYNTH")
# Client object used for the chat-completion request further down.
openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Survey sample (CSV) that few-shot examples are drawn from.
SOURCE_CSV_PATH = "data/h_sample-PE.csv"

# --- Codebook lookups: survey code -> human-readable label ---
GENDER_MAP = {
    1: "Male",
    2: "Female",
}
HOMEINCOME_MAP = {
    1: "less than 1 million Won",
    2: "between 1 million Won and 3 million Won",
    3: "between 3 million Won and 5 million Won",
    4: "between 5 million Won and 10 million Won",
    5: "more than 10 million Won",
}
HOMETYPE_MAP = {
    1: "Apartment",
    2: "Villa",
    3: "Multi-family house",
    4: "Single-family house",
    5: "Studio-type residence",
    6: "Other",
}
# The three yes/no fields share the same coding but stay separate dict objects.
CAROWN_MAP = {
    1: "Yes",
    2: "No",
}
DRIVER_MAP = {
    1: "Yes",
    2: "No",
}
WORKDAYS_MAP = {
    1: "5 days",
    2: "6 days",
    3: "1-4 days",
    4: "Inoccupation/non-regular",
}
WORKTYPE_MAP = {
    1: "Student",
    2: "Inoccupation/Housewife",
    3: "Experts",
    4: "Service",
    5: "Sales",
    6: "Manager/Office",
    7: "Agriculture and fisher",
    8: "Simple labor",
    9: "Others",
}
STUDENT_MAP = {
    1: "Elementary/Middle/High School",
    2: "Pre-school Child",
    3: "University",
    4: "Not a Student",
}
KIDINHH_MAP = {
    1: "Yes",
    2: "No",
}
# Travel modes are already stored as text, so every label maps to itself.
COMMODE_MAP = {
    mode: mode
    for mode in (
        "Car",
        "Public Transportation",
        "None(did not travel)",
        "Walking",
        "Bike/Bicycle",
        "Taxi",
    )
}
COMTIME_MAP = {
    "Peak": "Peak (05:00–8:00)",
    "Non-Peak": "Non-Peak (09:00–12:00)",
    "Other": "Other than peak/non-peak",
    "None(did not travel)": "None(did not travel)",
}

# Column names and order used for the output CSV - adjust if your CSV has
# different names.
EXPECTED_COLUMNS = [
    "Age",
    "Gender",
    "Homeincome",
    "Hometype",
    "CarOwn",
    "Driver",
    "Workdays",
    "Worktype",
    "Student",
    "NumHH",
    "KidinHH",
    "ComMode",
    "ComTime",
]

# --------------------
# 1) Function Schema Definition
# --------------------
# Keys every generated profile object must contain.
_PROFILE_REQUIRED_FIELDS = [
    "Gender", "Age", "Homeincome", "Hometype",
    "CarOwn", "Driver", "Workdays", "Worktype",
    "Student", "NumHH", "KidinHH", "ComMode", "ComTime",
]

# JSON-Schema description of each profile attribute (type, allowed values,
# and a short codebook reminder passed to the model).
_PROFILE_PROPERTIES = {
    "Gender": {
        "type": "integer",
        "enum": [1, 2],
        "description": "1: Male, 2: Female",
    },
    "Age": {
        "type": "string",
        "pattern": "^(?:\\[\\d+,\\d+\\)|\\[85,90\\])$",
        "description": "Reported `Age Group` [start,end) format, 5-year increments. Final range [85,90].",
    },
    "Homeincome": {
        "type": "integer",
        "enum": [1, 2, 3, 4, 5],
        "description": "Household monthly Income level (KRW). 1: <1M, 2: 1-3M, 3: 3-5M, 4: 5-10M, 5: >10M",
    },
    "Hometype": {
        "type": "integer",
        "enum": [1, 2, 3, 4, 5, 6],
        "description": "Home Type. 1: Apt, 2: Villa, 3: Multi-family, 4: Single-family, 5: Studio, 6: Other",
    },
    "CarOwn": {
        "type": "integer",
        "enum": [1, 2],
        "description": "Household Car Ownership: 1: Yes, 2: No",
    },
    "Driver": {
        "type": "integer",
        "enum": [1, 2],
        "description": "Has Driver License: 1: Yes, 2: No",
    },
    "Workdays": {
        "type": "integer",
        "enum": [1, 2, 3, 4],
        "description": "Working Days/week. 1: 5days, 2: 6days, 3: 1-4days, 4: Inoccupation/non-regular",
    },
    "Worktype": {
        "type": "integer",
        "enum": [1, 2, 3, 4, 5, 6, 7, 8, 9],
        "description": "Working Type. 1: Student, 2: Inoccupation/Housewife, 3: Experts, 4: Service, 5: Sales, 6: Manager/Office, 7: Agri/fisher, 8: Simple labor, 9: Others",
    },
    "Student": {
        "type": "integer",
        "enum": [1, 2, 3, 4],
        "description": "Education Status. 1: Elem/Mid/High, 2: Pre-school, 3: University, 4: Not Student",
    },
    "NumHH": {
        "type": "integer",
        "minimum": 1,
        "maximum": 7,
        "description": "Number of Household Members (1–7)",
    },
    "KidinHH": {
        "type": "integer",
        "enum": [1, 2],
        "description": "Presence of Kid in Household: 1: Yes, 2: No",
    },
    "ComMode": {
        "type": "string",
        "enum": ["Car", "Public Transportation", "None(did not travel)", "Walking", "Bike/Bicycle", "Taxi"],
        "description": "Major Travel Mode for 'Regular Travel' (commute, school, etc.)",
    },
    "ComTime": {
        "type": "string",
        "enum": ["Peak", "Non-Peak", "Other", "None(did not travel)"],
        "description": "Major Departure Time for 'Regular Travel'. Peak(5-8), Non-Peak(9-12), Other, None.",
    },
}

# Tool definition handed to the chat-completions API: one function whose only
# argument is an array of profile objects.
FUNCTION_SCHEMA = {
    "name": "generate_synthetic_population",
    "description": "Generates synthetic population profiles of South Korea following codebook definitions",
    "parameters": {
        "type": "object",
        "required": ["profiles"],
        "properties": {
            "profiles": {
                "type": "array",
                "items": {
                    "type": "object",
                    "required": _PROFILE_REQUIRED_FIELDS,
                    "properties": _PROFILE_PROPERTIES,
                    "additionalProperties": False,
                },
            },
        },
        "additionalProperties": False,
    },
}

# --------------------
# 2) Helper Functions for Few-Shot Example Generation
# --------------------
def convert_row_to_sentence(row: pd.Series) -> str:
    """
    Describe one survey respondent as a single natural-language sentence.

    Integer-coded columns are cast with ``int`` and translated through the
    module-level codebook dictionaries; ``ComMode`` and ``ComTime`` are looked
    up by their string form; ``Age`` and ``NumHH`` are inserted as
    ``str(value)``. A code that is absent from its dictionary is replaced by
    an "Unknown ..." label.

    Returns:
        The sentence, or a string starting with "Error processing example"
        when a column is missing or a value cannot be converted (a warning
        containing the row is printed in that case).
    """
    def label_for_code(column, labels, fallback):
        # Integer-coded field: cast first, then translate.
        return labels.get(int(row[column]), fallback)

    def label_for_text(column, labels, fallback):
        # Text-coded field: the string form of the cell is the lookup key.
        return labels.get(str(row[column]), fallback)

    try:
        # Columns are read in a fixed order so that, when several are broken,
        # the one reported in the warning is always the same.
        age = str(row["Age"])
        gender = label_for_code("Gender", GENDER_MAP, "Unknown Gender")
        income = label_for_code("Homeincome", HOMEINCOME_MAP, "Unknown Income")
        home = label_for_code("Hometype", HOMETYPE_MAP, "Unknown Home Type")
        car = label_for_code("CarOwn", CAROWN_MAP, "Unknown Car Ownership")
        licence = label_for_code("Driver", DRIVER_MAP, "Unknown Driver Status")
        workdays = label_for_code("Workdays", WORKDAYS_MAP, "Unknown Workdays")
        worktype = label_for_code("Worktype", WORKTYPE_MAP, "Unknown Worktype")
        student = label_for_code("Student", STUDENT_MAP, "Unknown Student Status")
        household_size = str(row["NumHH"])
        kid = label_for_code("KidinHH", KIDINHH_MAP, "Unknown Kid Status")
        mode = label_for_text("ComMode", COMMODE_MAP, "Unknown Commute Mode")
        departure = label_for_text("ComTime", COMTIME_MAP, "Unknown Commute Time")
    except (KeyError, ValueError, TypeError) as e:
        print(f"--- Warning: Error converting row to sentence: {e}. Row data: {row.to_dict()}")
        # The caller recognises this prefix and drops the example.
        return f"Error processing example: {e}"

    # Three thematic groups (individual, household, travel); clauses inside a
    # group are separated by ", " and the groups themselves by ", \n".
    individual = ", ".join((
        f"Gender is {gender}",
        f"The Respondent's Reported Age Group is {age}",
        f"Education enrollment status is {student}",
        f"Working Type is {worktype}",
        f"Usual Working Days per week is {workdays}",
        f"Driver License status is {licence}",
    ))
    household = ", ".join((
        f"Monthly Household Income is {income}",
        f"The number of Household Members is {household_size}",
        f"The presence of a Kid in the Household is {kid}",
        f"Home Type is {home}",
        f"The Household Car Ownership is {car}",
    ))
    travel = ", ".join((
        f"Major Travel Mode of the Respondent's Regular Travel is {mode}",
        f"Major Departure Time of the Respondent's Regular Travel is {departure}",
    ))
    return ", \n".join((individual, household, travel)) + "."


def generate_dynamic_few_shot_examples(source_df: pd.DataFrame, num_examples: int) -> str:
    """
    Build the few-shot section of the prompt from randomly sampled survey rows.

    Rows are sampled without a fixed random state, so each call yields a
    different selection. Every sampled row is converted with
    ``convert_row_to_sentence``; rows whose conversion failed are dropped.

    Args:
        source_df: Survey data to sample from.
        num_examples: Number of rows to sample. Capped at ``len(source_df)``
            (with a printed warning); values <= 0 yield an empty string.

    Returns:
        The examples joined by newlines, each formatted as
        "-----\\nExample N:\\n<sentence>", or "" when nothing could be produced.
    """
    if num_examples <= 0:
        return ""

    available = len(source_df)
    if num_examples > available:
        print(f"--- Warning: Requested {num_examples} examples, but only {available} available in source data. Using all available.")
        num_examples = available

    # Randomly sample WITHOUT a fixed random_state for variability.
    sampled_df = source_df.sample(n=num_examples)

    # Keep only the rows that converted successfully.
    sentences = []
    for _, row in sampled_df.iterrows():
        sentence = convert_row_to_sentence(row)
        if not sentence.startswith("Error processing example"):
            sentences.append(sentence)

    # Number the examples AFTER filtering so the labels stay consecutive
    # (1, 2, 3, ...) even when some sampled rows were skipped.
    return "\n".join(
        f"-----\nExample {number}:\n{sentence}"
        for number, sentence in enumerate(sentences, 1)
    )

# --------------------
# 3) JSON Salvaging Function
# --------------------
def salvage_profiles_from_incomplete_json(arguments_str, required_fields=None):
    """
    Extract every complete profile object from a possibly truncated JSON string.

    The text is scanned for "{" characters and a real JSON decode is attempted
    at each one. This makes the salvage independent of whitespace around the
    "profiles" key, of whether the wrapping object/array was ever closed, and
    of braces that appear inside string values.

    Args:
        arguments_str: Raw function-call arguments returned by the model.
            Anything that is not a non-empty string yields an empty list.
        required_fields: Keys an object must contain to count as a profile.
            Defaults to the "required" list of the profile items in
            ``FUNCTION_SCHEMA``.

    Returns:
        A list of dicts (possibly empty), in the order they appear in the text.
    """
    if not isinstance(arguments_str, str) or not arguments_str:
        return []

    if required_fields is None:
        # Resolved once per call rather than once per candidate object.
        required_fields = FUNCTION_SCHEMA["parameters"]["properties"]["profiles"]["items"]["required"]
    required = tuple(required_fields)

    decoder = json.JSONDecoder()
    profiles = []

    pos = arguments_str.find("{")
    while pos != -1:
        try:
            candidate, end = decoder.raw_decode(arguments_str, pos)
        except json.JSONDecodeError:
            # Truncated or malformed object (typically the unterminated
            # wrapper or the cut-off last profile): look for the next "{".
            next_start = pos + 1
        else:
            if isinstance(candidate, dict) and all(field in candidate for field in required):
                profiles.append(candidate)
                # Continue after this object so its contents are not rescanned.
                next_start = end
            else:
                # A complete object that is not itself a profile (e.g. the
                # {"profiles": [...]} wrapper): search inside it instead.
                next_start = pos + 1
        pos = arguments_str.find("{", next_start)

    return profiles


# --------------------
# 4) Synthetic Population API Call Function
# --------------------
def call_gpt_function_call_batch(batch_size: int, source_df: pd.DataFrame, num_examples_to_generate: int):
    """
    Generates dynamic few-shot examples and calls the GPT-4 function calling API
    to create a batch of synthetic population profiles.

    Args:
        batch_size: Number of profiles to request in this batch.
        source_df: DataFrame containing the source data for sampling examples.
        num_examples_to_generate: How many few-shot examples to sample and generate.

    Returns:
        A dict with a "profiles" list: the fully parsed function arguments, or
        ``{"profiles": [...]}`` built from objects salvaged out of arguments
        that could not be used as-is. ``None`` when the API call fails, the
        response carries no usable function call, or nothing can be salvaged.
    """

    # ** Dynamically generate few-shot examples for this specific call **
    print(f"--- Generating {num_examples_to_generate} dynamic few-shot examples for this batch...")
    few_shot_examples = generate_dynamic_few_shot_examples(source_df, num_examples_to_generate)
    if not few_shot_examples:
        print("--- Warning: No few-shot examples generated. Proceeding without them.")

    # Construct the system message using the dynamically generated examples
    system_message_content = f"""
# Your Role:
You are an experienced demographic data scientist specialized in generating synthetic population data representative of South Korean demographics.

# Your mission:
- Produce a JSON function call to the function "generate_synthetic_population", providing realistic synthetic population profiles that reflect typical demographic characteristics of South Korea.
- Each profile must strictly adhere to the codebook constraints described below, without omitting any required fields.

# Function: generate_synthetic_population
- Parameter:
    "profiles": an array of objects, each describing one person's Socio-Demographic and Travel-Related attributes
- Each object must include these properties (strictly follow data types and enums/patterns):
    ## Socio-Demographic attributes - Individual Level
    1) Gender (integer enum: 1=Male, 2=Female)
    2) Age (string pattern: "[start,end)" – using 5-year increments, last group "[85,90]")
    3) Student (integer enum: 1-4)
    4) Worktype (integer enum: 1-9)
    5) Workdays (integer enum: 1-4)
    6) Driver (integer enum: 1=Yes, 2=No)

    ## Socio-Demographic attributes - Household Level
    7) Homeincome (integer enum: 1-5)
    8) NumHH (integer range: 1-7)
    9) KidinHH (integer enum: 1=Yes, 2=No)
    10) Hometype (integer enum: 1-6)
    11) CarOwn (integer enum: 1=Yes, 2=No)

    ## Travel-Related attributes
    12) ComMode (string enum: "Car", "Public Transportation", "None(did not travel)", "Walking", "Bike/Bicycle", "Taxi")
    13) ComTime (string enum: "Peak", "Non-Peak", "Other", "None(did not travel)")

# Please carefully follow these rules:
- Do not add extra fields outside the required ones.
- Ensure values strictly match their defined enums, data types, and patterns.
- Firstly, Analyze the provided Few-shot Examples to capture characteristics of the Korean population regarding:
    (1) "Demographic Consistency", (2) "Household Structure Consistency", (3) "Work Situation Realism", and (4) "Travel Behavior Plausibility"
- Then, generate each profile's JSON, ensuring it is not only **feasible** according to the schema but also **realistic and varied**, reflecting the patterns observed in the examples.
- Think carefully and logically to generate population, referring to the few-shot examples.

# Few-shot Examples:
- These {num_examples_to_generate} examples are sampled from actual survey data (South Korean HTS).
- Use these examples primarily to learn realistic correlations and distributions for the generation, guided by the four perspectives.

<few_shot_examples>
{few_shot_examples}
</few_shot_examples>
    """

    system_message = {"role": "system", "content": system_message_content}
    user_message = {"role": "user", "content": f"Generate {batch_size} synthetic profiles."}

    # API Call
    try:
        print(f">>> Calling OpenAI API (requesting {batch_size} profiles)...")
        response = openai_client.chat.completions.create(
            model="gpt-4o", # Or your preferred model
            messages=[system_message, user_message],
            temperature=0.3, # Slightly increased temperature for more variety based on examples
            tools=[{"type": "function", "function": FUNCTION_SCHEMA}],
            # Force the model to answer through the profile-generating function.
            tool_choice={"type": "function", "function": {"name": "generate_synthetic_population"}},
            # Consider adding a timeout (e.g., timeout=180 for 3 minutes)
        )
        print("<<< OpenAI API call finished.")
    except Exception as api_error:
        # Boundary with the external service: report and let the caller retry.
        print(f"!!! OpenAI API call failed: {api_error}")
        return None # API call itself failed

    # Response Processing
    if not response or not response.choices:
        print("!!! Received invalid or empty response from API.")
        return None

    message = response.choices[0].message
    tool_calls = message.tool_calls

    if not tool_calls:
        if message.content:
            print(f"!!! API returned text content instead of function call: {message.content[:200]}...")
        else:
            print("!!! No tool calls or content found in the API response.")
        return None

    tool_call = tool_calls[0] # Assume only one tool call as requested
    if tool_call.function.name != "generate_synthetic_population":
        print(f"!!! API returned unexpected function call: {tool_call.function.name}")
        return None # Wrong function called

    raw_arguments = tool_call.function.arguments
    try:
        # Attempt to parse the full JSON
        parsed_args = json.loads(raw_arguments)
    except json.JSONDecodeError as e:
        print(f"--- Failed to parse full JSON: {e}. Attempting to salvage profiles...")
    except (TypeError, ValueError, RecursionError) as other_e:
        # Arguments that are not text at all cannot be salvaged either.
        print(f"!!! Error processing function call arguments: {other_e}")
        return None
    else:
        # Validate the shape BEFORE touching it: the arguments may be valid
        # JSON without being an object with a "profiles" list (e.g. a bare
        # array, or "profiles": null). Those cases go to the salvage step
        # below instead of failing.
        if isinstance(parsed_args, dict) and isinstance(parsed_args.get("profiles"), list):
            print(f"Successfully parsed full JSON response ({len(parsed_args['profiles'])} profiles).")
            return parsed_args
        print("!!! Parsed JSON is missing 'profiles' list. Attempting salvage.")

    salvaged_profiles = salvage_profiles_from_incomplete_json(raw_arguments)
    if salvaged_profiles:
        print(f"--- Successfully salvaged {len(salvaged_profiles)} profiles from incomplete JSON.")
        return {"profiles": salvaged_profiles}

    print("--- Salvage attempt failed or found no complete profiles.")
    return None # Salvage failed


# --------------------
# 5) Batch Processing Loop
# --------------------
def generate_large_population(total_profiles: int, batch_size: int, source_df: pd.DataFrame, num_examples_per_batch: int, output_prefix: str, max_consecutive_failures=None):
    """
    Generates a large number of population profiles in batches, saving intermediate results.

    After a failed batch the requested batch size is halved, but never below
    ``min(10, batch_size)`` (and never below 1), so a small requested batch
    size is not increased by the back-off. Profiles returned beyond what is
    still needed are discarded, so the result never exceeds ``total_profiles``.

    Args:
        total_profiles: Total number of profiles to generate.
        batch_size: Max profiles to request per API call.
        source_df: DataFrame with source data for generating few-shot examples.
        num_examples_per_batch: Number of few-shot examples to generate for each batch.
        output_prefix: Prefix for saving intermediate and final files (e.g., "generated_population").
        max_consecutive_failures: Optional cap on consecutive batches that add
            no profiles. ``None`` (default) retries indefinitely; with a
            number, the loop stops once that many batches in a row have
            failed and the profiles collected so far are returned.

    Returns:
        ``{"profiles": [...]}`` with the profiles collected.
    """
    all_profiles = {"profiles": []}
    remaining = total_profiles
    batch_count = 0
    consecutive_failures = 0

    # Floor for the adaptive batch size: 10, unless the caller asked for less.
    min_batch_size = max(1, min(10, batch_size))
    current_batch_size_tracker = max(1, batch_size) # Tracks adaptive batch size

    while remaining > 0:
        batch_count += 1
        current_batch_size = min(current_batch_size_tracker, remaining)

        print(f"\n===== Generating Batch {batch_count} =====")
        print(f"Target for this batch: {current_batch_size} profiles (using {num_examples_per_batch} examples)")

        # Call API for the batch, passing the source DataFrame
        result = call_gpt_function_call_batch(current_batch_size, source_df, num_examples_per_batch)
        new_profiles = result.get("profiles") if isinstance(result, dict) else None

        if isinstance(new_profiles, list) and new_profiles:
            # The model may return more than requested; keep only what is needed.
            new_profiles = new_profiles[:remaining]
            generated_count = len(new_profiles)
            all_profiles["profiles"].extend(new_profiles)
            remaining -= generated_count # Decrease remaining by actual count kept
            total_generated = len(all_profiles["profiles"])
            consecutive_failures = 0

            print(f"+++ Added {generated_count} profiles. Total: {total_generated}/{total_profiles} (Remaining: {remaining})")

            # --- Intermediate Save Logic ---
            # Save every 5 batches OR when crossing a 500-profile milestone.
            if batch_count % 5 == 0 or (total_generated // 500 > (total_generated - generated_count) // 500):
                temp_json_path = f"{output_prefix}_intermediate_{total_generated}.json"
                try:
                    with open(temp_json_path, "w", encoding="utf-8") as f:
                        json.dump(all_profiles, f, ensure_ascii=False, indent=2)
                    print(f"--- Saved intermediate result to {temp_json_path}")
                except IOError as e:
                    print(f"!!! Error saving intermediate file {temp_json_path}: {e}")

            # --- Delay ---
            time.sleep(1) # Brief pause between successful calls
            continue

        # ----- Nothing usable came back for this batch -----
        consecutive_failures += 1
        if isinstance(new_profiles, list):
            # API call succeeded but returned 0 profiles
            print(f"--- Batch {batch_count} generated 0 profiles despite successful API call. Retrying...")
            pause_seconds = 5 # Wait longer before retrying if 0 profiles returned
        else:
            # API call failed, returned invalid data, or salvage failed
            print(f"!!! Failed to generate batch {batch_count} or received invalid response.")
            reduced = max(min_batch_size, current_batch_size_tracker // 2)
            if reduced < current_batch_size_tracker:
                current_batch_size_tracker = reduced
                print(f"--- Reduced batch size for next attempt to: {current_batch_size_tracker}")
            else:
                print(f"--- Batch size already at its minimum ({current_batch_size_tracker}); retrying with the same size.")
            pause_seconds = 10 # Longer pause after a failure

        if max_consecutive_failures is not None and consecutive_failures >= max_consecutive_failures:
            print(f"!!! Stopping after {consecutive_failures} consecutive failed batches; returning {len(all_profiles['profiles'])} profiles collected so far.")
            break

        time.sleep(pause_seconds)

    print("\n===== Generation Loop Finished =====")
    return all_profiles


# --------------------
# 6) Save Final Results Function
# --------------------
def save_final_results(final_data, total_target: int, output_prefix: str):
    """
    Write the generated profiles to "<prefix>_final_<count>.json" and ".csv".

    Nothing is written when ``final_data`` is empty, has no "profiles" key, or
    its profile list is empty. When more than ``total_target`` profiles are
    present, only the first ``total_target`` are saved. The JSON file is
    written first; the CSV uses the column order of ``EXPECTED_COLUMNS``.
    Save errors are reported with a printed message.
    """
    has_profiles = (
        bool(final_data)
        and "profiles" in final_data
        and bool(final_data["profiles"])
    )
    if not has_profiles:
        print("!!! No valid final data to save.")
        return

    profiles = final_data["profiles"]

    # Guard against overshoot: never persist more than the requested total.
    if len(profiles) > total_target:
        print(f"--- Note: Generated {len(profiles)} profiles, trimming to target {total_target}.")
        profiles = profiles[:total_target]
        final_data = {"profiles": profiles}  # what gets dumped must match the trim
    count = len(profiles)

    json_path, csv_path = (
        f"{output_prefix}_final_{count}.{extension}" for extension in ("json", "csv")
    )

    print(f"\n--- Saving final {count} profiles ---")

    # (A) JSON dump of the whole payload
    try:
        with open(json_path, "w", encoding="utf-8") as handle:
            json.dump(final_data, handle, ensure_ascii=False, indent=2)
        print(f"Saved JSON to {json_path}")
    except IOError as e:
        print(f"!!! Error saving final JSON file {json_path}: {e}")

    # (B) Tabular copy with the columns in codebook order
    try:
        table = pd.DataFrame(profiles)[EXPECTED_COLUMNS]
        # utf-8-sig adds a BOM so that Excel detects the encoding.
        table.to_csv(csv_path, index=False, encoding="utf-8-sig")
        print(f"Saved CSV to {csv_path}")
    except KeyError as e:
        # Raised by the column selection when an expected column is absent.
        print(f"!!! Error creating CSV: Missing column {e}. CSV might be incomplete or incorrect.")
    except Exception as e:
        print(f"!!! Error saving final CSV file {csv_path}: {e}")


# --------------------
# 7) Continue from Intermediate Function
# --------------------
def _load_intermediate_profiles(intermediate_json_path: str) -> dict:
    """Return the ``{"profiles": [...]}`` payload stored at *intermediate_json_path*.

    Any problem (missing file, unreadable file, invalid JSON, unexpected
    structure) is reported on stdout and results in a fresh, empty payload so
    the caller can simply start from scratch.
    """
    try:
        # EAFP: open directly instead of checking os.path.exists() first, so
        # there is no window between the check and the read.
        with open(intermediate_json_path, "r", encoding="utf-8") as f:
            loaded_data = json.load(f)
    except FileNotFoundError:
        print(f"--- Intermediate file {intermediate_json_path} not found. Starting from scratch.")
    except json.JSONDecodeError:
        print(f"!!! Invalid JSON in {intermediate_json_path}. Starting from scratch.")
    except Exception as e:
        print(f"!!! Error loading intermediate file {intermediate_json_path}: {e}. Starting from scratch.")
    else:
        # Validate loaded data structure
        if isinstance(loaded_data, dict) and isinstance(loaded_data.get("profiles"), list):
            print(f"--- Successfully loaded {len(loaded_data['profiles'])} profiles from {intermediate_json_path}")
            return loaded_data
        print(f"!!! Invalid format in {intermediate_json_path}. Starting from scratch.")
    return {"profiles": []}


def continue_from_intermediate(intermediate_json_path: str, total_profiles: int, batch_size: int, source_df: pd.DataFrame, num_examples_per_batch: int, output_prefix: str, max_consecutive_failures: int = 10):
    """
    Loads profiles from an intermediate JSON file and continues generation.

    Args:
        intermediate_json_path: Path of a JSON file shaped like
            ``{"profiles": [...]}``. If it is missing or invalid, generation
            starts from an empty profile list.
        total_profiles: Target number of profiles (already loaded + newly generated).
        batch_size: Initial number of profiles requested per API call. It is
            halved after every failed call, never dropping below
            ``min(10, batch_size)``.
        source_df: Source data forwarded unchanged to
            ``call_gpt_function_call_batch``.
        num_examples_per_batch: Few-shot example count forwarded unchanged to
            ``call_gpt_function_call_batch``.
        output_prefix: Prefix of the intermediate checkpoint files
            (``{output_prefix}_intermediate_{count}.json``).
        max_consecutive_failures: Stop after this many batches in a row fail
            or come back empty, instead of retrying forever.

    Returns:
        A dict ``{"profiles": [...]}``. It may hold more entries than
        ``total_profiles`` (the loaded file or the last batch can overshoot),
        or fewer when generation was abandoned after repeated failures.

    Raises:
        ValueError: If more profiles are needed and ``batch_size`` is not positive.
    """
    # --- Load Existing Data ---
    all_profiles = _load_intermediate_profiles(intermediate_json_path)
    current_count = len(all_profiles["profiles"])

    # --- Check if More Profiles Needed ---
    if current_count >= total_profiles:
        print(f"Already have {current_count} profiles (target was {total_profiles}). No need to generate more.")
        return all_profiles # Return loaded data

    # A non-positive batch size would either divide by zero below or request a
    # negative number of profiles from the API.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # --- Continue Generation ---
    remaining = total_profiles - current_count
    print(f"--- Continuing generation for the remaining {remaining} profiles...")

    batch_count = current_count // batch_size # Estimate starting batch number
    current_batch_size_tracker = batch_size # Reset adaptive batch size for continuation
    # Never let the "reduction" after a failure push the size above what the
    # caller originally asked for.
    min_batch_size = min(10, batch_size)
    consecutive_failures = 0

    while remaining > 0:
        batch_count += 1
        current_batch_size = min(current_batch_size_tracker, remaining)

        print(f"\n===== Generating Batch {batch_count} (Continuation) =====")
        print(f"Target for this batch: {current_batch_size} profiles (using {num_examples_per_batch} examples)")

        # Call API for the batch
        result = call_gpt_function_call_batch(current_batch_size, source_df, num_examples_per_batch)

        if result and "profiles" in result and isinstance(result["profiles"], list):
            generated_count = len(result["profiles"])
            if generated_count > 0:
                consecutive_failures = 0
                all_profiles["profiles"].extend(result["profiles"])
                # Update counts *after* extending
                new_total_generated = len(all_profiles["profiles"])
                remaining = total_profiles - new_total_generated # Recalculate remaining
                print(f"+++ Added {generated_count} profiles. Total: {new_total_generated}/{total_profiles} (Remaining: {remaining})")

                # --- Intermediate Save Logic ---
                # Checkpoint every 5th batch, or whenever the running total
                # crosses a multiple of 500 since the previous successful batch.
                if batch_count % 5 == 0 or (new_total_generated // 500 > current_count // 500):
                    temp_json_path = f"{output_prefix}_intermediate_{new_total_generated}.json"
                    try:
                        with open(temp_json_path, "w", encoding="utf-8") as f:
                            json.dump(all_profiles, f, ensure_ascii=False, indent=2)
                        print(f"--- Saved intermediate result to {temp_json_path}")
                    except IOError as e:
                        print(f"!!! Error saving intermediate file {temp_json_path}: {e}")

                # Update current_count for the next milestone check
                current_count = new_total_generated

                time.sleep(1) # Brief pause
                continue

            print(f"--- Batch {batch_count} generated 0 profiles despite successful API call. Retrying...")
            retry_delay = 5 # Wait longer
        else:
            # Failure
            print(f"!!! Failed to generate batch {batch_count} or received invalid response.")
            current_batch_size_tracker = max(min_batch_size, current_batch_size_tracker // 2)
            print(f"--- Reduced batch size for next attempt to: {current_batch_size_tracker}")
            retry_delay = 10 # Longer pause after failure

        # Unsuccessful batch (empty or invalid): bound the number of retries so
        # a persistent outage cannot keep the loop (and API spend) running forever.
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            print(f"!!! Giving up after {consecutive_failures} consecutive unsuccessful batches. Returning {current_count} profiles generated so far.")
            break
        time.sleep(retry_delay)

    print("\n===== Continuation Loop Finished =====")
    return all_profiles


# --------------------
# 8) Main Execution Block
# --------------------
if __name__ == "__main__":
    # Script entry point: load the source CSV used for few-shot examples, then
    # either resume from an intermediate JSON checkpoint or start a fresh
    # generation run, and finally save the resulting profiles.
    import sys  # local import: only the entry point needs sys.exit()

    # --- Configuration ---
    TARGET_TOTAL_PROFILES = 5000    # Total number of profiles to generate
    BATCH_SIZE = 200                 # Initial number of profiles per API call (will adapt on failure)
    NUM_FEW_SHOT_EXAMPLES = 150      # Number of dynamic examples per API call
    OUTPUT_FILE_PREFIX = "dynamic_population" # Prefix for output JSON/CSV files
    INTERMEDIATE_FILE_TO_CONTINUE = "dynamic_population_intermediate_10102.json" # Set to None or "" to always start fresh

    print("===== Synthetic Population Generation Script =====")
    print(f"Target: {TARGET_TOTAL_PROFILES} profiles")
    print(f"Initial Batch Size: {BATCH_SIZE}")
    print(f"Few-shot Examples per Batch: {NUM_FEW_SHOT_EXAMPLES}")
    print(f"Source CSV: {SOURCE_CSV_PATH}")
    print(f"Output Prefix: {OUTPUT_FILE_PREFIX}")

    # --- Load Source Data for Few-Shot Examples ---
    # Every failure below is fatal: without source rows there is nothing to
    # sample examples from. Exit with a non-zero status so shells/schedulers
    # can detect the failure (the interactive exit() helper reports success
    # and is not guaranteed to exist outside an interactive session).
    try:
        print(f"\n--- Loading source data from {SOURCE_CSV_PATH}...")
        # Assuming the CSV has a header row (header=0)
        # keep_default_na=False helps if empty strings are meaningful
        source_dataframe = pd.read_csv(SOURCE_CSV_PATH, header=0, keep_default_na=False, dtype=str) # Read all as string initially for flexibility

        # Basic Validation: Check if expected columns exist
        missing_cols = [col for col in EXPECTED_COLUMNS if col not in source_dataframe.columns]
        if missing_cols:
            raise ValueError(f"Source CSV is missing required columns: {', '.join(missing_cols)}")

        # Attempt to convert potentially numeric columns needed for mapping back to numeric types
        # This is important because the mapping dictionaries use integer keys
        numeric_cols = ["Gender", "Homeincome", "Hometype", "CarOwn", "Driver", "Workdays", "Worktype", "Student", "NumHH", "KidinHH"]
        for col in numeric_cols:
            if col in source_dataframe.columns:
                # errors='coerce' turns uncastable values (including empty
                # strings kept by keep_default_na=False) into NaN; rows are
                # only reported here, not dropped.
                source_dataframe[col] = pd.to_numeric(source_dataframe[col], errors='coerce')
                if source_dataframe[col].isnull().any():
                    print(f"--- Warning: Column '{col}' contained non-numeric values after loading. Coerced to NaN.")

        print(f"Loaded {len(source_dataframe)} rows from source data.")

        if len(source_dataframe) < NUM_FEW_SHOT_EXAMPLES:
            print(f"--- Warning: Source data ({len(source_dataframe)} rows) is smaller than the requested number of few-shot examples ({NUM_FEW_SHOT_EXAMPLES}). Will use all available rows.")
            NUM_FEW_SHOT_EXAMPLES = len(source_dataframe) # Adjust dynamically

    except FileNotFoundError:
        print(f"!!! FATAL ERROR: Source CSV file not found at {SOURCE_CSV_PATH}")
        sys.exit(1) # Stop execution if source data is missing
    except ValueError as ve:
        print(f"!!! FATAL ERROR: Problem with source CSV data: {ve}")
        sys.exit(1)
    except Exception as e:
        print(f"!!! FATAL ERROR: Failed to load or process source CSV {SOURCE_CSV_PATH}: {e}")
        sys.exit(1)

    # --- Start or Continue Generation ---
    final_population_data = None
    if INTERMEDIATE_FILE_TO_CONTINUE and os.path.exists(INTERMEDIATE_FILE_TO_CONTINUE):
        print(f"\n--- Attempting to continue generation from: {INTERMEDIATE_FILE_TO_CONTINUE} ---")
        final_population_data = continue_from_intermediate(
            intermediate_json_path=INTERMEDIATE_FILE_TO_CONTINUE,
            total_profiles=TARGET_TOTAL_PROFILES,
            batch_size=BATCH_SIZE,
            source_df=source_dataframe,
            num_examples_per_batch=NUM_FEW_SHOT_EXAMPLES,
            output_prefix=OUTPUT_FILE_PREFIX
        )
    else:
        if INTERMEDIATE_FILE_TO_CONTINUE:
            print(f"\n--- Intermediate file not found ({INTERMEDIATE_FILE_TO_CONTINUE}). Starting new generation. ---")
        else:
            print("\n--- Starting new generation (no intermediate file specified). ---")

        final_population_data = generate_large_population(
            total_profiles=TARGET_TOTAL_PROFILES,
            batch_size=BATCH_SIZE,
            source_df=source_dataframe,
            num_examples_per_batch=NUM_FEW_SHOT_EXAMPLES,
            output_prefix=OUTPUT_FILE_PREFIX
        )

    # --- Save Final Results ---
    if final_population_data and "profiles" in final_population_data:
        final_count = len(final_population_data["profiles"])
        if final_count >= TARGET_TOTAL_PROFILES:
            print(f"\n==== Generation Complete. Achieved target of {TARGET_TOTAL_PROFILES} (generated {final_count}). ====")
        else:
            print(f"\n==== Generation Finished. Generated {final_count} profiles (target was {TARGET_TOTAL_PROFILES}). ====")

        save_final_results(final_population_data, TARGET_TOTAL_PROFILES, OUTPUT_FILE_PREFIX)
    else:
        print("\n==== Failed to generate population data or the final result was invalid. ====")

    print("===== Script Finished =====")

===== Synthetic Population Generation Script =====
Target: 5000 profiles
Initial Batch Size: 200
Few-shot Examples per Batch: 150
Source CSV: h_sample-PE.csv
Output Prefix: dynamic_population

--- Loading source data from h_sample-PE.csv...
Loaded 53315 rows from source data.

--- Intermediate file not found (dynamic_population_intermediate_10102.json). Starting new generation. ---

===== Generating Batch 1 =====
Target for this batch: 200 profiles (using 150 examples)
--- Generating 150 dynamic few-shot examples for this batch...
>>> Calling OpenAI API (requesting 200 profiles)...
<<< OpenAI API call finished.
Successfully parsed full JSON response (20 profiles).
+++ Added 20 profiles. Total: 20/5000 (Remaining: 4980)

===== Generating Batch 2 =====
Target for this batch: 200 profiles (using 150 examples)
--- Generating 150 dynamic few-shot examples for this batch...
>>> Calling OpenAI API (requesting 200 profiles)...
<<< OpenAI API call finished.
--- Failed to parse full JSON: Unterm