In [12]:
import json
import os
import glob
import us
from tqdm import tqdm

def _fill_template(template, data):
    """Return a new dict shaped like ``template`` with values taken from ``data``.

    Args:
        template: Dict describing the desired output layout. Nested dicts are
            filled recursively; list values mark list fields; anything else is
            treated as a scalar (string) field whose value is the default.
        data: Dict of persona values keyed like ``template``. Any non-dict
            value (e.g. ``None`` or a string where an object was expected) is
            treated as "no data" instead of raising ``AttributeError``.

    Returns:
        A freshly built dict containing exactly the keys of ``template``.
    """
    if not isinstance(data, dict):
        # The persona supplied a scalar/None where the template expects an
        # object: there is nothing to overlay, so fall back to the defaults.
        data = {}

    filled = {}
    for key, default in template.items():
        if isinstance(default, dict):
            filled[key] = _fill_template(default, data.get(key))
        elif isinstance(default, list):
            # List fields: accept a real list, or split a comma-separated
            # string; anything else becomes an empty list (the template's own
            # list content is not used as a fallback).
            raw = data.get(key, [])
            if isinstance(raw, str):
                filled[key] = [part.strip() for part in raw.split(",") if part.strip()]
            elif isinstance(raw, list):
                filled[key] = raw
            else:
                filled[key] = []
        else:
            # Scalar fields: stringify the value, falling back to the template
            # default when it is missing, None (JSON null) or blank. Without
            # the None check a null would be stored as the text "None".
            raw = data.get(key)
            text = "" if raw is None else str(raw).strip()
            filled[key] = text or default
    return filled


def _parse_persona_string(text):
    """Parse a PERSONA stored as a flat ``{'KEY': 'value', ...}`` string.

    Entries are separated by ``', '`` and each entry is split on the FIRST
    ``": "`` only, so values that themselves contain ``": "`` are preserved.
    Keys are upper-cased; entries with an empty value are skipped.

    Returns:
        Dict mapping upper-cased keys to string values.
    """
    body = text.strip()
    if body.startswith("{") and body.endswith("}"):
        # Drop the enclosing braces so the last value does not keep a "'}" tail.
        body = body[1:-1]

    parsed = {}
    for item in body.split("', '"):
        key, separator, value = item.partition(": ")
        if not separator:
            continue  # Not a "key: value" entry.
        key = key.strip().strip("'{}").upper()
        value = value.strip().strip("'")
        if value:  # Only keep non-empty values
            parsed[key] = value
    return parsed


def process_personas(template_path, input_dir, output_dir, state, num_personas):
    """Rewrite raw persona JSON files into the layout of a JSON template.

    For each ``i`` in ``range(num_personas)`` this reads
    ``<input_dir>/<state.abbr>_persona_<i>.json``, takes its ``"PERSONA"``
    entry (either a dict or a dict-like string), overlays it on the template
    and writes ``<output_dir>/<state.abbr>_persona_<i>_readable.json``.
    Files that are missing, are not valid JSON, are not a JSON object at the
    top level, or whose ``"PERSONA"`` entry has an unsupported type are
    reported with ``print`` and skipped.

    Args:
        template_path: Path to the JSON template file.
        input_dir: Directory containing the raw persona files.
        output_dir: Directory for the filled files; created if absent.
        state: Object exposing ``abbr`` (used in file names) and ``name``
            (used in the final message).
        num_personas: Number of consecutive persona indices to try, from 0.

    Returns:
        None.

    Raises:
        OSError: If the template cannot be read or an output cannot be written.
        json.JSONDecodeError: If the template itself is not valid JSON.
    """
    # Load the template
    with open(template_path, "r", encoding="utf-8") as template_file:
        template = json.load(template_file)

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    for i in tqdm(range(num_personas)):
        persona_file = os.path.join(input_dir, f"{state.abbr}_persona_{i}.json")
        if not os.path.exists(persona_file):
            print(f"File not found: {persona_file}")
            continue

        # Load the actual persona data
        try:
            with open(persona_file, "r", encoding="utf-8") as pf:
                final_persona = json.load(pf)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error decoding JSON in file: {persona_file}")
            continue

        if not isinstance(final_persona, dict):
            # e.g. a JSON array or bare string: there is no "PERSONA" key to read.
            print(f"Top-level JSON value is not an object in file: {persona_file}")
            continue

        persona_data = final_persona.get("PERSONA", {})

        if isinstance(persona_data, str):
            # If it's a string, parse it into a dictionary
            persona_dict = _parse_persona_string(persona_data)
        elif isinstance(persona_data, dict):
            # If it's already a dictionary, use it directly
            persona_dict = persona_data
        else:
            print(f"PERSONA field is neither a string nor a dictionary in file: {persona_file}")
            continue

        # Fill the template with persona data
        filled_persona = _fill_template(template, persona_dict)

        # Save the filled template as a new JSON file
        output_file = os.path.join(output_dir, f"{state.abbr}_persona_{i}_readable.json")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(filled_persona, f, indent=2)

    print(f"All persona files processed for {state.name}.")

# Example usage: convert the generated personas for one state into the
# human-readable template layout.
template_path = "/user/al4263/Simulate/Persona/prompts/persona_generation/persona_template.json"
output_dir = "/user/al4263/Simulate/Persona/persona_meta_human_readable"

num_personas = 1000  # Adjust this number as needed
state = us.states.TX

# Derive the per-state input folder from `state` so the folder and the
# "<abbr>_persona_<i>.json" file prefix built by process_personas cannot
# drift apart when the state is changed.
input_dir = os.path.join("/user/al4263/Simulate/Persona/Persona_Meta_Based", state.abbr)

process_personas(template_path, input_dir, output_dir, state, num_personas)

100%|██████████| 1000/1000 [00:03<00:00, 275.90it/s]

All persona files processed for Texas.



