In [1]:
import requests
import pandas as pd

# Your Census API key
API_KEY = "bc59ed2032734716cd1e00a6b7b8e0921df40c24"

In [2]:
import requests
from collections import defaultdict

url = "https://api.census.gov/data/2023/acs/acs1"

# Define variables for different races and age groups
age_groups = ['07', '08', '09', '10', '11', '12', '13', '14', '15', '16']
age_group_names = {
    '07': '18-19', '08': '20-24', '09': '25-29', '10': '30-34',
    '11': '35-44', '12': '45-54', '13': '55-64', '14': '65-74',
    '15': '75-84', '16': '85+'
}

races = {
    'White': 'H', 'Black': 'B', 'American Indian and Alaska Native': 'C',
    'Asian': 'D', 'Native Hawaiian and Pacific Islander': 'E',
    'Two or more races': 'G', 'Hispanic or Latino': 'I'
}

# Generate variables for API query
variables = {}
for race, code in races.items():
    variables[race] = []
    for age in age_groups:
        variables[race].extend([f'B01001{code}_{int(age):03d}E', f'B01001{code}_{int(age)+15:03d}E'])

# Store the results in this dictionary
population_data = defaultdict(lambda: defaultdict(dict))

# Query the Census API for each race and store the results
for race, age_vars in variables.items():
    # Define the API parameters
    params = {
        "get": ",".join(age_vars) + ",NAME",  # Query for the age variables and state names
        "for": "state:*",  # Get data for all states
        "key": API_KEY
    }
    
    # Make the API request
    response = requests.get(url, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        try:
            data = response.json()
            columns = data[0]  # The header row
            rows = data[1:]  # The data rows
            
            # Process the results
            for row in rows:
                state_name = row[-2]  # State name
                for i, var in enumerate(age_vars):
                    age_code = age_groups[i // 2]
                    age_group = age_group_names[age_code]
                    gender = "male" if i % 2 == 0 else "female"
                    num_people = int(row[i]) if row[i] else 0
                    
                    key = (age_group, gender, race)
                    population_data[state_name][key] = num_people
        except requests.JSONDecodeError:
            print(f"Error decoding JSON for {race} variables: {age_vars}")
            print("Response text:", response.text)  # Print the raw response for debugging
    else:
        print(f"Error: {response.status_code} for race {race} variables {age_vars}")
        print("Response text:", response.text)  # Print the raw response for debugging


In [3]:
import json
import os
import torch
from collections import defaultdict
import us
import tqdm

def process_state_data(state_name, state_data):
    state = us.states.lookup(state_name)
    if state is None:
        print(f"Warning: '{state_name}' is not a valid US state name. Skipping.")
        return

    state_abbr = state.abbr
    
    # Convert state_data to a list of tuples and a list of probabilities
    items, counts = zip(*state_data.items())
    probs = torch.tensor(counts, dtype=torch.float)
    probs /= probs.sum()

    # Create a Categorical distribution
    distribution = torch.distributions.Categorical(probs)

    def sample_persona():
        index = distribution.sample().item()
        age, sex, race = items[index]
        return {
            "AGE": age,
            "SEX": sex.capitalize(),
            "RACE": race,
            "STATE": state_abbr
        }

    def save_persona(persona, index):
        folder_path = f"./LLM_Simulation/Persona/Persona_Meta_Joint/{state_abbr}/meta_persona"
        os.makedirs(folder_path, exist_ok=True)
        file_path = os.path.join(folder_path, f"persona_{index}.json")
        data = {"PERSONA": persona}
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=2)

    num_personas = 1000  # Adjust as needed
    for i in range(num_personas):
        persona = sample_persona()
        save_persona(persona, i)

    print(f"Generated and saved {num_personas} personas for state {state_name} ({state_abbr})")

for state_name, state_data in tqdm.tqdm(population_data.items(), desc="Processing states"):
    process_state_data(state_name, state_data)

Processing states:   2%|▏         | 1/52 [00:02<02:25,  2.85s/it]

Generated and saved 1000 personas for state Alabama (AL)


Processing states:   4%|▍         | 2/52 [00:05<02:06,  2.54s/it]

Generated and saved 1000 personas for state Alaska (AK)


Processing states:   6%|▌         | 3/52 [00:07<02:02,  2.50s/it]

Generated and saved 1000 personas for state Arizona (AZ)


Processing states:   8%|▊         | 4/52 [00:10<02:00,  2.51s/it]

Generated and saved 1000 personas for state Arkansas (AR)


Processing states:  10%|▉         | 5/52 [00:12<01:57,  2.50s/it]

Generated and saved 1000 personas for state California (CA)


Processing states:  12%|█▏        | 6/52 [00:15<01:54,  2.49s/it]

Generated and saved 1000 personas for state Colorado (CO)


Processing states:  13%|█▎        | 7/52 [00:17<01:53,  2.53s/it]

Generated and saved 1000 personas for state Connecticut (CT)


Processing states:  15%|█▌        | 8/52 [00:20<01:51,  2.54s/it]

Generated and saved 1000 personas for state Delaware (DE)


Processing states:  19%|█▉        | 10/52 [00:22<01:19,  1.89s/it]

Generated and saved 1000 personas for state Florida (FL)


Processing states:  21%|██        | 11/52 [00:25<01:23,  2.03s/it]

Generated and saved 1000 personas for state Georgia (GA)


Processing states:  23%|██▎       | 12/52 [00:27<01:25,  2.15s/it]

Generated and saved 1000 personas for state Hawaii (HI)


Processing states:  25%|██▌       | 13/52 [00:30<01:27,  2.26s/it]

Generated and saved 1000 personas for state Idaho (ID)


Processing states:  27%|██▋       | 14/52 [00:32<01:28,  2.32s/it]

Generated and saved 1000 personas for state Illinois (IL)


Processing states:  29%|██▉       | 15/52 [00:34<01:26,  2.34s/it]

Generated and saved 1000 personas for state Indiana (IN)


Processing states:  31%|███       | 16/52 [00:37<01:25,  2.37s/it]

Generated and saved 1000 personas for state Iowa (IA)


Processing states:  33%|███▎      | 17/52 [00:39<01:22,  2.36s/it]

Generated and saved 1000 personas for state Kansas (KS)


Processing states:  35%|███▍      | 18/52 [00:41<01:19,  2.33s/it]

Generated and saved 1000 personas for state Kentucky (KY)


Processing states:  37%|███▋      | 19/52 [00:44<01:16,  2.33s/it]

Generated and saved 1000 personas for state Louisiana (LA)


Processing states:  38%|███▊      | 20/52 [00:46<01:14,  2.33s/it]

Generated and saved 1000 personas for state Maine (ME)


Processing states:  40%|████      | 21/52 [00:48<01:12,  2.33s/it]

Generated and saved 1000 personas for state Maryland (MD)


Processing states:  42%|████▏     | 22/52 [00:51<01:09,  2.31s/it]

Generated and saved 1000 personas for state Massachusetts (MA)


Processing states:  44%|████▍     | 23/52 [00:53<01:06,  2.28s/it]

Generated and saved 1000 personas for state Michigan (MI)


Processing states:  46%|████▌     | 24/52 [00:55<01:03,  2.28s/it]

Generated and saved 1000 personas for state Minnesota (MN)


Processing states:  48%|████▊     | 25/52 [00:57<01:01,  2.26s/it]

Generated and saved 1000 personas for state Mississippi (MS)


Processing states:  50%|█████     | 26/52 [01:00<00:58,  2.27s/it]

Generated and saved 1000 personas for state Missouri (MO)


Processing states:  52%|█████▏    | 27/52 [01:02<00:55,  2.23s/it]

Generated and saved 1000 personas for state Montana (MT)


Processing states:  54%|█████▍    | 28/52 [01:04<00:53,  2.22s/it]

Generated and saved 1000 personas for state Nebraska (NE)


Processing states:  56%|█████▌    | 29/52 [01:06<00:51,  2.22s/it]

Generated and saved 1000 personas for state Nevada (NV)


Processing states:  58%|█████▊    | 30/52 [01:09<00:50,  2.28s/it]

Generated and saved 1000 personas for state New Hampshire (NH)


Processing states:  60%|█████▉    | 31/52 [01:11<00:47,  2.27s/it]

Generated and saved 1000 personas for state New Jersey (NJ)


Processing states:  62%|██████▏   | 32/52 [01:13<00:45,  2.26s/it]

Generated and saved 1000 personas for state New Mexico (NM)


Processing states:  63%|██████▎   | 33/52 [01:16<00:43,  2.28s/it]

Generated and saved 1000 personas for state New York (NY)


Processing states:  65%|██████▌   | 34/52 [01:18<00:40,  2.27s/it]

Generated and saved 1000 personas for state North Carolina (NC)


Processing states:  67%|██████▋   | 35/52 [01:20<00:38,  2.25s/it]

Generated and saved 1000 personas for state North Dakota (ND)


Processing states:  69%|██████▉   | 36/52 [01:22<00:36,  2.26s/it]

Generated and saved 1000 personas for state Ohio (OH)


Processing states:  71%|███████   | 37/52 [01:24<00:33,  2.25s/it]

Generated and saved 1000 personas for state Oklahoma (OK)


Processing states:  73%|███████▎  | 38/52 [01:27<00:31,  2.24s/it]

Generated and saved 1000 personas for state Oregon (OR)


Processing states:  75%|███████▌  | 39/52 [01:29<00:29,  2.24s/it]

Generated and saved 1000 personas for state Pennsylvania (PA)


Processing states:  77%|███████▋  | 40/52 [01:31<00:27,  2.26s/it]

Generated and saved 1000 personas for state Rhode Island (RI)


Processing states:  79%|███████▉  | 41/52 [01:34<00:24,  2.26s/it]

Generated and saved 1000 personas for state South Carolina (SC)


Processing states:  81%|████████  | 42/52 [01:36<00:22,  2.28s/it]

Generated and saved 1000 personas for state South Dakota (SD)


Processing states:  83%|████████▎ | 43/52 [01:38<00:20,  2.28s/it]

Generated and saved 1000 personas for state Tennessee (TN)


Processing states:  85%|████████▍ | 44/52 [01:41<00:18,  2.34s/it]

Generated and saved 1000 personas for state Texas (TX)


Processing states:  87%|████████▋ | 45/52 [01:43<00:16,  2.36s/it]

Generated and saved 1000 personas for state Utah (UT)


Processing states:  88%|████████▊ | 46/52 [01:45<00:14,  2.38s/it]

Generated and saved 1000 personas for state Vermont (VT)


Processing states:  90%|█████████ | 47/52 [01:48<00:12,  2.42s/it]

Generated and saved 1000 personas for state Virginia (VA)


Processing states:  92%|█████████▏| 48/52 [01:50<00:09,  2.46s/it]

Generated and saved 1000 personas for state Washington (WA)


Processing states:  94%|█████████▍| 49/52 [01:53<00:07,  2.55s/it]

Generated and saved 1000 personas for state West Virginia (WV)


Processing states:  96%|█████████▌| 50/52 [01:56<00:05,  2.53s/it]

Generated and saved 1000 personas for state Wisconsin (WI)


Processing states:  98%|█████████▊| 51/52 [01:58<00:02,  2.54s/it]

Generated and saved 1000 personas for state Wyoming (WY)


Processing states: 100%|██████████| 52/52 [02:01<00:00,  2.33s/it]

Generated and saved 1000 personas for state Puerto Rico (PR)





In [11]:
import os
import json
from collections import defaultdict

def verify_marginals(state_abbr):
    folder_path = f"./LLM_Simulation/Persona/Persona_State_Ultimate/meta_persona/{state_abbr}"
    
    # Counters for each category
    sex_count = defaultdict(int)
    age_count = defaultdict(int)
    race_count = defaultdict(int)
    total_count = 0

    # Read all persona files
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            with open(os.path.join(folder_path, filename), 'r') as f:
                persona = json.load(f)
                sex_count[persona['SEX']] += 1
                age_count[persona['AGE']] += 1
                race_count[persona['RACE']] += 1
                total_count += 1

    # Calculate and print percentages
    print(f"Marginal distributions for {state_abbr}:")
    print("\nSex distribution:")
    for sex, count in sex_count.items():
        percentage = (count / total_count) * 100
        print(f"{sex}: {percentage:.2f}%")

    print("\nAge group distribution:")
    for age, count in age_count.items():
        percentage = (count / total_count) * 100
        print(f"{age}: {percentage:.2f}%")

    print("\nRace distribution:")
    for race, count in race_count.items():
        percentage = (count / total_count) * 100
        print(f"{race}: {percentage:.2f}%")

# Example usage
verify_marginals("NY")

Marginal distributions for NY:

Sex distribution:
Female: 49.80%
Male: 50.20%

Age group distribution:
55-64: 17.20%
45-54: 14.60%
35-44: 16.80%
20-24: 7.00%
75-84: 9.60%
65-74: 13.20%
25-29: 9.60%
85+: 2.40%
30-34: 8.20%
18-19: 1.40%

Race distribution:
White: 49.80%
Hispanic or Latino: 18.80%
Black: 10.60%
American Indian and Alaska Native: 0.80%
Asian: 9.80%
Two or more races: 10.20%


In [3]:
import os
import hashlib
import requests
from collections import defaultdict
def calculate_file_hash(file_path):
    """Calculate the SHA-256 hash of a file."""
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as f:
        buf = f.read()
        hasher.update(buf)
    return hasher.hexdigest()

def find_duplicate_files(folder_path):
    """Find and print duplicate files in the given folder."""
    file_hashes = defaultdict(list)
    
    # Calculate hash for each file and store it in a dictionary
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            file_hash = calculate_file_hash(file_path)
            file_hashes[file_hash].append(filename)
    
    # Print files with the same hash
    for file_list in file_hashes.values():
        if len(file_list) > 1:
            print(f"Duplicate files: {file_list}")

# Specify the folder path
ny_folder_path = "./LLM_Simulation/Persona/Persona_State_Ultimate/NY/meta_persona"

# Find and print duplicate files
find_duplicate_files(ny_folder_path)

Duplicate files: ['persona_267.json', 'persona_497.json', 'persona_232.json', 'persona_94.json', 'persona_817.json', 'persona_523.json', 'persona_261.json', 'persona_23.json', 'persona_353.json', 'persona_960.json', 'persona_143.json', 'persona_819.json', 'persona_836.json', 'persona_563.json', 'persona_704.json', 'persona_748.json', 'persona_848.json', 'persona_635.json', 'persona_690.json', 'persona_465.json', 'persona_648.json', 'persona_313.json', 'persona_234.json', 'persona_705.json', 'persona_504.json', 'persona_515.json', 'persona_652.json', 'persona_779.json', 'persona_585.json', 'persona_918.json', 'persona_433.json', 'persona_167.json', 'persona_979.json', 'persona_165.json', 'persona_924.json', 'persona_422.json', 'persona_901.json', 'persona_99.json', 'persona_160.json', 'persona_231.json', 'persona_488.json', 'persona_385.json', 'persona_962.json', 'persona_244.json', 'persona_606.json', 'persona_790.json', 'persona_199.json', 'persona_697.json', 'persona_351.json']
Dupli

In [3]:

persona_type = "marginal_factual_persona"
if persona_type.startswith("marginal"):
    print("yes")

yes
