In [27]:
import requests
import pandas as pd

# Fetch the DP02 group of the 2023 ACS 1-year data profile for ucgid 0400000US36,
# then attach the human-readable label of every DP02 variable.
REQUEST_TIMEOUT_SECONDS = 30  # fail instead of hanging forever on a stalled connection

data_url = "https://api.census.gov/data/2023/acs/acs1/profile?get=group(DP02)&ucgid=0400000US36"
response = requests.get(data_url, timeout=REQUEST_TIMEOUT_SECONDS)
# Surface HTTP errors here rather than as a confusing JSON decoding failure below
response.raise_for_status()
raw_data = response.json()

metadata_url = "https://api.census.gov/data/2023/acs/acs1/profile/variables.json"
metadata_response = requests.get(metadata_url, timeout=REQUEST_TIMEOUT_SECONDS)
metadata_response.raise_for_status()
variables = metadata_response.json()['variables']

# Map every DP02 variable code to its label; variables of other groups are ignored
dp02_mapping = {
    var_code: var_info['label']
    for var_code, var_info in variables.items()
    if var_code.startswith('DP02')
}

# The first row is used as the variable codes and the second row as their values
if len(raw_data) < 2:
    raise ValueError(
        f"Expected a header row and a data row from {data_url}, got {len(raw_data)} row(s)"
    )
variable_codes = raw_data[0]
values = raw_data[1]

data_df = pd.DataFrame({
    "Variable Code": variable_codes,
    "Value": values
})

data_df['Label'] = data_df['Variable Code'].map(dp02_mapping)

# Keep only the rows that have both a DP02 label and a value
data_df = data_df[['Label', 'Value']].dropna()

In [49]:
import json
state = "ny"

# Path of the formatted sex/race/age JSON file for the selected state
structured_path = f"/user/al4263/Simulate/Persona/data/sex_race_age/formatted_data/{state}_structured_data.json"
with open(structured_path, mode='r') as file:
    data = json.loads(file.read())

# Pull the total population figure out of the "SEX AND AGE" section
sex_and_age = data["SEX AND AGE"]
total_population = sex_and_age["Total population"]

In [138]:
import pandas as pd

def create_structured_data(df):
    """Convert a flat DP02 label/value table into a nested dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "Label" column holding full DP02 labels (for example
        "Estimate!!HOUSEHOLDS BY TYPE!!Total households") and a "Value" column
        whose entries can be converted with ``int()`` / ``float()``.
        If a label occurs more than once, its first row is used.

    Returns
    -------
    dict
        Sections "HOUSEHOLDS BY TYPE", "RELATIONSHIP", "MARITAL STATUS",
        "EDUCATIONAL ATTAINMENT", "LANGUAGE SPOKEN AT HOME", "VETERAN STATUS"
        and "PLACE OF BIRTH". Leaf entries are ``{"count": int,
        "percentage": float}`` built from the "Estimate!!..." and "Percent!!..."
        rows of the same label path; totals are plain ints and the two
        averages are plain floats.

    Raises
    ------
    IndexError
        If a required label is missing from ``df``. The message names the label.
    ValueError
        If a value cannot be converted by ``int()`` / ``float()``.
    """
    # Build the label -> value lookup in one pass instead of scanning the frame
    # for every label. setdefault keeps the first row of a duplicated label.
    first_value_by_label = {}
    for label, value in zip(df["Label"], df["Value"]):
        first_value_by_label.setdefault(label, value)

    def get_value(label):
        """Return the raw value stored for ``label``."""
        try:
            return first_value_by_label[label]
        except KeyError:
            # IndexError is kept as the exception type for existing callers,
            # but the message now says which label was missing.
            raise IndexError(f"Required DP02 label not found in data: {label!r}") from None

    def estimate(path):
        """Return the "Estimate" row of ``path`` as an int."""
        return int(get_value(f"Estimate!!{path}"))

    def count_and_percentage(path):
        """Return the count/percentage pair stored under ``path``."""
        return {
            "count": int(get_value(f"Estimate!!{path}")),
            "percentage": float(get_value(f"Percent!!{path}")),
        }

    def breakdown(base, names):
        """Return one count/percentage entry per name, keyed by that name."""
        return {name: count_and_percentage(f"{base}!!{name}") for name in names}

    households = "HOUSEHOLDS BY TYPE!!Total households"
    relationship = "RELATIONSHIP!!Population in households"
    education = "EDUCATIONAL ATTAINMENT!!Population 25 years and over"
    language = "LANGUAGE SPOKEN AT HOME!!Population 5 years and over"
    veterans = "VETERAN STATUS!!Civilian population 18 years and over"
    birthplace = "PLACE OF BIRTH!!Total population"
    native = f"{birthplace}!!Native"
    citizenship = "U.S. CITIZENSHIP STATUS!!Foreign-born population"
    world_region = (
        "WORLD REGION OF BIRTH OF FOREIGN-BORN"
        "!!Foreign-born population, excluding population born at sea"
    )

    def household_type(label_name):
        """Return a household type entry with its "with children" sub-entry."""
        path = f"{households}!!{label_name}"
        return {
            **count_and_percentage(path),
            "With children under 18": count_and_percentage(
                f"{path}!!With children of the householder under 18 years"
            ),
        }

    def marital_status(sex):
        """Return the marital status section for "Males" or "Females"."""
        base = f"MARITAL STATUS!!{sex} 15 years and over"
        return {
            "Total": estimate(base),
            **breakdown(base, (
                "Never married",
                "Now married, except separated",
                "Separated",
                "Widowed",
                "Divorced",
            )),
        }

    def language_group(name):
        """Return a language entry with its limited-English sub-entry."""
        path = f"{language}!!{name}"
        return {
            **count_and_percentage(path),
            "Speak English less than very well": count_and_percentage(
                f'{path}!!Speak English less than "very well"'
            ),
        }

    return {
        "HOUSEHOLDS BY TYPE": {
            "Total households": estimate(households),
            # NOTE(review): the "Family households" / "Nonfamily households"
            # grouping keys are kept as-is for existing consumers; the labels read
            # here do not themselves say family or nonfamily - confirm the intent.
            "Family households": {
                "Married-couple": household_type("Married-couple household"),
                "Cohabiting couple": household_type("Cohabiting couple household"),
            },
            "Nonfamily households": {
                "Male householder": household_type("Male householder, no spouse/partner present"),
                "Female householder": household_type("Female householder, no spouse/partner present"),
            },
            "Households with children under 18": count_and_percentage(
                f"{households}!!Households with one or more people under 18 years"
            ),
            "Households with adults 65 and over": count_and_percentage(
                f"{households}!!Households with one or more people 65 years and over"
            ),
            "Average household size": float(get_value(f"Estimate!!{households}!!Average household size")),
            "Average family size": float(get_value(f"Estimate!!{households}!!Average family size")),
        },
        "RELATIONSHIP": {
            "Total": estimate(relationship),
            **breakdown(relationship, (
                "Householder",
                "Spouse",
                "Unmarried partner",
                "Child",
                "Other relatives",
                "Other nonrelatives",
            )),
        },
        "MARITAL STATUS": {
            "Males": marital_status("Males"),
            "Females": marital_status("Females"),
        },
        "EDUCATIONAL ATTAINMENT": {
            "Total": estimate(education),
            **breakdown(education, (
                "Less than 9th grade",
                "9th to 12th grade, no diploma",
            )),
            # The output key is shorter than the label it is read from
            "High school graduate": count_and_percentage(
                f"{education}!!High school graduate (includes equivalency)"
            ),
            **breakdown(education, (
                "Some college, no degree",
                "Associate's degree",
                "Bachelor's degree",
                "Graduate or professional degree",
            )),
        },
        "LANGUAGE SPOKEN AT HOME": {
            "Total": estimate(language),
            "English only": count_and_percentage(f"{language}!!English only"),
            "Spanish": language_group("Spanish"),
            "Other Indo-European languages": language_group("Other Indo-European languages"),
            "Asian and Pacific Islander languages": language_group("Asian and Pacific Islander languages"),
            "Other languages": language_group("Other languages"),
        },
        "VETERAN STATUS": {
            "Total": estimate(veterans),
            "Veteran": count_and_percentage(f"{veterans}!!Civilian veterans"),
        },
        "PLACE OF BIRTH": {
            "Total": estimate(birthplace),
            "Native": {
                **count_and_percentage(native),
                "State of residence": count_and_percentage(
                    f"{native}!!Born in United States!!State of residence"
                ),
                "Different state": count_and_percentage(
                    f"{native}!!Born in United States!!Different state"
                ),
                # The three trailing spaces in this key are intentional: it is
                # reproduced exactly because consumers index by this string.
                "Born in Puerto Rico, U.S. Island areas, or born abroad to American parent(s)   ": count_and_percentage(
                    f"{native}!!Born in Puerto Rico, U.S. Island areas, or born abroad to American parent(s)"
                ),
            },
            "Foreign born": {
                **count_and_percentage(f"{birthplace}!!Foreign-born"),
                # Citizenship and world region come from their own label groups
                # rather than from the PLACE OF BIRTH group.
                "US Citizen": count_and_percentage(f"{citizenship}!!Naturalized U.S. citizen"),
                "Not a U.S. citizen": count_and_percentage(f"{citizenship}!!Not a U.S. citizen"),
                **breakdown(world_region, (
                    "Europe",
                    "Asia",
                    "Africa",
                    "Oceania",
                    "Latin America",
                    "Northern America",
                )),
            },
        },
    }

# Load the DP02 label/value table from its CSV file
data_df = pd.read_csv(filepath_or_buffer='census_data_dp02.csv')

# Turn the flat table into the nested dictionary
data = create_structured_data(df=data_df)

# The 'data' dictionary can now be used for further processing or analysis

In [144]:
# Do this for every state
import pandas as pd
import json
import os

# Directory containing the raw data files
raw_data_dir = "/user/al4263/Simulate/Persona/data/DP02/raw_data"

# Output directory for structured data
output_dir = "/user/al4263/Simulate/Persona/data/DP02/structured_data"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Only files ending in this suffix are treated as per-state DP02 tables
raw_suffix = "_dp02.csv"

# Process each state file. os.listdir returns names in arbitrary order, so sort
# them to make the processing order (and the log below) reproducible.
for filename in sorted(os.listdir(raw_data_dir)):
    if filename.endswith(raw_suffix):
        # Strip only the suffix so a state name that itself contains an
        # underscore (e.g. "new_york") is kept whole instead of being cut to "new".
        state = filename[:-len(raw_suffix)]

        # Read the CSV file
        data_df = pd.read_csv(os.path.join(raw_data_dir, filename))

        # Create the structured data
        structured_data = create_structured_data(data_df)

        # Save the structured data as JSON
        output_filename = f"{state}_dp02_structured.json"
        with open(os.path.join(output_dir, output_filename), 'w') as f:
            json.dump(structured_data, f, indent=2)

        print(f"Processed and saved structured data for {state}")

print("All states processed successfully!")

Processed and saved structured data for nm
Processed and saved structured data for fl
Processed and saved structured data for vt
Processed and saved structured data for nd
Processed and saved structured data for al
Processed and saved structured data for mn
Processed and saved structured data for ak
Processed and saved structured data for in
Processed and saved structured data for az
Processed and saved structured data for sc
Processed and saved structured data for nv
Processed and saved structured data for nh
Processed and saved structured data for il
Processed and saved structured data for ia
Processed and saved structured data for ga
Processed and saved structured data for ky
Processed and saved structured data for nc
Processed and saved structured data for tn
Processed and saved structured data for wy
Processed and saved structured data for ma
Processed and saved structured data for mt
Processed and saved structured data for wi
Processed and saved structured data for ks
Processed a

## Household

In [140]:
# Display the "PLACE OF BIRTH" section of the `data` dictionary
data["PLACE OF BIRTH"]

{'Total': 19571216,
 'Native': {'count': 15053220,
  'percentage': 76.9,
  'State of residence': {'count': 12146242, 'percentage': 62.1},
  'Different state': {'count': 2429014, 'percentage': 12.4},
  'Born in Puerto Rico, U.S. Island areas, or born abroad to American parent(s)   ': {'count': 477964,
   'percentage': 2.4}},
 'Foreign born': {'count': 4517996,
  'percentage': 23.1,
  'US Citizen': {'count': 2687784, 'percentage': 59.5},
  'Not a U.S. citizen': {'count': 1830212, 'percentage': 40.5},
  'Europe': {'count': 671383, 'percentage': 14.9},
  'Asia': {'count': 1330190, 'percentage': 29.4},
  'Africa': {'count': 223031, 'percentage': 4.9},
  'Oceania': {'count': 17170, 'percentage': 0.4},
  'Latin America': {'count': 2218060, 'percentage': 49.1},
  'Northern America': {'count': 58109, 'percentage': 1.3}}}

## Age

In [141]:
import torch
from collections import namedtuple, Counter

# Pairs a list of category labels with the distribution object used to sample an index into it
CategoricalDistribution = namedtuple('CategoricalDistribution', 'categories distribution')

def create_birth_place_distribution(birth_data):
    """Build the US-born vs. foreign-born distribution.

    Reads ``birth_data['Native']['percentage']`` and
    ``birth_data['Foreign born']['percentage']``, divides them by 100 and
    passes the result as ``probs`` to ``torch.distributions.Categorical``.

    Returns a ``CategoricalDistribution`` whose categories are
    ``['US Born', 'Foreign Born']``, in that order.
    """
    source_keys = ('Native', 'Foreign born')
    percentages = torch.tensor([birth_data[key]['percentage'] for key in source_keys])
    # Percentages are on a 0-100 scale; rescale to probabilities
    sampler = torch.distributions.Categorical(probs=percentages / 100)
    return CategoricalDistribution(categories=['US Born', 'Foreign Born'], distribution=sampler)

def create_us_born_distribution(birth_data):
    """Build the distribution of birthplaces among the native-born.

    Each probability is the count of one sub-entry of ``birth_data['Native']``
    divided by ``birth_data['Native']['count']``.

    Returns a ``CategoricalDistribution`` with the categories
    ``'State of residence'``, ``'Different state'`` and
    ``'US territories or abroad to American parents'``.
    """
    native = birth_data['Native']
    native_total = native['count']
    # Keys as they appear in the data; the last one carries three trailing spaces
    count_keys = (
        'State of residence',
        'Different state',
        'Born in Puerto Rico, U.S. Island areas, or born abroad to American parent(s)   ',
    )
    shares = [native[key]['count'] / native_total for key in count_keys]
    return CategoricalDistribution(
        categories=['State of residence', 'Different state', 'US territories or abroad to American parents'],
        distribution=torch.distributions.Categorical(probs=torch.tensor(shares)),
    )

def create_foreign_born_distribution(birth_data):
    """Build the citizenship distribution of the foreign-born.

    Reads the ``'percentage'`` of the ``'US Citizen'`` and
    ``'Not a U.S. citizen'`` entries of ``birth_data['Foreign born']`` and
    divides them by 100 to obtain the probabilities.

    Returns a ``CategoricalDistribution`` over those two categories.
    """
    foreign = birth_data['Foreign born']
    statuses = ['US Citizen', 'Not a U.S. citizen']
    percentages = torch.tensor([foreign[status]['percentage'] for status in statuses])
    # Percentages are on a 0-100 scale; rescale to probabilities
    sampler = torch.distributions.Categorical(probs=percentages / 100)
    return CategoricalDistribution(categories=statuses, distribution=sampler)

def create_foreign_born_region_distribution(birth_data):
    """Build the world-region-of-birth distribution of the foreign-born.

    For each of the six regions, reads ``birth_data['Foreign born'][region]['percentage']``
    and divides by 100 to obtain the probability.

    Returns a ``CategoricalDistribution`` whose categories are the region names.
    """
    foreign = birth_data['Foreign born']
    regions = ['Europe', 'Asia', 'Africa', 'Oceania', 'Latin America', 'Northern America']
    percentages = []
    for region in regions:
        percentages.append(foreign[region]['percentage'])
    # Percentages are on a 0-100 scale; rescale to probabilities
    region_probs = torch.tensor(percentages) / 100
    sampler = torch.distributions.Categorical(probs=region_probs)
    return CategoricalDistribution(categories=regions, distribution=sampler)

def sample_birth_and_citizenship(birth_data):
    """Draw one birthplace/citizenship profile from ``birth_data``.

    First samples 'US Born' vs. 'Foreign Born'. A US-born draw additionally
    samples the specific birthplace and is always labelled 'US Citizen'.
    A foreign-born draw samples citizenship and world region, each from its
    own distribution (the two draws are independent of each other).

    Parameters
    ----------
    birth_data : dict
        The "PLACE OF BIRTH" structure consumed by the ``create_*_distribution``
        helpers.

    Returns
    -------
    dict
        ``{'Birth Place', 'Specific', 'Citizenship'}`` for a US-born draw, or
        ``{'Birth Place', 'Citizenship', 'Region'}`` for a foreign-born draw.
    """
    birth_place = sample_distribution(create_birth_place_distribution(birth_data))

    # Build only the distributions the sampled branch needs: a US-born draw never
    # uses the two foreign-born distributions and vice versa. The order of the
    # sample_distribution calls is unchanged.
    if birth_place == 'US Born':
        us_birth_place = sample_distribution(create_us_born_distribution(birth_data))
        return {'Birth Place': 'US Born', 'Specific': us_birth_place, 'Citizenship': 'US Citizen'}

    citizenship = sample_distribution(create_foreign_born_distribution(birth_data))
    region = sample_distribution(create_foreign_born_region_distribution(birth_data))
    return {'Birth Place': 'Foreign Born', 'Citizenship': citizenship, 'Region': region}

def sample_distribution(distribution):
    """Draw one category label.

    ``distribution`` must expose ``categories`` (an indexable sequence of
    labels) and ``distribution`` (an object whose ``sample()`` returns an
    index into that sequence). Returns the label at the sampled index.
    """
    labels = distribution.categories
    drawn_index = distribution.distribution.sample()
    return labels[drawn_index]

def sample_birth_and_citizenship_multiple(birth_data, num_samples=10000):
    """Draw `num_samples` profiles and tally how often each one occurs.

    Each sampled dict is converted to a tuple of its (key, value) pairs
    sorted by key, so that it can be used as a Counter key.

    Returns:
        Counter mapping each distinct profile tuple to its frequency.
    """
    tally = Counter()
    for _ in range(num_samples):
        profile = sample_birth_and_citizenship(birth_data)
        tally[tuple(sorted(profile.items()))] += 1
    return tally

# Usage example: hard-coded place-of-birth / citizenship figures, with a
# count and a percentage for every category.
birth_data = {
    'Total': 19571216,
    'Native': {
        'count': 15053220,
        'percentage': 76.9,
        'State of residence': {'count': 12146242, 'percentage': 62.1},
        'Different state': {'count': 2429014, 'percentage': 12.4},
        'Born in Puerto Rico, U.S. Island areas, or born abroad to American parent(s)   ': {
            'count': 477964,
            'percentage': 2.4,
        },
    },
    'Foreign born': {
        'count': 4517996,
        'percentage': 23.1,
        'US Citizen': {'count': 2687784, 'percentage': 59.5},
        'Not a U.S. citizen': {'count': 1830212, 'percentage': 40.5},
        'Europe': {'count': 671383, 'percentage': 14.9},
        'Asia': {'count': 1330190, 'percentage': 29.4},
        'Africa': {'count': 223031, 'percentage': 4.9},
        'Oceania': {'count': 17170, 'percentage': 0.4},
        'Latin America': {'count': 2218060, 'percentage': 49.1},
        'Northern America': {'count': 58109, 'percentage': 1.3},
    },
}

# Draw 10,000 profiles and tally identical ones.
sample_counts = sample_birth_and_citizenship_multiple(
    birth_data,
    num_samples=10000,
)

# Raw tallies, one line per distinct profile.
for status, count in sample_counts.items():
    print(f"{dict(status)}: {count}")

# The same tallies expressed as a share of all draws.
total_samples = sum(sample_counts.values())
print("\nPercentages:")
for status, count in sample_counts.items():
    percentage = (count / total_samples) * 100
    print(f"{dict(status)}: {percentage:.1f}%")

{'Birth Place': 'Foreign Born', 'Citizenship': 'US Citizen', 'Region': 'Asia'}: 407
{'Birth Place': 'US Born', 'Citizenship': 'US Citizen', 'Specific': 'State of residence'}: 6204
{'Birth Place': 'US Born', 'Citizenship': 'US Citizen', 'Specific': 'Different state'}: 1283
{'Birth Place': 'Foreign Born', 'Citizenship': 'Not a U.S. citizen', 'Region': 'Latin America'}: 444
{'Birth Place': 'Foreign Born', 'Citizenship': 'US Citizen', 'Region': 'Europe'}: 205
{'Birth Place': 'Foreign Born', 'Citizenship': 'Not a U.S. citizen', 'Region': 'Asia'}: 272
{'Birth Place': 'Foreign Born', 'Citizenship': 'US Citizen', 'Region': 'Africa'}: 67
{'Birth Place': 'Foreign Born', 'Citizenship': 'US Citizen', 'Region': 'Latin America'}: 664
{'Birth Place': 'Foreign Born', 'Citizenship': 'Not a U.S. citizen', 'Region': 'Africa'}: 44
{'Birth Place': 'Foreign Born', 'Citizenship': 'Not a U.S. citizen', 'Region': 'Europe'}: 142
{'Birth Place': 'US Born', 'Citizenship': 'US Citizen', 'Specific': 'US territories

## Race

In [None]:
# Build torch Categorical distributions for race, race sub-groups and
# Hispanic/Latino origin from the structured census data in `data`.
# Every result keeps its original module-level name so that later cells
# (e.g. sample_demographics) keep working unchanged.


def _pick_categories(source, wanted):
    """Collect count and percentage for each wanted category found in `source`.

    Prints a notice for every wanted category that `source` lacks, so a
    missing key never silently changes the shape of a distribution.

    Returns:
        (labels, counts, percentages) as three parallel lists.
    """
    labels, counts, percentages = [], [], []
    for category in wanted:
        if category not in source:
            print(f"Category {category} not found in data")
            continue
        counts.append(source[category]['count'])
        percentages.append(source[category]['percentage'])
        labels.append(category)
    return labels, counts, percentages


def _subgroup_percentages(entries, skip=()):
    """Split a {label: {'percentage': ...}} mapping into parallel lists.

    Keys listed in `skip` are ignored; this is used to step over the
    parent's own 'count' / 'percentage' fields.

    Returns:
        (labels, percentages) in the mapping's iteration order.
    """
    labels, percentages = [], []
    for label, values in entries.items():
        if label in skip:
            continue
        labels.append(label)
        percentages.append(values['percentage'])
    return labels, percentages


def _normalize_to_categorical(percentages):
    """Rescale percentages to sum to 1 and wrap them in a Categorical.

    Returns:
        (total, probabilities, probs_tensor, distribution)

    Raises:
        ValueError: if the percentages are empty or sum to zero, since no
            distribution can be formed from them.
    """
    total = sum(percentages)
    if total == 0:
        raise ValueError("Cannot build a distribution: percentages are empty or sum to zero")
    probabilities = [p / total for p in percentages]
    probs_tensor = torch.tensor(probabilities)
    distribution = torch.distributions.Categorical(probs=probs_tensor)
    return total, probabilities, probs_tensor, distribution


race_data = data['RACE']
main_race_categories = ['White', 'Black or African American', 'American Indian and Alaska Native', 'Asian', 'Native Hawaiian and Other Pacific Islander', 'Some Other Race', 'Two or More Races']

race_categories, race_counts, race_percentages = _pick_categories(race_data, main_race_categories)

# Normalize race percentages to sum to 1 and create the Race distribution
(total_race_percentage,
 race_probabilities,
 race_probs_tensor,
 race_distribution) = _normalize_to_categorical(race_percentages)

print("\nRace categories:", race_categories)
print("Race probabilities:", race_probabilities)
print("Race distribution:", race_distribution.probs)

# Process subcategories for races with ethnicities
# For 'American Indian and Alaska Native'
ai_an_ethnicities = race_data['American Indian and Alaska Native']['Ethnicities']
ai_an_ethnicities_list, ai_an_percentages = _subgroup_percentages(ai_an_ethnicities)
(total_ai_an_percentage,
 ai_an_probabilities,
 ai_an_probs_tensor,
 ai_an_distribution) = _normalize_to_categorical(ai_an_percentages)

print("\nAmerican Indian and Alaska Native Ethnicities:", ai_an_ethnicities_list)
print("Probabilities:", ai_an_probabilities)
print("Distribution:", ai_an_distribution.probs)

# For 'Asian'
asian_ethnicities = race_data['Asian']['Ethnicities']
asian_ethnicities_list, asian_percentages = _subgroup_percentages(asian_ethnicities)
(total_asian_percentage,
 asian_probabilities,
 asian_probs_tensor,
 asian_distribution) = _normalize_to_categorical(asian_percentages)

print("\nAsian Ethnicities:", asian_ethnicities_list)
print("Probabilities:", asian_probabilities)
print("Distribution:", asian_distribution.probs)

# For 'Native Hawaiian and Other Pacific Islander'
nhopi_ethnicities = race_data['Native Hawaiian and Other Pacific Islander']['Ethnicities']
nhopi_ethnicities_list, nhopi_percentages = _subgroup_percentages(nhopi_ethnicities)
(total_nhopi_percentage,
 nhopi_probabilities,
 nhopi_probs_tensor,
 nhopi_distribution) = _normalize_to_categorical(nhopi_percentages)

print("\nNative Hawaiian and Other Pacific Islander Ethnicities:", nhopi_ethnicities_list)
print("Probabilities:", nhopi_probabilities)
print("Distribution:", nhopi_distribution.probs)

# For 'Two or More Races'
two_or_more_races = race_data['Two or More Races']['Combinations']
combinations_list, combinations_percentages = _subgroup_percentages(two_or_more_races)
(total_combinations_percentage,
 combinations_probabilities,
 combinations_probs_tensor,
 combinations_distribution) = _normalize_to_categorical(combinations_percentages)

print("\nTwo or More Races Combinations:", combinations_list)
print("Probabilities:", combinations_probabilities)
print("Distribution:", combinations_distribution.probs)

# Process 'HISPANIC OR LATINO AND RACE'
hispanic_data = data['HISPANIC OR LATINO AND RACE']
hispanic_categories = ['Hispanic or Latino (of any race)', 'Not Hispanic or Latino']

# Missing categories are now reported here too, as they are for race.
hispanic_categories_list, hispanic_counts, hispanic_percentages = _pick_categories(hispanic_data, hispanic_categories)
(total_hispanic_percentage,
 hispanic_probabilities,
 hispanic_probs_tensor,
 hispanic_distribution) = _normalize_to_categorical(hispanic_percentages)

print("\nHispanic or Latino Categories:", hispanic_categories_list)
print("Probabilities:", hispanic_probabilities)
print("Distribution:", hispanic_distribution.probs)

# Process 'Hispanic or Latino' subcategories
hispanic_subcategories = hispanic_data['Hispanic or Latino']
# The parent's own 'count' / 'percentage' fields sit next to the subtypes.
hispanic_subcategories_list, hispanic_sub_percentages = _subgroup_percentages(
    hispanic_subcategories, skip=('count', 'percentage')
)
(total_hispanic_sub_percentage,
 hispanic_sub_probabilities,
 hispanic_sub_probs_tensor,
 hispanic_sub_distribution) = _normalize_to_categorical(hispanic_sub_percentages)

print("\nHispanic or Latino Subcategories:", hispanic_subcategories_list)
print("Probabilities:", hispanic_sub_probabilities)
print("Distribution:", hispanic_sub_distribution.probs)


Race categories: ['White', 'Black or African American', 'American Indian and Alaska Native', 'Asian', 'Native Hawaiian and Other Pacific Islander', 'Some Other Race', 'Two or More Races']
Race probabilities: [0.5509550955095509, 0.14311431143114312, 0.0067006700670067, 0.0912091209120912, 0.0004000400040004, 0.1028102810281028, 0.10481048104810481]
Race distribution: tensor([5.5096e-01, 1.4311e-01, 6.7007e-03, 9.1209e-02, 4.0004e-04, 1.0281e-01,
        1.0481e-01])

American Indian and Alaska Native Ethnicities: ['Aztec', 'Blackfeet Tribe of the Blackfeet Indian Reservation of Montana', 'Maya', 'Native Village of Barrow Inupiat Traditional Government', 'Navajo Nation', 'Nome Eskimo Community', 'Other American Indian and Alaska Native']
Probabilities: [0.19468053194680532, 0.013298670132986704, 0.10498950104989502, 0.0, 0.006399360063993601, 0.0, 0.6806319368063194]
Distribution: tensor([0.1947, 0.0133, 0.1050, 0.0000, 0.0064, 0.0000, 0.6806])

Asian Ethnicities: ['Asian Indian', 'Chi

In [None]:
import pandas as pd

def sample_demographics():
    """Draw one synthetic profile: race, ethnicity, age group and sex.

    Hispanic/Latino status is drawn first. A Hispanic draw is given the race
    'Hispanic' and a sampled Hispanic subtype as its ethnicity. Otherwise a
    race is drawn and, for races that have a sub-distribution, an ethnicity
    (or race combination) is drawn from it. Age group and sex are drawn last.

    All category lists and distributions are read from module-level names
    defined in other cells of this notebook.

    Returns:
        dict with keys 'Race', 'Ethnicity', 'Age Group' and 'Sex';
        'Ethnicity' repeats the race when no sub-group was drawn.
    """
    def _draw(labels, dist):
        # One categorical draw mapped back to its label.
        return labels[dist.sample().item()]

    ethnicity = None
    if _draw(hispanic_categories_list, hispanic_distribution) == 'Hispanic or Latino (of any race)':
        race = 'Hispanic'
        ethnicity = _draw(hispanic_subcategories_list, hispanic_sub_distribution)
    else:
        race = _draw(race_categories, race_distribution)
        if race == 'American Indian and Alaska Native':
            ethnicity = _draw(ai_an_ethnicities_list, ai_an_distribution)
        elif race == 'Asian':
            ethnicity = _draw(asian_ethnicities_list, asian_distribution)
        elif race == 'Native Hawaiian and Other Pacific Islander':
            ethnicity = _draw(nhopi_ethnicities_list, nhopi_distribution)
        elif race == 'Two or More Races':
            ethnicity = _draw(combinations_list, combinations_distribution)

    age_group = _draw(age_categories, age_distribution)
    sex = _draw(sex_categories, sex_distribution)

    profile = {
        'Race': race,
        'Ethnicity': ethnicity or race,
        'Age Group': age_group,
        'Sex': sex,
    }
    return profile

# Draw `num_samples` (1000) independent demographic profiles
num_samples = 1000
samples = [sample_demographics() for _ in range(num_samples)]

# One row per profile; the columns are the keys returned by sample_demographics()
df = pd.DataFrame(samples)

# Print the DataFrame as plain text
print(df)

                Race        Ethnicity       Age Group     Sex
0              Asian     Asian Indian  45 to 54 years    Male
1              White            White  35 to 44 years  Female
2           Hispanic            Cuban  35 to 44 years  Female
3              White            White  35 to 44 years    Male
4              White            White  35 to 44 years    Male
..               ...              ...             ...     ...
995  Some Other Race  Some Other Race  65 to 74 years    Male
996            White            White  65 to 74 years    Male
997            White            White  35 to 44 years  Female
998            White            White  60 to 64 years  Female
999            White            White  35 to 44 years  Female

[1000 rows x 4 columns]


In [None]:
# Empirical frequency of every category in the sampled profiles
sampled_probabilities = {
    _column: df[_column].value_counts(normalize=True)
    for _column in ('Race', 'Age Group', 'Sex')
}

# Configured probabilities next to the sampled frequencies, one attribute at a time
for _column, _categories, _probabilities in (
    ('Race', race_categories, race_probabilities),
    ('Age Group', age_categories, age_probabilities),
    ('Sex', sex_categories, sex_probabilities),
):
    print(f"{_column} probabilities:")
    print("Original:", dict(zip(_categories, _probabilities)))
    print("Sampled:", sampled_probabilities[_column].to_dict())
    print()

# Share of sampled profiles whose race was set to 'Hispanic'
_is_hispanic = df['Race'] == 'Hispanic'
hispanic_proportion = _is_hispanic.mean()
print("Hispanic proportion:")
print("Original:", hispanic_probabilities[0])
print("Sampled:", hispanic_proportion)
print()

# Subtype breakdown, only when at least one Hispanic profile was drawn
if _is_hispanic.any():
    hispanic_subtypes = df.loc[_is_hispanic, 'Ethnicity'].value_counts(normalize=True)
    print("Hispanic subtypes probabilities:")
    print("Original:", dict(zip(hispanic_subcategories_list, hispanic_sub_probabilities)))
    print("Sampled:", hispanic_subtypes.to_dict())

Race probabilities:
Original: {'White': 0.5509550955095509, 'Black or African American': 0.14311431143114312, 'American Indian and Alaska Native': 0.0067006700670067, 'Asian': 0.0912091209120912, 'Native Hawaiian and Other Pacific Islander': 0.0004000400040004, 'Some Other Race': 0.1028102810281028, 'Two or More Races': 0.10481048104810481}
Sampled: {'White': 0.479, 'Hispanic': 0.183, 'Black or African American': 0.107, 'Two or More Races': 0.08, 'Some Other Race': 0.08, 'Asian': 0.066, 'American Indian and Alaska Native': 0.004, 'Native Hawaiian and Other Pacific Islander': 0.001}

Age Group probabilities:
Original: {'18 to 19 years': 0.025699999999999997, '20 to 24 years': 0.0789, '25 to 34 years': 0.175, '35 to 44 years': 0.1634, '45 to 54 years': 0.1515, '55 to 59 years': 0.08199999999999999, '60 to 64 years': 0.0842, '65 to 74 years': 0.1334, '75 to 84 years': 0.0726, '85 years and over': 0.0268}
Sampled: {'35 to 44 years': 0.164, '25 to 34 years': 0.162, '45 to 54 years': 0.145, 