In [1]:
import requests
import pandas as pd

# Fetch the nationwide DP03 profile group from the 2023 ACS 1-year API.
data_url = "https://api.census.gov/data/2023/acs/acs1/profile?get=group(DP03)&for=us:1"
response = requests.get(data_url, timeout=60)
# Fail here on an HTTP error instead of later, when the body is not the expected JSON.
response.raise_for_status()
raw_data = response.json()

# Variable metadata for the same dataset; used to turn variable codes into labels.
metadata_url = "https://api.census.gov/data/2023/acs/acs1/profile/variables.json"
metadata_response = requests.get(metadata_url, timeout=60)
metadata_response.raise_for_status()
variables = metadata_response.json()['variables']

# Keep only the DP03 group: variable code -> human-readable label.
dp03_mapping = {
    var_code: var_info['label']
    for var_code, var_info in variables.items()
    if var_code.startswith('DP03')
}

# The code below treats row 0 of the response as the variable codes and
# row 1 as the matching values.
variable_codes = raw_data[0]
values = raw_data[1]

data_df = pd.DataFrame({
    "Variable Code": variable_codes,
    "Value": values
})

data_df['Label'] = data_df['Variable Code'].map(dp03_mapping)

# dropna() removes rows whose code has no DP03 label as well as rows with a null value.
data_df = data_df[['Label', 'Value']].dropna()

import os
# Ensure the raw data directory exists
raw_data_dir = "/user/al4263/Simulate/Persona/data/DP03/raw_data"
os.makedirs(raw_data_dir, exist_ok=True)

# Save the nationwide data
filename = "us_dp03.csv"
data_df.to_csv(os.path.join(raw_data_dir, filename), index=False)
print(f"Saved {filename}")


Saved us_dp03.csv


In [15]:
import requests
import pandas as pd
import os
import us

def get_state_data(state_fips, timeout=60):
    """Download the DP03 profile group for one state from the 2023 ACS 1-year API.

    Args:
        state_fips: State FIPS code, interpolated into the ``for=state:`` clause.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    data_url = f"https://api.census.gov/data/2023/acs/acs1/profile?get=group(DP03)&for=state:{state_fips}"
    response = requests.get(data_url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON decode failure.
    response.raise_for_status()
    return response.json()

# Variable metadata: used to translate DP03 variable codes into labels.
metadata_url = "https://api.census.gov/data/2023/acs/acs1/profile/variables.json"
metadata_response = requests.get(metadata_url, timeout=60)
# Stop early on an HTTP error instead of failing on the 'variables' lookup below.
metadata_response.raise_for_status()
variables = metadata_response.json()['variables']

# Keep only the DP03 group: variable code -> human-readable label.
dp03_mapping = {
    var_code: var_info['label']
    for var_code, var_info in variables.items()
    if var_code.startswith('DP03')
}

# Ensure the raw data directory exists
raw_data_dir = "/user/al4263/Simulate/Persona/data/DP03/raw_data"
os.makedirs(raw_data_dir, exist_ok=True)

# NOTE(review): us.states.STATES presumably does not include DC or the
# territories -- confirm whether those should be downloaded as well.
for state in us.states.STATES:
    print(f"Processing {state.name}...")

    raw_data = get_state_data(state.fips)

    # Row 0 is treated as the variable codes, row 1 as the matching values.
    variable_codes = raw_data[0]
    values = raw_data[1]

    data_df = pd.DataFrame({
        "Variable Code": variable_codes,
        "Value": values
    })

    # Rows without a DP03 label, or with a null value, are dropped.
    data_df['Label'] = data_df['Variable Code'].map(dp03_mapping)
    data_df = data_df[['Label', 'Value']].dropna()

    # Save the data with state-specific filename
    filename = f"{state.abbr.lower()}_dp03.csv"
    data_df.to_csv(os.path.join(raw_data_dir, filename), index=False)
    print(f"Saved {filename}")

print("All states processed successfully!")

Processing Alabama...
Saved al_dp03.csv
Processing Alaska...
Saved ak_dp03.csv
Processing Arizona...
Saved az_dp03.csv
Processing Arkansas...
Saved ar_dp03.csv
Processing California...
Saved ca_dp03.csv
Processing Colorado...
Saved co_dp03.csv
Processing Connecticut...
Saved ct_dp03.csv
Processing Delaware...
Saved de_dp03.csv
Processing Florida...
Saved fl_dp03.csv
Processing Georgia...
Saved ga_dp03.csv
Processing Hawaii...
Saved hi_dp03.csv
Processing Idaho...
Saved id_dp03.csv
Processing Illinois...
Saved il_dp03.csv
Processing Indiana...
Saved in_dp03.csv
Processing Iowa...
Saved ia_dp03.csv
Processing Kansas...
Saved ks_dp03.csv
Processing Kentucky...
Saved ky_dp03.csv
Processing Louisiana...
Saved la_dp03.csv
Processing Maine...
Saved me_dp03.csv
Processing Maryland...
Saved md_dp03.csv
Processing Massachusetts...
Saved ma_dp03.csv
Processing Michigan...
Saved mi_dp03.csv
Processing Minnesota...
Saved mn_dp03.csv
Processing Mississippi...
Saved ms_dp03.csv
Processing Missouri...

In [2]:
def create_structured_data(df):
    """Reshape a flat DP03 label/value table into a nested dictionary.

    Args:
        df: DataFrame with a "Label" column holding the full Census label
            (e.g. "Estimate!!INCOME AND BENEFITS ...") and a "Value" column.

    Returns:
        dict with two top-level sections, "HEALTH INSURANCE COVERAGE" and
        "INCOME AND BENEFITS". Counts are converted with ``int`` and
        percentages with ``float``.

    Raises:
        IndexError: If any label this function looks up is absent from ``df``.
    """
    insurance_root = "HEALTH INSURANCE COVERAGE!!Civilian noninstitutionalized population 19 to 64 years"
    income_root = "INCOME AND BENEFITS (IN 2023 INFLATION-ADJUSTED DOLLARS)!!Total households"

    def get_value(label):
        # First row whose Label equals `label`; indexing [0] fails if there is none.
        matching = df.loc[df["Label"] == label, "Value"]
        return matching.values[0]

    def count_and_pct(path):
        # "Estimate!!" rows carry the count, "Percent!!" rows the percentage.
        return {
            "count": int(get_value("Estimate!!" + path)),
            "percentage": float(get_value("Percent!!" + path)),
        }

    def insured_group(segment):
        # Insured totals for one labor-force segment plus the private/public split.
        base = insurance_root + "!!" + segment + "!!With health insurance coverage"
        entry = count_and_pct(base)
        entry["Private health insurance"] = count_and_pct(base + "!!With private health insurance")
        entry["Public coverage"] = count_and_pct(base + "!!With public coverage")
        return entry

    def uninsured_group(segment):
        return count_and_pct(insurance_root + "!!" + segment + "!!No health insurance coverage")

    # Output key -> label fragment identifying that labor-force segment.
    segments = [
        ("Employed", "In labor force:!!Employed:"),
        ("Unemployed", "In labor force:!!Unemployed:"),
        ("Not in labor force", "Not in labor force:"),
    ]

    population = {"Total": int(get_value("Estimate!!" + insurance_root))}
    population["With health insurance coverage"] = {
        key: insured_group(fragment) for key, fragment in segments
    }
    population["No health insurance coverage"] = {
        key: uninsured_group(fragment) for key, fragment in segments
    }

    income_brackets = [
        "Less than $10,000",
        "$10,000 to $14,999",
        "$15,000 to $24,999",
        "$25,000 to $34,999",
        "$35,000 to $49,999",
        "$50,000 to $74,999",
        "$75,000 to $99,999",
        "$100,000 to $149,999",
        "$150,000 to $199,999",
        "$200,000 or more",
    ]

    income = {"Total households": int(get_value("Estimate!!" + income_root))}
    for bracket in income_brackets:
        income[bracket] = count_and_pct(income_root + "!!" + bracket)
    # Median and mean only have an estimate that is read here (no percentage).
    for summary in ("Median household income (dollars)", "Mean household income (dollars)"):
        income[summary] = int(get_value("Estimate!!" + income_root + "!!" + summary))

    return {
        "HEALTH INSURANCE COVERAGE": {
            "Civilian noninstitutionalized population 19 to 64 years": population
        },
        "INCOME AND BENEFITS": income,
    }

In [3]:
# Do this for every raw DP03 CSV file: convert it to structured JSON.
import pandas as pd
import json
import os

# Input: raw per-area CSV files. Output: one structured JSON file per CSV.
raw_data_dir = "/user/al4263/Simulate/Persona/data/DP03/raw_data"
output_dir = "/user/al4263/Simulate/Persona/data/DP03/structured_data"

# Create the output directory up front so the writes below cannot fail on it
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(raw_data_dir):
    # Ignore anything that is not a DP03 raw export
    if not filename.endswith("_dp03.csv"):
        continue

    # Everything before the first underscore is the area prefix (e.g. "ca", "us")
    state = filename.partition("_")[0]
    output_filename = f"{state}_dp03_structured.json"

    # Load the flat label/value table and reshape it into the nested layout
    data_df = pd.read_csv(os.path.join(raw_data_dir, filename))
    structured_data = create_structured_data(data_df)

    # Persist the nested dictionary as indented JSON
    with open(os.path.join(output_dir, output_filename), 'w') as f:
        json.dump(structured_data, f, indent=2)

    print(f"Processed and saved structured data for {state}")

print("All states processed successfully!")

Processed and saved structured data for ms
Processed and saved structured data for sc
Processed and saved structured data for de
Processed and saved structured data for ma
Processed and saved structured data for fl
Processed and saved structured data for mo
Processed and saved structured data for nc
Processed and saved structured data for mi
Processed and saved structured data for us
Processed and saved structured data for id
Processed and saved structured data for nj
Processed and saved structured data for vt
Processed and saved structured data for nv
Processed and saved structured data for la
Processed and saved structured data for tn
Processed and saved structured data for ut
Processed and saved structured data for ny
Processed and saved structured data for md
Processed and saved structured data for sd
Processed and saved structured data for ia
Processed and saved structured data for al
Processed and saved structured data for wi
Processed and saved structured data for ne
Processed a

## Household

In [9]:
import os
import json
import pandas as pd
from tqdm import tqdm
import us
from get_dist.get_sex_race_age_dist import load_structured_data, get_sex_dist, get_age_dist, get_race_dist, sample_demographics
from get_dist.get_household_dist import sample_household_type, sample_relationship
from get_dist.get_marital_status_dist import sample_marital_status_by_gender
from get_dist.get_veteran_dist import sample_veteran_status
from get_dist.get_language_dist import sample_languages
from get_dist.get_edu_dist import sample_education_level
from get_dist.get_birth_dist import sample_birth_and_citizenship_multiple
from get_dist.get_employment_dist import sample_labor_force_and_employment
from get_dist.get_career_dist import generate_career

def generate_personas_for_state(state, num_samples=1000, data_root="/user/al4263/Simulate/Persona/data"):
    """Sample a table of synthetic personas for one state.

    Args:
        state: Object exposing ``abbr`` and ``name`` (the notebook passes
            ``us.states.CA``). ``abbr`` is lower-cased to locate the data files.
        num_samples: Number of personas (rows) to draw. Must be at least 1.
        data_root: Directory containing the ``DP05``, ``DP02``, ``s2301`` and
            ``s2401`` folders, each with a ``structured_data`` subfolder.

    Returns:
        pandas.DataFrame with one row per persona, upper-cased column names,
        and missing values replaced by the string "Not Applicable".

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    # An empty sample would fail later with an obscure error when the
    # tuple-valued columns are unpacked via zip(*...), so reject it up front.
    if num_samples < 1:
        raise ValueError(f"num_samples must be at least 1, got {num_samples}")

    # Load data
    abbr = state.abbr.lower()
    dp05 = f"{data_root}/DP05/structured_data/{abbr}_structured_data.json"
    dp02 = f"{data_root}/DP02/structured_data/{abbr}_dp02_structured.json"
    s2301 = f"{data_root}/s2301/structured_data/{abbr}_s2301_structured.json"
    s2401 = f"{data_root}/s2401/structured_data/{abbr}_s2401_structured.json"

    dp05_data = load_structured_data(dp05)
    dp02_data = load_structured_data(dp02)
    s2301_data = load_structured_data(s2301)
    s2401_data = load_structured_data(s2401)

    # Get distributions
    age_distributions = get_age_dist(dp05_data)
    sex_distributions = get_sex_dist(dp05_data)
    race_distributions = get_race_dist(dp05_data)

    # Sample demographics; the code below reads the 'Sex' and 'Age' columns
    # of the resulting frame, so each sample must provide those fields.
    samples = [sample_demographics(age_distributions, sex_distributions, race_distributions) for _ in range(num_samples)]

    # Create DataFrame
    df = pd.DataFrame(samples)

    # Add HOUSEHOLD_RELATIONSHIP (drawn independently of the row's contents)
    df['HOUSEHOLD_RELATIONSHIP'] = df.apply(lambda row: sample_relationship(dp02_data, dp05_data), axis=1)

    # Sample household type for primary householders only; other rows get None
    def sample_household_type_for_primary(row):
        if row['HOUSEHOLD_RELATIONSHIP'] == 'Primary Householder':
            return sample_household_type(dp02_data, row['Sex'])
        return None

    df['HOUSEHOLD_TYPE'] = df.apply(sample_household_type_for_primary, axis=1)

    # Sample additional attributes
    df['MARITAL_STATUS'] = df.apply(lambda row: sample_marital_status_by_gender(dp02_data, row['Sex']), axis=1)
    df['VETERAN_STATUS'] = df.apply(lambda row: sample_veteran_status(dp02_data), axis=1)
    df['LANGUAGE'], df['ENGLISH_PROFICIENCY'] = zip(*df.apply(lambda row: sample_languages(dp02_data), axis=1))
    df['EDUCATION'] = df.apply(lambda row: sample_education_level(dp02_data), axis=1)
    df['BIRTH_PLACE'], df['CITIZENSHIP'], df['BIRTH_DETAIL'] = zip(*df.apply(lambda row: sample_birth_and_citizenship_multiple(dp02_data), axis=1))

    # Sample employment status
    df['Labor Force Status'], df['Employment Status'] = zip(*df['Age'].apply(lambda x: sample_labor_force_and_employment(x, s2301_data)))

    # Upper-case every column name before generate_career sees the rows
    # (presumably it looks fields up by upper-case name -- confirm).
    df.columns = df.columns.str.upper()
    df['CAREER'] = df.apply(lambda row: generate_career(row, s2401_data), axis=1)

    # Add state information; these names are already upper-case, so no
    # second upper-casing pass is needed.
    df['STATE_NAME'] = state.name
    df['STATE_ABBR'] = state.abbr

    # Replace all nulls with the string "Not Applicable"
    return df.fillna("Not Applicable")

# Build the persona table for California. A later cell adds the
# INSURANCE_COVERAGE and INCOME_RANGE columns to this same `df`.
state = us.states.CA
df = generate_personas_for_state(state)

In [11]:
import numpy as np
import torch
from collections import namedtuple

# Pairs a list of category labels with the torch distribution that indexes into it.
CategoricalDistribution = namedtuple('CategoricalDistribution', ['categories', 'distribution'])

def create_insurance_distribution(data, employment_status):
    """Build the health-insurance distribution for one employment group.

    Args:
        data: Structured DP03 dictionary containing the
            'HEALTH INSURANCE COVERAGE' section.
        employment_status: 'Employed' or 'Unemployed'; any other value is
            treated as 'Not in labor force'.

    Returns:
        CategoricalDistribution over private insurance, public coverage and
        no coverage, built from that group's 'percentage' entries.
    """
    population = data['HEALTH INSURANCE COVERAGE']['Civilian noninstitutionalized population 19 to 64 years']

    # Only the two in-labor-force statuses have their own entry; everything
    # else falls through to the not-in-labor-force figures.
    if employment_status in ('Employed', 'Unemployed'):
        group = employment_status
    else:
        group = 'Not in labor force'

    insured = population['With health insurance coverage'][group]
    percentages = [
        insured['Private health insurance']['percentage'],
        insured['Public coverage']['percentage'],
        population['No health insurance coverage'][group]['percentage'],
    ]
    labels = ['Private health insurance', 'Public coverage', 'No health insurance coverage']

    # Percentages are scaled to fractions before being handed over as `probs`.
    weights = torch.tensor(percentages) / 100
    return CategoricalDistribution(
        categories=labels,
        distribution=torch.distributions.Categorical(probs=weights),
    )

def create_income_distribution(data):
    """Build the household-income bracket distribution.

    Args:
        data: Structured DP03 dictionary; each bracket under
            'INCOME AND BENEFITS' must provide a 'percentage' entry.

    Returns:
        CategoricalDistribution whose categories are the ten income brackets,
        lowest to highest.
    """
    brackets = [
        'Less than $10,000',
        '$10,000 to $14,999',
        '$15,000 to $24,999',
        '$25,000 to $34,999',
        '$35,000 to $49,999',
        '$50,000 to $74,999',
        '$75,000 to $99,999',
        '$100,000 to $149,999',
        '$150,000 to $199,999',
        '$200,000 or more',
    ]

    # Collect the bracket percentages in bracket order.
    shares = []
    for bracket in brackets:
        shares.append(data['INCOME AND BENEFITS'][bracket]['percentage'])

    # Percentages are scaled to fractions before being handed over as `probs`.
    weights = torch.tensor(shares) / 100
    return CategoricalDistribution(
        categories=brackets,
        distribution=torch.distributions.Categorical(probs=weights),
    )

def sample_insurance_and_income(row, data):
    """Draw an insurance category and an income bracket for one persona.

    Args:
        row: Mapping/Series with 'EMPLOYMENT STATUS' and 'LABOR FORCE STATUS'.
        data: Structured DP03 dictionary passed through to the distribution
            builders.

    Returns:
        Tuple ``(insurance, income)`` of category labels.
    """
    # NOTE(review): the insurance figures are keyed under the population
    # "19 to 64 years" but this is applied to every row -- confirm intent.
    employment_status = row['EMPLOYMENT STATUS']
    labor_force_status = row['LABOR FORCE STATUS']

    # Labor-force status takes precedence: anyone outside the labor force
    # uses the not-in-labor-force figures regardless of employment status.
    status_key = 'Not in labor force' if labor_force_status == 'Not in Labor Force' else employment_status

    insurance_dist = create_insurance_distribution(data, status_key)
    income_dist = create_income_distribution(data)

    # Insurance is drawn before income, one draw from each distribution.
    insurance_index = insurance_dist.distribution.sample()
    insurance = insurance_dist.categories[insurance_index]
    income_index = income_dist.distribution.sample()
    income = income_dist.categories[income_index]

    return insurance, income

# `df` is the persona table built by generate_personas_for_state above.
# `data` is the structured DP03 dictionary for the same state. It is loaded
# explicitly here so this cell does not depend on a variable left over from
# some other cell; the file is the JSON written by the DP03 structuring step.
dp03_path = os.path.join(
    "/user/al4263/Simulate/Persona/data/DP03/structured_data",
    f"{df['STATE_ABBR'].iloc[0].lower()}_dp03_structured.json",
)
with open(dp03_path) as dp03_file:
    data = json.load(dp03_file)

# Apply sampling to the DataFrame: one (insurance, income) pair per row,
# expanded into the two new columns.
df[['INSURANCE_COVERAGE', 'INCOME_RANGE']] = df.apply(lambda row: sample_insurance_and_income(row, data), axis=1, result_type='expand')

# Display the updated DataFrame
print(df[['AGE', 'SEX', 'RACE', 'LABOR FORCE STATUS', 'EMPLOYMENT STATUS', 'INSURANCE_COVERAGE', 'INCOME_RANGE']])

                   AGE     SEX             RACE  LABOR FORCE STATUS  \
0       45 to 54 years    Male         Hispanic      In Labor Force   
1    85 years and over    Male         Hispanic      In Labor Force   
2       25 to 34 years    Male         Hispanic  Not in Labor Force   
3       25 to 34 years  Female         Hispanic      In Labor Force   
4       45 to 54 years    Male            White      In Labor Force   
..                 ...     ...              ...                 ...   
995     35 to 44 years    Male            White      In Labor Force   
996     45 to 54 years  Female         Hispanic      In Labor Force   
997     35 to 44 years  Female  Some Other Race      In Labor Force   
998     35 to 44 years    Male         Hispanic  Not in Labor Force   
999     35 to 44 years    Male            White      In Labor Force   

    EMPLOYMENT STATUS            INSURANCE_COVERAGE          INCOME_RANGE  
0            Employed               Public coverage  $150,000 to $199,9

Processing Alabama...
Saved al_s2301.csv
Processing Alaska...
Saved ak_s2301.csv
Processing Arizona...
Saved az_s2301.csv
Processing Arkansas...
Saved ar_s2301.csv
Processing California...
Saved ca_s2301.csv
Processing Colorado...
Saved co_s2301.csv
Processing Connecticut...
Saved ct_s2301.csv
Processing Delaware...
Saved de_s2301.csv
Processing Florida...
Saved fl_s2301.csv
Processing Georgia...
Saved ga_s2301.csv
Processing Hawaii...
Saved hi_s2301.csv
Processing Idaho...
Saved id_s2301.csv
Processing Illinois...
Saved il_s2301.csv
Processing Indiana...
Saved in_s2301.csv
Processing Iowa...
Saved ia_s2301.csv
Processing Kansas...
Saved ks_s2301.csv
Processing Kentucky...
Saved ky_s2301.csv
Processing Louisiana...
Saved la_s2301.csv
Processing Maine...
Saved me_s2301.csv
Processing Maryland...
Saved md_s2301.csv
Processing Massachusetts...
Saved ma_s2301.csv
Processing Michigan...
Saved mi_s2301.csv
Processing Minnesota...
Saved mn_s2301.csv
Processing Mississippi...
Saved ms_s2301.cs

## Age

In [141]:
import torch
from collections import namedtuple, Counter

# Pairs a list of category labels with the torch distribution that indexes into it.
# NOTE(review): the same namedtuple is also defined in an earlier cell.
CategoricalDistribution = namedtuple('CategoricalDistribution', ['categories', 'distribution'])

def create_birth_place_distribution(birth_data):
    """Build the US-born vs. foreign-born distribution.

    Args:
        birth_data: Dictionary with 'Native' and 'Foreign born' entries, each
            providing a 'percentage'.

    Returns:
        CategoricalDistribution over 'US Born' and 'Foreign Born'.
    """
    native_pct = birth_data['Native']['percentage']
    foreign_pct = birth_data['Foreign born']['percentage']

    # Percentages are scaled to fractions before being handed over as `probs`.
    weights = torch.tensor([native_pct, foreign_pct]) / 100
    return CategoricalDistribution(
        categories=['US Born', 'Foreign Born'],
        distribution=torch.distributions.Categorical(probs=weights),
    )

def create_us_born_distribution(birth_data):
    """Build the birth-location distribution for the US-born population.

    Args:
        birth_data: Dictionary whose 'Native' entry holds a total 'count' and
            a 'count' for each of the three sub-groups.

    Returns:
        CategoricalDistribution over state of residence, different state, and
        US territories / abroad to American parents, weighted by each
        sub-group's share of the native count.
    """
    native = birth_data['Native']
    native_total = native['count']

    # Source keys as they appear in the data; the last one carries trailing
    # spaces, which must be kept for the lookup to succeed.
    source_keys = [
        'State of residence',
        'Different state',
        'Born in Puerto Rico, U.S. Island areas, or born abroad to American parent(s)   ',
    ]
    fractions = [native[key]['count'] / native_total for key in source_keys]

    return CategoricalDistribution(
        categories=['State of residence', 'Different state', 'US territories or abroad to American parents'],
        distribution=torch.distributions.Categorical(probs=torch.tensor(fractions)),
    )

def create_foreign_born_distribution(birth_data):
    """Build the citizenship distribution for the foreign-born population.

    Args:
        birth_data: Dictionary whose 'Foreign born' entry provides a
            'percentage' for 'US Citizen' and for 'Not a U.S. citizen'.

    Returns:
        CategoricalDistribution over 'US Citizen' and 'Not a U.S. citizen'.
    """
    statuses = ['US Citizen', 'Not a U.S. citizen']
    foreign = birth_data['Foreign born']

    citizen_pct = foreign['US Citizen']['percentage']
    non_citizen_pct = foreign['Not a U.S. citizen']['percentage']

    # Percentages are scaled to fractions before being handed over as `probs`.
    weights = torch.tensor([citizen_pct, non_citizen_pct]) / 100
    return CategoricalDistribution(
        categories=statuses,
        distribution=torch.distributions.Categorical(probs=weights),
    )

def create_foreign_born_region_distribution(birth_data):
    """Build the region-of-birth distribution for the foreign-born population.

    Args:
        birth_data: Dictionary whose 'Foreign born' entry provides a
            'percentage' for each of the six world regions.

    Returns:
        CategoricalDistribution whose categories are the six region names.
    """
    regions = ['Europe', 'Asia', 'Africa', 'Oceania', 'Latin America', 'Northern America']
    foreign = birth_data['Foreign born']

    # Collect the regional percentages in the same order as `regions`.
    region_pcts = []
    for region in regions:
        region_pcts.append(foreign[region]['percentage'])

    # Percentages are scaled to fractions before being handed over as `probs`.
    weights = torch.tensor(region_pcts) / 100
    return CategoricalDistribution(
        categories=regions,
        distribution=torch.distributions.Categorical(probs=weights),
    )

def sample_birth_and_citizenship(birth_data):
    """Draw one birth-place / citizenship record.

    Only the distributions the chosen branch needs are constructed, so a
    US-born draw never touches the foreign-born figures and vice versa.
    The random draws happen in the same order as before: birth place first,
    then either the US birth location, or citizenship followed by region.

    Args:
        birth_data: Dictionary with 'Native' and 'Foreign born' entries, as
            consumed by the ``create_*_distribution`` helpers.

    Returns:
        dict with keys 'Birth Place', 'Specific', 'Citizenship' for a US-born
        draw, or 'Birth Place', 'Citizenship', 'Region' for a foreign-born one.
    """
    birth_place = sample_distribution(create_birth_place_distribution(birth_data))

    if birth_place == 'US Born':
        us_birth_place = sample_distribution(create_us_born_distribution(birth_data))
        # Everyone sampled as US-born is recorded as a US citizen.
        return {'Birth Place': 'US Born', 'Specific': us_birth_place, 'Citizenship': 'US Citizen'}

    # Foreign-born: citizenship and region are drawn independently of each other.
    citizenship = sample_distribution(create_foreign_born_distribution(birth_data))
    region = sample_distribution(create_foreign_born_region_distribution(birth_data))
    return {'Birth Place': 'Foreign Born', 'Citizenship': citizenship, 'Region': region}

def sample_distribution(distribution):
    """Draw one category label from a CategoricalDistribution.

    Args:
        distribution: Object with a ``categories`` sequence and a
            ``distribution`` whose ``sample()`` yields an index into it.

    Returns:
        The category label at the sampled index.
    """
    drawn_index = distribution.distribution.sample()
    return distribution.categories[drawn_index]

def sample_birth_and_citizenship_multiple(birth_data, num_samples=10000):
    """Draw many birth/citizenship profiles and tally identical ones.

    Args:
        birth_data: Input forwarded to ``sample_birth_and_citizenship``.
        num_samples: Number of profiles to draw.

    Returns:
        Counter keyed by each profile's ``(key, value)`` pairs sorted into a
        tuple (dicts themselves are not hashable), mapped to how often that
        profile was drawn.
    """
    drawn_profiles = []
    for _ in range(num_samples):
        drawn_profiles.append(sample_birth_and_citizenship(birth_data))

    tally = Counter()
    for profile in drawn_profiles:
        tally[tuple(sorted(profile.items()))] += 1
    return tally

# Usage example:
# Hard-coded sample input for the birth-place / citizenship samplers.
# Layout: 'Total' plus two groups ('Native', 'Foreign born'); each group has
# its own 'count' / 'percentage' and nested {'count', 'percentage'} sub-entries.
# The two group counts add up to 'Total' (15053220 + 4517996 = 19571216).
birth_data = {
    'Total': 19571216,
    'Native': {'count': 15053220,
     'percentage': 76.9,
     # The three sub-entry percentages below sum to 76.9 (the 'Native' share),
     # not to 100.
     'State of residence': {'count': 12146242, 'percentage': 62.1},
     'Different state': {'count': 2429014, 'percentage': 12.4},
     # NOTE(review): this key ends with three trailing spaces; presumably it is
     # copied verbatim from the source label and looked up as-is by
     # create_us_born_distribution -- confirm before trimming.
     'Born in Puerto Rico, U.S. Island areas, or born abroad to American parent(s)   ': {'count': 477964,
      'percentage': 2.4}},
    'Foreign born': {'count': 4517996,
     'percentage': 23.1,
     # Citizenship split: the two percentages sum to 100.
     'US Citizen': {'count': 2687784, 'percentage': 59.5},
     'Not a U.S. citizen': {'count': 1830212, 'percentage': 40.5},
     # Region-of-birth split: the six percentages sum to 100.
     'Europe': {'count': 671383, 'percentage': 14.9},
     'Asia': {'count': 1330190, 'percentage': 29.4},
     'Africa': {'count': 223031, 'percentage': 4.9},
     'Oceania': {'count': 17170, 'percentage': 0.4},
     'Latin America': {'count': 2218060, 'percentage': 49.1},
     'Northern America': {'count': 58109, 'percentage': 1.3}}
}

# Draw 10000 birth-place / citizenship profiles and tally each distinct one
sample_counts = sample_birth_and_citizenship_multiple(birth_data, num_samples=10000)

# Raw tallies: one line per distinct profile
for profile_items, occurrences in sample_counts.items():
    print(f"{dict(profile_items)}: {occurrences}")

# Same tallies expressed as a share of all draws
total_samples = sum(sample_counts.values())
print("\nPercentages:")
for profile_items, occurrences in sample_counts.items():
    share = occurrences / total_samples * 100
    print(f"{dict(profile_items)}: {share:.1f}%")

{'Birth Place': 'Foreign Born', 'Citizenship': 'US Citizen', 'Region': 'Asia'}: 407
{'Birth Place': 'US Born', 'Citizenship': 'US Citizen', 'Specific': 'State of residence'}: 6204
{'Birth Place': 'US Born', 'Citizenship': 'US Citizen', 'Specific': 'Different state'}: 1283
{'Birth Place': 'Foreign Born', 'Citizenship': 'Not a U.S. citizen', 'Region': 'Latin America'}: 444
{'Birth Place': 'Foreign Born', 'Citizenship': 'US Citizen', 'Region': 'Europe'}: 205
{'Birth Place': 'Foreign Born', 'Citizenship': 'Not a U.S. citizen', 'Region': 'Asia'}: 272
{'Birth Place': 'Foreign Born', 'Citizenship': 'US Citizen', 'Region': 'Africa'}: 67
{'Birth Place': 'Foreign Born', 'Citizenship': 'US Citizen', 'Region': 'Latin America'}: 664
{'Birth Place': 'Foreign Born', 'Citizenship': 'Not a U.S. citizen', 'Region': 'Africa'}: 44
{'Birth Place': 'Foreign Born', 'Citizenship': 'Not a U.S. citizen', 'Region': 'Europe'}: 142
{'Birth Place': 'US Born', 'Citizenship': 'US Citizen', 'Specific': 'US territories

## Race

In [None]:
# Top-level race breakdown, read from the 'RACE' section of `data`
race_data = data['RACE']
main_race_categories = [
    'White',
    'Black or African American',
    'American Indian and Alaska Native',
    'Asian',
    'Native Hawaiian and Other Pacific Islander',
    'Some Other Race',
    'Two or More Races',
]

# Parallel lists: index i in each refers to the same race category
race_counts, race_percentages, race_categories = [], [], []

for race_name in main_race_categories:
    if race_name not in race_data:
        # Missing categories are reported and left out of the distribution
        print(f"Category {race_name} not found in data")
        continue
    race_entry = race_data[race_name]
    entry_count, entry_percentage = race_entry['count'], race_entry['percentage']
    race_counts.append(entry_count)
    race_percentages.append(entry_percentage)
    race_categories.append(race_name)

# Rescale the percentages so that they sum to 1
total_race_percentage = sum(race_percentages)
race_probabilities = [share / total_race_percentage for share in race_percentages]
race_probs_tensor = torch.tensor(race_probabilities)

# Categorical distribution over the race categories that were found
race_distribution = torch.distributions.Categorical(probs=race_probs_tensor)

print("\nRace categories:", race_categories)
print("Race probabilities:", race_probabilities)
print("Race distribution:", race_distribution.probs)

# Process subcategories for races with ethnicities
# For 'American Indian and Alaska Native'
# 'Ethnicities' is iterated as {ethnicity name: {'percentage': ...}}.
ai_an_ethnicities = race_data['American Indian and Alaska Native']['Ethnicities']

# Parallel label / percentage lists in dict order. Comprehensions keep the
# iteration variables out of the notebook namespace, and only 'percentage' is
# read because the per-ethnicity count is not used here.
ai_an_ethnicities_list = list(ai_an_ethnicities)
ai_an_percentages = [entry['percentage'] for entry in ai_an_ethnicities.values()]

# Normalize percentages so they sum to 1
total_ai_an_percentage = sum(ai_an_percentages)
ai_an_probabilities = [p / total_ai_an_percentage for p in ai_an_percentages]
ai_an_probs_tensor = torch.tensor(ai_an_probabilities)

# Create Categorical distribution; index i corresponds to ai_an_ethnicities_list[i]
ai_an_distribution = torch.distributions.Categorical(probs=ai_an_probs_tensor)

print("\nAmerican Indian and Alaska Native Ethnicities:", ai_an_ethnicities_list)
print("Probabilities:", ai_an_probabilities)
print("Distribution:", ai_an_distribution.probs)

# For 'Asian'
# 'Ethnicities' is iterated as {ethnicity name: {'percentage': ...}}.
asian_ethnicities = race_data['Asian']['Ethnicities']

# Parallel label / percentage lists in dict order. Comprehensions keep the
# iteration variables out of the notebook namespace, and only 'percentage' is
# read because the per-ethnicity count is not used here.
asian_ethnicities_list = list(asian_ethnicities)
asian_percentages = [entry['percentage'] for entry in asian_ethnicities.values()]

# Normalize percentages so they sum to 1
total_asian_percentage = sum(asian_percentages)
asian_probabilities = [p / total_asian_percentage for p in asian_percentages]
asian_probs_tensor = torch.tensor(asian_probabilities)

# Create Categorical distribution; index i corresponds to asian_ethnicities_list[i]
asian_distribution = torch.distributions.Categorical(probs=asian_probs_tensor)

print("\nAsian Ethnicities:", asian_ethnicities_list)
print("Probabilities:", asian_probabilities)
print("Distribution:", asian_distribution.probs)

# For 'Native Hawaiian and Other Pacific Islander'
# 'Ethnicities' is iterated as {ethnicity name: {'percentage': ...}}.
nhopi_ethnicities = race_data['Native Hawaiian and Other Pacific Islander']['Ethnicities']

# Parallel label / percentage lists in dict order. Comprehensions keep the
# iteration variables out of the notebook namespace, and only 'percentage' is
# read because the per-ethnicity count is not used here.
nhopi_ethnicities_list = list(nhopi_ethnicities)
nhopi_percentages = [entry['percentage'] for entry in nhopi_ethnicities.values()]

# Normalize percentages so they sum to 1
total_nhopi_percentage = sum(nhopi_percentages)
nhopi_probabilities = [p / total_nhopi_percentage for p in nhopi_percentages]
nhopi_probs_tensor = torch.tensor(nhopi_probabilities)

# Create Categorical distribution; index i corresponds to nhopi_ethnicities_list[i]
nhopi_distribution = torch.distributions.Categorical(probs=nhopi_probs_tensor)

print("\nNative Hawaiian and Other Pacific Islander Ethnicities:", nhopi_ethnicities_list)
print("Probabilities:", nhopi_probabilities)
print("Distribution:", nhopi_distribution.probs)

# For 'Two or More Races'
# 'Combinations' is iterated as {combination name: {'percentage': ...}}.
two_or_more_races = race_data['Two or More Races']['Combinations']

# Parallel label / percentage lists in dict order. Comprehensions keep the
# iteration variables out of the notebook namespace, and only 'percentage' is
# read because the per-combination count is not used here.
combinations_list = list(two_or_more_races)
combinations_percentages = [entry['percentage'] for entry in two_or_more_races.values()]

# Normalize percentages so they sum to 1
total_combinations_percentage = sum(combinations_percentages)
combinations_probabilities = [p / total_combinations_percentage for p in combinations_percentages]
combinations_probs_tensor = torch.tensor(combinations_probabilities)

# Create Categorical distribution; index i corresponds to combinations_list[i]
combinations_distribution = torch.distributions.Categorical(probs=combinations_probs_tensor)

print("\nTwo or More Races Combinations:", combinations_list)
print("Probabilities:", combinations_probabilities)
print("Distribution:", combinations_distribution.probs)

# Hispanic / not-Hispanic split, read from 'HISPANIC OR LATINO AND RACE'
hispanic_data = data['HISPANIC OR LATINO AND RACE']
hispanic_categories = ['Hispanic or Latino (of any race)', 'Not Hispanic or Latino']

# Keep only the categories present in the data (absent ones are skipped
# silently), then pull count and percentage for each in the same order
hispanic_categories_list = [label for label in hispanic_categories if label in hispanic_data]
hispanic_counts = [hispanic_data[label]['count'] for label in hispanic_categories_list]
hispanic_percentages = [hispanic_data[label]['percentage'] for label in hispanic_categories_list]

# Rescale the percentages so that they sum to 1
total_hispanic_percentage = sum(hispanic_percentages)
hispanic_probabilities = [share / total_hispanic_percentage for share in hispanic_percentages]
hispanic_probs_tensor = torch.tensor(hispanic_probabilities)

# Categorical distribution; index i corresponds to hispanic_categories_list[i]
hispanic_distribution = torch.distributions.Categorical(probs=hispanic_probs_tensor)

print("\nHispanic or Latino Categories:", hispanic_categories_list)
print("Probabilities:", hispanic_probabilities)
print("Distribution:", hispanic_distribution.probs)

# Process 'Hispanic or Latino' subcategories
hispanic_subcategories = hispanic_data['Hispanic or Latino']

# The group's own 'count' / 'percentage' entries are summary values, not
# subcategories, so they are excluded. Comprehensions keep the iteration
# variables out of the notebook namespace, and only 'percentage' is read
# because the per-subcategory count is not used here.
hispanic_subcategories_list = [
    name for name in hispanic_subcategories
    if name not in ('count', 'percentage')
]
hispanic_sub_percentages = [
    hispanic_subcategories[name]['percentage']
    for name in hispanic_subcategories_list
]

# Normalize percentages so they sum to 1
total_hispanic_sub_percentage = sum(hispanic_sub_percentages)
hispanic_sub_probabilities = [p / total_hispanic_sub_percentage for p in hispanic_sub_percentages]
hispanic_sub_probs_tensor = torch.tensor(hispanic_sub_probabilities)

# Create Categorical distribution; index i corresponds to hispanic_subcategories_list[i]
hispanic_sub_distribution = torch.distributions.Categorical(probs=hispanic_sub_probs_tensor)

print("\nHispanic or Latino Subcategories:", hispanic_subcategories_list)
print("Probabilities:", hispanic_sub_probabilities)
print("Distribution:", hispanic_sub_distribution.probs)


Race categories: ['White', 'Black or African American', 'American Indian and Alaska Native', 'Asian', 'Native Hawaiian and Other Pacific Islander', 'Some Other Race', 'Two or More Races']
Race probabilities: [0.5509550955095509, 0.14311431143114312, 0.0067006700670067, 0.0912091209120912, 0.0004000400040004, 0.1028102810281028, 0.10481048104810481]
Race distribution: tensor([5.5096e-01, 1.4311e-01, 6.7007e-03, 9.1209e-02, 4.0004e-04, 1.0281e-01,
        1.0481e-01])

American Indian and Alaska Native Ethnicities: ['Aztec', 'Blackfeet Tribe of the Blackfeet Indian Reservation of Montana', 'Maya', 'Native Village of Barrow Inupiat Traditional Government', 'Navajo Nation', 'Nome Eskimo Community', 'Other American Indian and Alaska Native']
Probabilities: [0.19468053194680532, 0.013298670132986704, 0.10498950104989502, 0.0, 0.006399360063993601, 0.0, 0.6806319368063194]
Distribution: tensor([0.1947, 0.0133, 0.1050, 0.0000, 0.0064, 0.0000, 0.6806])

Asian Ethnicities: ['Asian Indian', 'Chi

In [None]:
import pandas as pd

def sample_demographics():
    """Draw one synthetic demographic profile from the notebook-level distributions.

    Hispanic/Latino status is drawn first. A Hispanic draw is reported with
    race 'Hispanic' and a sampled Hispanic subtype as ethnicity. Otherwise a
    race is drawn and, for the four races that have a detail distribution, an
    ethnicity (or race combination) is drawn too. Age group and sex are drawn
    last from their own distributions.

    NOTE(review): for non-Hispanic draws the race comes from
    ``race_distribution``; whether those shares already exclude Hispanic
    respondents cannot be told from this function -- confirm against the data.

    Returns:
        dict with keys 'Race', 'Ethnicity', 'Age Group' and 'Sex'. 'Ethnicity'
        falls back to the race label when no finer-grained value was drawn.
    """
    def _draw(labels, dist):
        # Map a sampled category index back to its label.
        return labels[dist.sample().item()]

    ethnicity = None

    if _draw(hispanic_categories_list, hispanic_distribution) == 'Hispanic or Latino (of any race)':
        ethnicity = _draw(hispanic_subcategories_list, hispanic_sub_distribution)
        race = 'Hispanic'
    else:
        race = _draw(race_categories, race_distribution)

        # Only these four races carry a second-level distribution.
        if race == 'American Indian and Alaska Native':
            ethnicity = _draw(ai_an_ethnicities_list, ai_an_distribution)
        elif race == 'Asian':
            ethnicity = _draw(asian_ethnicities_list, asian_distribution)
        elif race == 'Native Hawaiian and Other Pacific Islander':
            ethnicity = _draw(nhopi_ethnicities_list, nhopi_distribution)
        elif race == 'Two or More Races':
            ethnicity = _draw(combinations_list, combinations_distribution)

    age_group = _draw(age_categories, age_distribution)
    sex = _draw(sex_categories, sex_distribution)

    profile = {
        'Race': race,
        'Ethnicity': ethnicity or race,
        'Age Group': age_group,
        'Sex': sex,
    }
    return profile

# Sample 1000 demographic profiles
num_samples = 1000
samples = [sample_demographics() for _ in range(num_samples)]

# Create a DataFrame
# One row per sampled profile; the columns are the keys of the dicts returned
# by sample_demographics()
df = pd.DataFrame(samples)

# Display the DataFrame
print(df)

                Race        Ethnicity       Age Group     Sex
0              Asian     Asian Indian  45 to 54 years    Male
1              White            White  35 to 44 years  Female
2           Hispanic            Cuban  35 to 44 years  Female
3              White            White  35 to 44 years    Male
4              White            White  35 to 44 years    Male
..               ...              ...             ...     ...
995  Some Other Race  Some Other Race  65 to 74 years    Male
996            White            White  65 to 74 years    Male
997            White            White  35 to 44 years  Female
998            White            White  60 to 64 years  Female
999            White            White  35 to 44 years  Female

[1000 rows x 4 columns]


In [None]:
# Empirical category frequencies in the sampled profiles, one Series per column
sampled_probabilities = {
    column: df[column].value_counts(normalize=True)
    for column in ('Race', 'Age Group', 'Sex')
}

# Side-by-side print of the source probabilities and the sampled frequencies
comparison_table = (
    ('Race', race_categories, race_probabilities),
    ('Age Group', age_categories, age_probabilities),
    ('Sex', sex_categories, sex_probabilities),
)
for column, labels, source_probs in comparison_table:
    print(f"{column} probabilities:")
    print("Original:", dict(zip(labels, source_probs)))
    print("Sampled:", sampled_probabilities[column].to_dict())
    print()

# Share of sampled profiles whose race was recorded as 'Hispanic'
is_hispanic = df['Race'] == 'Hispanic'
hispanic_proportion = is_hispanic.mean()
print("Hispanic proportion:")
print("Original:", hispanic_probabilities[0])
print("Sampled:", hispanic_proportion)
print()

# Subtype frequencies among the Hispanic profiles (skipped when none were drawn)
if is_hispanic.any():
    hispanic_subtypes = df.loc[is_hispanic, 'Ethnicity'].value_counts(normalize=True)
    print("Hispanic subtypes probabilities:")
    print("Original:", dict(zip(hispanic_subcategories_list, hispanic_sub_probabilities)))
    print("Sampled:", hispanic_subtypes.to_dict())

Race probabilities:
Original: {'White': 0.5509550955095509, 'Black or African American': 0.14311431143114312, 'American Indian and Alaska Native': 0.0067006700670067, 'Asian': 0.0912091209120912, 'Native Hawaiian and Other Pacific Islander': 0.0004000400040004, 'Some Other Race': 0.1028102810281028, 'Two or More Races': 0.10481048104810481}
Sampled: {'White': 0.479, 'Hispanic': 0.183, 'Black or African American': 0.107, 'Two or More Races': 0.08, 'Some Other Race': 0.08, 'Asian': 0.066, 'American Indian and Alaska Native': 0.004, 'Native Hawaiian and Other Pacific Islander': 0.001}

Age Group probabilities:
Original: {'18 to 19 years': 0.025699999999999997, '20 to 24 years': 0.0789, '25 to 34 years': 0.175, '35 to 44 years': 0.1634, '45 to 54 years': 0.1515, '55 to 59 years': 0.08199999999999999, '60 to 64 years': 0.0842, '65 to 74 years': 0.1334, '75 to 84 years': 0.0726, '85 years and over': 0.0268}
Sampled: {'35 to 44 years': 0.164, '25 to 34 years': 0.162, '45 to 54 years': 0.145, 