In [1]:
import requests
import pandas as pd

data_url = "https://api.census.gov/data/2023/acs/acs1/profile?get=group(DP05)&ucgid=0400000US36"
response = requests.get(data_url)
raw_data = response.json()

metadata_url = "https://api.census.gov/data/2023/acs/acs1/profile/variables.json"
metadata_response = requests.get(metadata_url)
variables = metadata_response.json()['variables']

dp05_mapping = {}
for var_code, var_info in variables.items():
    if var_code.startswith('DP05'):  # Only DP05 group variables
        dp05_mapping[var_code] = var_info['label']

variable_codes = raw_data[0]
values = raw_data[1]

data_df = pd.DataFrame({
    "Variable Code": variable_codes,
    "Value": values
})

data_df['Label'] = data_df['Variable Code'].map(dp05_mapping)

data_df = data_df[['Label', 'Value']].dropna()
data_df.to_csv('census_data_dp05.csv', index=False)

In [2]:
data_df = pd.read_csv('census_data_dp05.csv')

In [3]:
import pandas as pd

def create_structured_data(df):
    def get_value(label):
        return int(df[df["Label"] == label]["Value"].values[0])

    def calculate_percentage(part, whole):
        return round((part / whole) * 100, 2)

    total_population = get_value("Estimate!!SEX AND AGE!!Total population")
    
    under_20 = (
        get_value("Estimate!!SEX AND AGE!!Total population!!Under 5 years") +
        get_value("Estimate!!SEX AND AGE!!Total population!!5 to 9 years") +
        get_value("Estimate!!SEX AND AGE!!Total population!!10 to 14 years") +
        get_value("Estimate!!SEX AND AGE!!Total population!!15 to 19 years")
    )
    under_18 = total_population - get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over")
    age_18_19 = under_20 - under_18
    
    result = {
        "SEX AND AGE": {
            "Total population": total_population,
            "Age Groups": {
                "18 to 19 years": {"count": age_18_19, "percentage": calculate_percentage(age_18_19, total_population)},
                "20 to 24 years": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!20 to 24 years"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!20 to 24 years"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
                "25 to 34 years": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!25 to 34 years"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!25 to 34 years"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
                "35 to 44 years": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!35 to 44 years"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!35 to 44 years"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
                "45 to 54 years": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!45 to 54 years"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!45 to 54 years"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
                "55 to 59 years": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!55 to 59 years"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!55 to 59 years"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
                "60 to 64 years": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!60 to 64 years"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!60 to 64 years"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
                "65 to 74 years": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!65 to 74 years"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!65 to 74 years"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
                "75 to 84 years": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!75 to 84 years"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!75 to 84 years"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
                "85 years and over": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!85 years and over"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!85 years and over"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))}
            },
            "18 years and over": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
            "18 years and over Male": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over!!Male"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over!!Male"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
            "18 years and over Female": {"count": get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over!!Female"), "percentage": calculate_percentage(get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over!!Female"), get_value("Estimate!!SEX AND AGE!!Total population!!18 years and over"))},
            "Median age (years)": float(df[df["Label"] == "Estimate!!SEX AND AGE!!Total population!!Median age (years)"]["Value"].values[0])
        },
        "RACE": {
            "Total population": total_population,
            "Two or More Races": {
                "count": get_value("Estimate!!RACE!!Total population!!Two or More Races"),
                "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!Two or More Races"), total_population),
                "Combinations": {
                    "White and Black or African American": {"count": get_value("Estimate!!RACE!!Total population!!Two or More Races!!White and Black or African American"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!Two or More Races!!White and Black or African American"), get_value("Estimate!!RACE!!Total population!!Two or More Races"))},
                    "White and American Indian and Alaska Native": {"count": get_value("Estimate!!RACE!!Total population!!Two or More Races!!White and American Indian and Alaska Native"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!Two or More Races!!White and American Indian and Alaska Native"), get_value("Estimate!!RACE!!Total population!!Two or More Races"))},
                    "White and Asian": {"count": get_value("Estimate!!RACE!!Total population!!Two or More Races!!White and Asian"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!Two or More Races!!White and Asian"), get_value("Estimate!!RACE!!Total population!!Two or More Races"))},
                    "White and Some Other Race": {"count": get_value("Estimate!!RACE!!Total population!!Two or More Races!!White and Some Other Race"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!Two or More Races!!White and Some Other Race"), get_value("Estimate!!RACE!!Total population!!Two or More Races"))},
                    "Black or African American and American Indian and Alaska Native": {"count": get_value("Estimate!!RACE!!Total population!!Two or More Races!!Black or African American and American Indian and Alaska Native"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!Two or More Races!!Black or African American and American Indian and Alaska Native"), get_value("Estimate!!RACE!!Total population!!Two or More Races"))},
                    "Black or African American and Some Other Race": {"count": get_value("Estimate!!RACE!!Total population!!Two or More Races!!Black or African American and Some Other Race"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!Two or More Races!!Black or African American and Some Other Race"), get_value("Estimate!!RACE!!Total population!!Two or More Races"))}
                }
            },
            "White": {"count": get_value("Estimate!!RACE!!Total population!!One race!!White"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!White"), total_population)},
            "Black or African American": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Black or African American"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Black or African American"), total_population)},
            "American Indian and Alaska Native": {
                "count": get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"),
                "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"), total_population),
                "Ethnicities": {
                    "Aztec": {"count": get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Aztec"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Aztec"), get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"))},
                    "Blackfeet Tribe of the Blackfeet Indian Reservation of Montana": {"count": get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Blackfeet Tribe of the Blackfeet Indian Reservation of Montana"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Blackfeet Tribe of the Blackfeet Indian Reservation of Montana"), get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"))},
                    "Maya": {"count": get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Maya"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Maya"), get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"))},
                    "Native Village of Barrow Inupiat Traditional Government": {"count": get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Native Village of Barrow Inupiat Traditional Government"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Native Village of Barrow Inupiat Traditional Government"), get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"))},
                    "Navajo Nation": {"count": get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Navajo Nation"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Navajo Nation"), get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"))},
                    "Nome Eskimo Community": {"count": get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Nome Eskimo Community"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Nome Eskimo Community"), get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"))},
                    "Other American Indian and Alaska Native": {"count": get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Other American Indian and Alaska Native"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native!!Other American Indian and Alaska Native"), get_value("Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native"))}
                }
            },
            "Asian": {
                "count": get_value("Estimate!!RACE!!Total population!!One race!!Asian"),
                "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Asian"), total_population),
                "Ethnicities": {
                    "Asian Indian": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Asian Indian"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Asian Indian"), get_value("Estimate!!RACE!!Total population!!One race!!Asian"))},
                    "Chinese": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Chinese"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Chinese"), get_value("Estimate!!RACE!!Total population!!One race!!Asian"))},
                    "Filipino": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Filipino"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Filipino"), get_value("Estimate!!RACE!!Total population!!One race!!Asian"))},
                    "Japanese": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Japanese"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Japanese"), get_value("Estimate!!RACE!!Total population!!One race!!Asian"))},
                    "Korean": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Korean"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Korean"), get_value("Estimate!!RACE!!Total population!!One race!!Asian"))},
                    "Vietnamese": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Vietnamese"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Vietnamese"), get_value("Estimate!!RACE!!Total population!!One race!!Asian"))},
                    "Other Asian": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Other Asian"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Asian!!Other Asian"), get_value("Estimate!!RACE!!Total population!!One race!!Asian"))}
                }
            },
            "Native Hawaiian and Other Pacific Islander": {
                "count": get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander"),
                "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander"), total_population),
                "Ethnicities": {
                    "Native Hawaiian": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander!!Native Hawaiian"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander!!Native Hawaiian"), get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander"))},
                    "Chamorro": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander!!Chamorro"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander!!Chamorro"), get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander"))},
                    "Samoan": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander!!Samoan"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander!!Samoan"), get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander"))},
                    "Other Native Hawaiian and Other Pacific Islander": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander!!Other Native Hawaiian and Other Pacific Islander"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander!!Other Native Hawaiian and Other Pacific Islander"), get_value("Estimate!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander"))}
                }
            },
            "Some Other Race": {"count": get_value("Estimate!!RACE!!Total population!!One race!!Some Other Race"), "percentage": calculate_percentage(get_value("Estimate!!RACE!!Total population!!One race!!Some Other Race"), total_population)}
        },
        "HISPANIC OR LATINO AND RACE": {
            "Total population": total_population,
            "Hispanic or Latino (of any race)": {"count": get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)"), "percentage": calculate_percentage(get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)"), total_population)},
            "Not Hispanic or Latino": {"count": get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino"), "percentage": calculate_percentage(get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino"), total_population)},
            "Hispanic or Latino": {
                "Mexican": {"count": get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Mexican"), "percentage": calculate_percentage(get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Mexican"), get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)"))},
                "Puerto Rican": {"count": get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Puerto Rican"), "percentage": calculate_percentage(get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Puerto Rican"), get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)"))},
                "Cuban": {"count": get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Cuban"), "percentage": calculate_percentage(get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Cuban"), get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)"))},
                "Other Hispanic or Latino": {"count": get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Other Hispanic or Latino"), "percentage": calculate_percentage(get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Other Hispanic or Latino"), get_value("Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)"))}
            }
        },
    }
    return result

# Create the structured data
data = create_structured_data(data_df)

## Sex

In [4]:
sex_data = data['SEX AND AGE']

male_percentage = sex_data['18 years and over Male']['percentage']
female_percentage = sex_data['18 years and over Female']['percentage']

# Create sex probabilities
sex_probabilities = [male_percentage / 100.0, female_percentage / 100.0]
sex_categories = ['Male', 'Female']
sex_probs_tensor = torch.tensor(sex_probabilities)

# Create Categorical distribution for Sex
sex_distribution = torch.distributions.Categorical(probs=sex_probs_tensor)

print("Sex categories:", sex_categories)
print("Sex probabilities:", sex_probabilities)
print("Sex distribution:", sex_distribution.probs)

NameError: name 'torch' is not defined

## Age

In [None]:
age_data = data['SEX AND AGE']['Age Groups']
age_categories = list(age_data.keys())
age_probabilities = [age_data[category]['percentage'] / 100.0 for category in age_categories]
age_probs_tensor = torch.tensor(age_probabilities)

# Create Categorical distribution for Age
age_distribution = torch.distributions.Categorical(probs=age_probs_tensor)
print("Age categories:", age_categories)
print("Age probabilities:", age_probabilities, "sum:", sum(age_probabilities))
print("Age distribution:", age_distribution.probs)

Age categories: ['18 to 19 years', '20 to 24 years', '25 to 34 years', '35 to 44 years', '45 to 54 years', '55 to 59 years', '60 to 64 years', '65 to 74 years', '75 to 84 years', '85 years and over']
Age probabilities: [0.025699999999999997, 0.0789, 0.175, 0.1634, 0.1515, 0.08199999999999999, 0.0842, 0.1334, 0.0726, 0.0268] sum: 0.9934999999999999
Age distribution: tensor([0.0259, 0.0794, 0.1761, 0.1645, 0.1525, 0.0825, 0.0848, 0.1343, 0.0731,
        0.0270])


## Race

In [None]:
race_data = data['RACE']
main_race_categories = ['White', 'Black or African American', 'American Indian and Alaska Native', 'Asian', 'Native Hawaiian and Other Pacific Islander', 'Some Other Race', 'Two or More Races']

race_counts = []
race_percentages = []
race_categories = []

for category in main_race_categories:
    if category in race_data:
        count = race_data[category]['count']
        percentage = race_data[category]['percentage']
        race_counts.append(count)
        race_percentages.append(percentage)
        race_categories.append(category)
    else:
        print(f"Category {category} not found in data")

# Normalize race percentages to sum to 1
total_race_percentage = sum(race_percentages)
race_probabilities = [p / total_race_percentage for p in race_percentages]
race_probs_tensor = torch.tensor(race_probabilities)

# Create Categorical distribution for Race
race_distribution = torch.distributions.Categorical(probs=race_probs_tensor)

print("\nRace categories:", race_categories)
print("Race probabilities:", race_probabilities)
print("Race distribution:", race_distribution.probs)

# Process subcategories for races with ethnicities
# For 'American Indian and Alaska Native'
ai_an_ethnicities = race_data['American Indian and Alaska Native']['Ethnicities']

ai_an_ethnicities_list = []
ai_an_percentages = []

for ethnicity, values in ai_an_ethnicities.items():
    count = values['count']
    percentage = values['percentage']
    ai_an_ethnicities_list.append(ethnicity)
    ai_an_percentages.append(percentage)

# Normalize percentages
total_ai_an_percentage = sum(ai_an_percentages)
ai_an_probabilities = [p / total_ai_an_percentage for p in ai_an_percentages]
ai_an_probs_tensor = torch.tensor(ai_an_probabilities)

# Create Categorical distribution
ai_an_distribution = torch.distributions.Categorical(probs=ai_an_probs_tensor)

print("\nAmerican Indian and Alaska Native Ethnicities:", ai_an_ethnicities_list)
print("Probabilities:", ai_an_probabilities)
print("Distribution:", ai_an_distribution.probs)

# For 'Asian'
asian_ethnicities = race_data['Asian']['Ethnicities']

asian_ethnicities_list = []
asian_percentages = []

for ethnicity, values in asian_ethnicities.items():
    count = values['count']
    percentage = values['percentage']
    asian_ethnicities_list.append(ethnicity)
    asian_percentages.append(percentage)

# Normalize percentages
total_asian_percentage = sum(asian_percentages)
asian_probabilities = [p / total_asian_percentage for p in asian_percentages]
asian_probs_tensor = torch.tensor(asian_probabilities)

# Create Categorical distribution
asian_distribution = torch.distributions.Categorical(probs=asian_probs_tensor)

print("\nAsian Ethnicities:", asian_ethnicities_list)
print("Probabilities:", asian_probabilities)
print("Distribution:", asian_distribution.probs)

# For 'Native Hawaiian and Other Pacific Islander'
nhopi_ethnicities = race_data['Native Hawaiian and Other Pacific Islander']['Ethnicities']

nhopi_ethnicities_list = []
nhopi_percentages = []

for ethnicity, values in nhopi_ethnicities.items():
    count = values['count']
    percentage = values['percentage']
    nhopi_ethnicities_list.append(ethnicity)
    nhopi_percentages.append(percentage)

# Normalize percentages
total_nhopi_percentage = sum(nhopi_percentages)
nhopi_probabilities = [p / total_nhopi_percentage for p in nhopi_percentages]
nhopi_probs_tensor = torch.tensor(nhopi_probabilities)

# Create Categorical distribution
nhopi_distribution = torch.distributions.Categorical(probs=nhopi_probs_tensor)

print("\nNative Hawaiian and Other Pacific Islander Ethnicities:", nhopi_ethnicities_list)
print("Probabilities:", nhopi_probabilities)
print("Distribution:", nhopi_distribution.probs)

# For 'Two or More Races'
two_or_more_races = race_data['Two or More Races']['Combinations']

combinations_list = []
combinations_percentages = []

for combination, values in two_or_more_races.items():
    count = values['count']
    percentage = values['percentage']
    combinations_list.append(combination)
    combinations_percentages.append(percentage)

# Normalize percentages
total_combinations_percentage = sum(combinations_percentages)
combinations_probabilities = [p / total_combinations_percentage for p in combinations_percentages]
combinations_probs_tensor = torch.tensor(combinations_probabilities)

# Create Categorical distribution
combinations_distribution = torch.distributions.Categorical(probs=combinations_probs_tensor)

print("\nTwo or More Races Combinations:", combinations_list)
print("Probabilities:", combinations_probabilities)
print("Distribution:", combinations_distribution.probs)

# Process 'HISPANIC OR LATINO AND RACE'
hispanic_data = data['HISPANIC OR LATINO AND RACE']
hispanic_categories = ['Hispanic or Latino (of any race)', 'Not Hispanic or Latino']

hispanic_counts = []
hispanic_percentages = []
hispanic_categories_list = []

for category in hispanic_categories:
    if category in hispanic_data:
        count = hispanic_data[category]['count']
        percentage = hispanic_data[category]['percentage']
        hispanic_counts.append(count)
        hispanic_percentages.append(percentage)
        hispanic_categories_list.append(category)

# Normalize percentages
total_hispanic_percentage = sum(hispanic_percentages)
hispanic_probabilities = [p / total_hispanic_percentage for p in hispanic_percentages]
hispanic_probs_tensor = torch.tensor(hispanic_probabilities)

# Create Categorical distribution
hispanic_distribution = torch.distributions.Categorical(probs=hispanic_probs_tensor)

print("\nHispanic or Latino Categories:", hispanic_categories_list)
print("Probabilities:", hispanic_probabilities)
print("Distribution:", hispanic_distribution.probs)

# Process 'Hispanic or Latino' subcategories
hispanic_subcategories = hispanic_data['Hispanic or Latino']

hispanic_subcategories_list = []
hispanic_sub_percentages = []

for subcategory, values in hispanic_subcategories.items():
    if subcategory != 'count' and subcategory != 'percentage':
        count = values['count']
        percentage = values['percentage']
        hispanic_subcategories_list.append(subcategory)
        hispanic_sub_percentages.append(percentage)

# Normalize percentages
total_hispanic_sub_percentage = sum(hispanic_sub_percentages)
hispanic_sub_probabilities = [p / total_hispanic_sub_percentage for p in hispanic_sub_percentages]
hispanic_sub_probs_tensor = torch.tensor(hispanic_sub_probabilities)

# Create Categorical distribution
hispanic_sub_distribution = torch.distributions.Categorical(probs=hispanic_sub_probs_tensor)

print("\nHispanic or Latino Subcategories:", hispanic_subcategories_list)
print("Probabilities:", hispanic_sub_probabilities)
print("Distribution:", hispanic_sub_distribution.probs)


Race categories: ['White', 'Black or African American', 'American Indian and Alaska Native', 'Asian', 'Native Hawaiian and Other Pacific Islander', 'Some Other Race', 'Two or More Races']
Race probabilities: [0.5509550955095509, 0.14311431143114312, 0.0067006700670067, 0.0912091209120912, 0.0004000400040004, 0.1028102810281028, 0.10481048104810481]
Race distribution: tensor([5.5096e-01, 1.4311e-01, 6.7007e-03, 9.1209e-02, 4.0004e-04, 1.0281e-01,
        1.0481e-01])

American Indian and Alaska Native Ethnicities: ['Aztec', 'Blackfeet Tribe of the Blackfeet Indian Reservation of Montana', 'Maya', 'Native Village of Barrow Inupiat Traditional Government', 'Navajo Nation', 'Nome Eskimo Community', 'Other American Indian and Alaska Native']
Probabilities: [0.19468053194680532, 0.013298670132986704, 0.10498950104989502, 0.0, 0.006399360063993601, 0.0, 0.6806319368063194]
Distribution: tensor([0.1947, 0.0133, 0.1050, 0.0000, 0.0064, 0.0000, 0.6806])

Asian Ethnicities: ['Asian Indian', 'Chi

In [None]:
import pandas as pd

def sample_demographics():
    hispanic_status = hispanic_categories_list[hispanic_distribution.sample().item()]
    
    race = None
    ethnicity = None
    
    if hispanic_status == 'Hispanic or Latino (of any race)':
        hispanic_subtype = hispanic_subcategories_list[hispanic_sub_distribution.sample().item()]
        race = 'Hispanic'
        ethnicity = hispanic_subtype
    else:
        race = race_categories[race_distribution.sample().item()]
        
        if race == 'American Indian and Alaska Native':
            ethnicity = ai_an_ethnicities_list[ai_an_distribution.sample().item()]
        elif race == 'Asian':
            ethnicity = asian_ethnicities_list[asian_distribution.sample().item()]
        elif race == 'Native Hawaiian and Other Pacific Islander':
            ethnicity = nhopi_ethnicities_list[nhopi_distribution.sample().item()]
        elif race == 'Two or More Races':
            ethnicity = combinations_list[combinations_distribution.sample().item()]
    
    age_group = age_categories[age_distribution.sample().item()]
    sex = sex_categories[sex_distribution.sample().item()]

    return {
        'Race': race,
        'Ethnicity': ethnicity if ethnicity else race,
        'Age Group': age_group,
        'Sex': sex
    }

# Sample 100 demographic profiles
num_samples = 1000
samples = [sample_demographics() for _ in range(num_samples)]

# Create a DataFrame
df = pd.DataFrame(samples)

# Display the DataFrame
print(df)

                Race        Ethnicity       Age Group     Sex
0              Asian     Asian Indian  45 to 54 years    Male
1              White            White  35 to 44 years  Female
2           Hispanic            Cuban  35 to 44 years  Female
3              White            White  35 to 44 years    Male
4              White            White  35 to 44 years    Male
..               ...              ...             ...     ...
995  Some Other Race  Some Other Race  65 to 74 years    Male
996            White            White  65 to 74 years    Male
997            White            White  35 to 44 years  Female
998            White            White  60 to 64 years  Female
999            White            White  35 to 44 years  Female

[1000 rows x 4 columns]


In [None]:
# Calculate probabilities from the sampled data
sampled_probabilities = {
    'Race': df['Race'].value_counts(normalize=True),
    'Age Group': df['Age Group'].value_counts(normalize=True),
    'Sex': df['Sex'].value_counts(normalize=True)
}

# Compare with original probabilities
print("Race probabilities:")
print("Original:", dict(zip(race_categories, race_probabilities)))
print("Sampled:", sampled_probabilities['Race'].to_dict())
print()

print("Age Group probabilities:")
print("Original:", dict(zip(age_categories, age_probabilities)))
print("Sampled:", sampled_probabilities['Age Group'].to_dict())
print()

print("Sex probabilities:")
print("Original:", dict(zip(sex_categories, sex_probabilities)))
print("Sampled:", sampled_probabilities['Sex'].to_dict())
print()

# Calculate Hispanic proportion
hispanic_proportion = (df['Race'] == 'Hispanic').mean()
print("Hispanic proportion:")
print("Original:", hispanic_probabilities[0])
print("Sampled:", hispanic_proportion)
print()

# For Hispanic subtypes
if 'Hispanic' in df['Race'].unique():
    hispanic_subtypes = df[df['Race'] == 'Hispanic']['Ethnicity'].value_counts(normalize=True)
    print("Hispanic subtypes probabilities:")
    print("Original:", dict(zip(hispanic_subcategories_list, hispanic_sub_probabilities)))
    print("Sampled:", hispanic_subtypes.to_dict())

Race probabilities:
Original: {'White': 0.5509550955095509, 'Black or African American': 0.14311431143114312, 'American Indian and Alaska Native': 0.0067006700670067, 'Asian': 0.0912091209120912, 'Native Hawaiian and Other Pacific Islander': 0.0004000400040004, 'Some Other Race': 0.1028102810281028, 'Two or More Races': 0.10481048104810481}
Sampled: {'White': 0.479, 'Hispanic': 0.183, 'Black or African American': 0.107, 'Two or More Races': 0.08, 'Some Other Race': 0.08, 'Asian': 0.066, 'American Indian and Alaska Native': 0.004, 'Native Hawaiian and Other Pacific Islander': 0.001}

Age Group probabilities:
Original: {'18 to 19 years': 0.025699999999999997, '20 to 24 years': 0.0789, '25 to 34 years': 0.175, '35 to 44 years': 0.1634, '45 to 54 years': 0.1515, '55 to 59 years': 0.08199999999999999, '60 to 64 years': 0.0842, '65 to 74 years': 0.1334, '75 to 84 years': 0.0726, '85 years and over': 0.0268}
Sampled: {'35 to 44 years': 0.164, '25 to 34 years': 0.162, '45 to 54 years': 0.145, 