In [1]:
import requests
import pandas as pd

# Update the URL to fetch S2301 data
data_url = "https://api.census.gov/data/2023/acs/acs1/subject?get=group(S2401)&ucgid=0400000US36"
response = requests.get(data_url)
raw_data = response.json()

metadata_url = "https://api.census.gov/data/2016/acs/acs5/subject/groups/S2401.json"
metadata_response = requests.get(metadata_url)
variables = metadata_response.json()['variables']

# Update the mapping to filter for S2301 variables
s2401_mapping = {}
for var_code, var_info in variables.items():
    if var_code.startswith('S2401'):  # Only S2301 group variables
        s2401_mapping[var_code] = var_info['label']

variable_codes = raw_data[0]
values = raw_data[1]

data_df = pd.DataFrame({
    "Variable Code": variable_codes,
    "Value": values
})

data_df['Label'] = data_df['Variable Code'].map(s2401_mapping)

data_df = data_df[['Label', 'Value']].dropna()

In [3]:
import requests
import pandas as pd
import os
import us

def get_state_data(state_fips):
    data_url = f"https://api.census.gov/data/2023/acs/acs1/subject?get=group(S2401)&for=state:{state_fips}"
    response = requests.get(data_url)
    return response.json()

metadata_url = "https://api.census.gov/data/2016/acs/acs5/subject/groups/S2401.json"
metadata_response = requests.get(metadata_url)
variables = metadata_response.json()['variables']

s2401_mapping = {}
for var_code, var_info in variables.items():
    if var_code.startswith('S2401'):  
        s2401_mapping[var_code] = var_info['label']

# Ensure the raw data directory exists
raw_data_dir = "/user/al4263/Simulate/Persona/data/s2401/raw_data"
os.makedirs(raw_data_dir, exist_ok=True)

for state in us.states.STATES:
    print(f"Processing {state.name}...")
    
    raw_data = get_state_data(state.fips)
    
    variable_codes = raw_data[0]
    values = raw_data[1]

    data_df = pd.DataFrame({
        "Variable Code": variable_codes,
        "Value": values
    })

    data_df['Label'] = data_df['Variable Code'].map(s2401_mapping)
    data_df = data_df[['Label', 'Value']].dropna()
    
    # Save the data with state-specific filename
    filename = f"{state.abbr.lower()}_s2401.csv"
    data_df.to_csv(os.path.join(raw_data_dir, filename), index=False)
    print(f"Saved {filename}")

print("All states processed successfully!")

Processing Alabama...
Saved al_s2401.csv
Processing Alaska...
Saved ak_s2401.csv
Processing Arizona...
Saved az_s2401.csv
Processing Arkansas...
Saved ar_s2401.csv
Processing California...
Saved ca_s2401.csv
Processing Colorado...
Saved co_s2401.csv
Processing Connecticut...
Saved ct_s2401.csv
Processing Delaware...
Saved de_s2401.csv
Processing Florida...
Saved fl_s2401.csv
Processing Georgia...
Saved ga_s2401.csv
Processing Hawaii...
Saved hi_s2401.csv
Processing Idaho...
Saved id_s2401.csv
Processing Illinois...
Saved il_s2401.csv
Processing Indiana...
Saved in_s2401.csv
Processing Iowa...
Saved ia_s2401.csv
Processing Kansas...
Saved ks_s2401.csv
Processing Kentucky...
Saved ky_s2401.csv
Processing Louisiana...
Saved la_s2401.csv
Processing Maine...
Saved me_s2401.csv
Processing Maryland...
Saved md_s2401.csv
Processing Massachusetts...
Saved ma_s2401.csv
Processing Michigan...
Saved mi_s2401.csv
Processing Minnesota...
Saved mn_s2401.csv
Processing Mississippi...
Saved ms_s2401.cs

In [58]:
import pandas as pd
import json

def create_occupational_data(df):
    def get_value(label):
        row = df[df['Label'] == label]
        if not row.empty:
            value = row['Value'].values[0]
            if isinstance(value, str) and value.strip() == '-999999999':
                return None
            return value
        return None

    def safe_convert(value, convert_func, default=None):
        if value is None:
            return default
        try:
            return convert_func(value)
        except (ValueError, TypeError):
            return default

    def process_occupation(prefix, occupation):
        return safe_convert(get_value(f"{prefix}!!Estimate!!{occupation}"), int, 0)

    def create_gender_data(gender):
        return {
            "Total": process_occupation(gender, "Civilian employed population 16 years and over"),
            "Management, business, science, and arts occupations": {
                "Total": process_occupation(gender, "Management, business, science, and arts occupations"),
                "Subcategories": {
                    "Management, business, and financial occupations": {
                        "Total": process_occupation(gender, "Management, business, science, and arts occupations!!Management, business, and financial occupations"),
                        "Subcategories": {
                            "Management occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Management, business, and financial occupations!!Management occupations"),
                            "Business and financial operations occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Management, business, and financial occupations!!Business and financial operations occupations")
                        }
                    },
                    "Computer, engineering, and science occupations": {
                        "Total": process_occupation(gender, "Management, business, science, and arts occupations!!Computer, engineering, and science occupations"),
                        "Subcategories": {
                            "Computer and mathematical occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Computer, engineering, and science occupations!!Computer and mathematical occupations"),
                            "Architecture and engineering occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Computer, engineering, and science occupations!!Architecture and engineering occupations"),
                            "Life, physical, and social science occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Computer, engineering, and science occupations!!Life, physical, and social science occupations")
                        }
                    },
                    "Education, legal, community service, arts, and media occupations": {
                        "Total": process_occupation(gender, "Management, business, science, and arts occupations!!Education, legal, community service, arts, and media occupations"),
                        "Subcategories": {
                            "Community and social services occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Education, legal, community service, arts, and media occupations!!Community and social services occupations"),
                            "Legal occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Education, legal, community service, arts, and media occupations!!Legal occupations"),
                            "Education, training, and library occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Education, legal, community service, arts, and media occupations!!Education, training, and library occupations"),
                            "Arts, design, entertainment, sports, and media occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Education, legal, community service, arts, and media occupations!!Arts, design, entertainment, sports, and media occupations")
                        }
                    },
                    "Healthcare practitioner and technical occupations": {
                        "Total": process_occupation(gender, "Management, business, science, and arts occupations!!Healthcare practitioner and technical occupations"),
                        "Subcategories": {
                            "Health diagnosing and treating practitioners and other technical occupations": process_occupation(gender, "Management, business, science, and arts occupations!!Healthcare practitioner and technical occupations!!Health diagnosing and treating practitioners and other technical occupations"),
                            "Health technologists and technicians": process_occupation(gender, "Management, business, science, and arts occupations!!Healthcare practitioner and technical occupations!!Health technologists and technicians")
                        }
                    }
                }
            },
            "Service occupations": {
                "Total": process_occupation(gender, "Service occupations"),
                "Subcategories": {
                    "Healthcare support occupations": {
                        "Total": process_occupation(gender, "Service occupations!!Healthcare support occupations"),
                        "Subcategories": {}
                    },
                    "Protective service occupations": {
                        "Total": process_occupation(gender, "Service occupations!!Protective service occupations"),
                        "Subcategories": {
                            "Fire fighting and prevention, and other protective service workers including supervisors": process_occupation(gender, "Service occupations!!Protective service occupations!!Fire fighting and prevention, and other protective service workers including supervisors"),
                            "Law enforcement workers including supervisors": process_occupation(gender, "Service occupations!!Protective service occupations!!Law enforcement workers including supervisors")
                        }
                    },
                    "Food preparation and serving related occupations": {
                        "Total": process_occupation(gender, "Service occupations!!Food preparation and serving related occupations"),
                        "Subcategories": {}
                    },
                    "Building and grounds cleaning and maintenance occupations": {
                        "Total": process_occupation(gender, "Service occupations!!Building and grounds cleaning and maintenance occupations"),
                        "Subcategories": {}
                    },
                    "Personal care and service occupations": {
                        "Total": process_occupation(gender, "Service occupations!!Personal care and service occupations"),
                        "Subcategories": {}
                    }
                }
            },
            "Sales and office occupations": {
                "Total": process_occupation(gender, "Sales and office occupations"),
                "Subcategories": {
                    "Sales and related occupations": {
                        "Total": process_occupation(gender, "Sales and office occupations!!Sales and related occupations"),
                        "Subcategories": {}
                    },
                    "Office and administrative support occupations": {
                        "Total": process_occupation(gender, "Sales and office occupations!!Office and administrative support occupations"),
                        "Subcategories": {}
                    }
                }
            },
            "Natural resources, construction, and maintenance occupations": {
                "Total": process_occupation(gender, "Natural resources, construction, and maintenance occupations"),
                "Subcategories": {
                    "Farming, fishing, and forestry occupations": {
                        "Total": process_occupation(gender, "Natural resources, construction, and maintenance occupations!!Farming, fishing, and forestry occupations"),
                        "Subcategories": {}
                    },
                    "Construction and extraction occupations": {
                        "Total": process_occupation(gender, "Natural resources, construction, and maintenance occupations!!Construction and extraction occupations"),
                        "Subcategories": {}
                    },
                    "Installation, maintenance, and repair occupations": {
                        "Total": process_occupation(gender, "Natural resources, construction, and maintenance occupations!!Installation, maintenance, and repair occupations"),
                        "Subcategories": {}
                    }
                }
            },
            "Production, transportation, and material moving occupations": {
                "Total": process_occupation(gender, "Production, transportation, and material moving occupations"),
                "Subcategories": {
                    "Production occupations": {
                        "Total": process_occupation(gender, "Production, transportation, and material moving occupations!!Production occupations"),
                        "Subcategories": {}
                    },
                    "Transportation occupations": {
                        "Total": process_occupation(gender, "Production, transportation, and material moving occupations!!Transportation occupations"),
                        "Subcategories": {}
                    },
                    "Material moving occupations": {
                        "Total": process_occupation(gender, "Production, transportation, and material moving occupations!!Material moving occupations"),
                        "Subcategories": {}
                    }
                }
            }
        }

    result = {
        "Male": create_gender_data("Male"),
        "Female": create_gender_data("Female")
    }
    
    return result


In [59]:
# do this foimport os
import pandas as pd
import json
import os

# Directory containing the raw data files
raw_data_dir = "/user/al4263/Simulate/Persona/data/s2401/raw_data"

# Output directory for structured data
output_dir = "/user/al4263/Simulate/Persona/data/s2401/structured_data"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Process each state file
for filename in os.listdir(raw_data_dir):
    if filename.endswith("_s2401.csv"):
        state = filename.split("_")[0]
        
        # Read the CSV file
        data_df = pd.read_csv(os.path.join(raw_data_dir, filename))
        
        # Create the structured data
        structured_data = create_occupational_data(data_df)
        
        # Save the structured data as JSON
        output_filename = f"{state}_s2401_structured.json"
        with open(os.path.join(output_dir, output_filename), 'w') as f:
            json.dump(structured_data, f, indent=2)
        
        print(f"Processed and saved structured data for {state}")

print("All states processed successfully!")

Processed and saved structured data for ut
Processed and saved structured data for va
Processed and saved structured data for sc
Processed and saved structured data for nm
Processed and saved structured data for mt
Processed and saved structured data for co
Processed and saved structured data for ak
Processed and saved structured data for me
Processed and saved structured data for tn
Processed and saved structured data for in
Processed and saved structured data for mi
Processed and saved structured data for de
Processed and saved structured data for nc
Processed and saved structured data for nd
Processed and saved structured data for ct
Processed and saved structured data for id
Processed and saved structured data for al
Processed and saved structured data for la
Processed and saved structured data for hi
Processed and saved structured data for pa
Processed and saved structured data for vt
Processed and saved structured data for ne
Processed and saved structured data for oh
Processed a

## HouseHold

## Employment

In [68]:
import torch
from get_dist.get_sex_race_age_dist import load_structured_data
from collections import namedtuple

CategoricalDistribution = namedtuple('CategoricalDistribution', ['categories', 'distribution'])

def create_distribution(data):
    total = sum(data.values())
    categories = list(data.keys())
    probabilities = torch.tensor([value / total for value in data.values()])
    return CategoricalDistribution(categories=categories, 
                                   distribution=torch.distributions.Categorical(probs=probabilities))

def print_distribution(dist, indent=""):
    if dist:
        for category, prob in zip(dist.categories, dist.distribution.probs):
            print(f"{indent}{category}: {prob.item():.4f}")

def create_and_print_distributions(data, indent=""):
    if isinstance(data, dict):
        if 'Total' in data and 'Subcategories' in data:
            print(f"{indent}{data['Total']}")
            create_and_print_distributions(data['Subcategories'], indent + "  ")
        else:
            dist = create_distribution(data)
            print_distribution(dist, indent)
            for category, value in data.items():
                if isinstance(value, dict):
                    print(f"{indent}{category} ->")
                    create_and_print_distributions(value, indent + "  ")

# Usage example:
state = us.states.CO
s2401 = f"/user/al4263/Simulate/Persona/data/s2401/structured_data/{state.abbr.lower()}_s2401_structured.json"
occupation_data = load_structured_data(s2401)

for gender in ['Male', 'Female']:
    print(f"\n\nDistributions for {gender}:")
    create_and_print_distributions(occupation_data[gender])



Distributions for Male:


TypeError: unsupported operand type(s) for +: 'int' and 'dict'

In [74]:
import torch
from get_dist.get_sex_race_age_dist import load_structured_data
from collections import namedtuple

CategoricalDistribution = namedtuple('CategoricalDistribution', ['categories', 'distribution'])

def create_distribution(data):
    total = sum(data.values())
    categories = list(data.keys())
    probabilities = torch.tensor([value / total for value in data.values()])
    return CategoricalDistribution(categories=categories, 
                                   distribution=torch.distributions.Categorical(probs=probabilities))

def sample_career(gender_data):
    # Sample main category
    main_categories = {
        "Management, business, science, and arts occupations": gender_data["Management, business, science, and arts occupations"]["Total"],
        "Service occupations": gender_data["Service occupations"]["Total"],
        "Sales and office occupations": gender_data["Sales and office occupations"]["Total"],
        "Natural resources, construction, and maintenance occupations": gender_data["Natural resources, construction, and maintenance occupations"]["Total"],
        "Production, transportation, and material moving occupations": gender_data["Production, transportation, and material moving occupations"]["Total"]
    }
    main_dist = create_distribution(main_categories)
    main_category = main_dist.categories[main_dist.distribution.sample()]

    if main_category == "Management, business, science, and arts occupations":
        mbsa_data = gender_data[main_category]["Subcategories"]
        mbsa_categories = {k: v["Total"] for k, v in mbsa_data.items()}
        mbsa_dist = create_distribution(mbsa_categories)
        mbsa_subcategory = mbsa_dist.categories[mbsa_dist.distribution.sample()]

        if mbsa_subcategory == "Management, business, and financial occupations":
            mbf_data = mbsa_data[mbsa_subcategory]["Subcategories"]
            mbf_dist = create_distribution(mbf_data)
            return mbf_dist.categories[mbf_dist.distribution.sample()]
        elif mbsa_subcategory == "Computer, engineering, and science occupations":
            ces_data = mbsa_data[mbsa_subcategory]["Subcategories"]
            ces_dist = create_distribution(ces_data)
            return ces_dist.categories[ces_dist.distribution.sample()]
        elif mbsa_subcategory == "Education, legal, community service, arts, and media occupations":
            elcsam_data = mbsa_data[mbsa_subcategory]["Subcategories"]
            elcsam_dist = create_distribution(elcsam_data)
            return elcsam_dist.categories[elcsam_dist.distribution.sample()]
        elif mbsa_subcategory == "Healthcare practitioner and technical occupations":
            hpt_data = mbsa_data[mbsa_subcategory]["Subcategories"]
            hpt_dist = create_distribution(hpt_data)
            return hpt_dist.categories[hpt_dist.distribution.sample()]

    elif main_category == "Service occupations":
        service_data = gender_data[main_category]["Subcategories"]
        service_categories = {k: v["Total"] for k, v in service_data.items()}
        service_dist = create_distribution(service_categories)
        service_subcategory = service_dist.categories[service_dist.distribution.sample()]

        if service_subcategory == "Protective service occupations":
            ps_data = service_data[service_subcategory]["Subcategories"]
            ps_dist = create_distribution(ps_data)
            return ps_dist.categories[ps_dist.distribution.sample()]
        else:
            return service_subcategory

    elif main_category == "Sales and office occupations":
        so_data = gender_data[main_category]["Subcategories"]
        so_categories = {k: v["Total"] for k, v in so_data.items()}
        so_dist = create_distribution(so_categories)
        return so_dist.categories[so_dist.distribution.sample()]

    elif main_category == "Natural resources, construction, and maintenance occupations":
        nrcm_data = gender_data[main_category]["Subcategories"]
        nrcm_categories = {k: v["Total"] for k, v in nrcm_data.items()}
        nrcm_dist = create_distribution(nrcm_categories)
        return nrcm_dist.categories[nrcm_dist.distribution.sample()]

    elif main_category == "Production, transportation, and material moving occupations":
        ptmm_data = gender_data[main_category]["Subcategories"]
        ptmm_categories = {k: v["Total"] for k, v in ptmm_data.items()}
        ptmm_dist = create_distribution(ptmm_categories)
        return ptmm_dist.categories[ptmm_dist.distribution.sample()]

# Usage example:
state = us.states.CO
s2401 = f"/user/al4263/Simulate/Persona/data/s2401/structured_data/{state.abbr.lower()}_s2401_structured.json"
occupation_data = load_structured_data(s2401)

# Sample careers for each gender
samples = 5
for gender in ['Male', 'Female']:
    print(f"\n\nSampling {samples} careers for {gender}:")
    for i in range(samples):
        sampled_career = sample_career(occupation_data[gender])
        print(f"Sample {i+1}: {sampled_career}")



Sampling 5 careers for Male:
Sample 1: Installation, maintenance, and repair occupations
Sample 2: Management occupations
Sample 3: Material moving occupations
Sample 4: Sales and related occupations
Sample 5: Legal occupations


Sampling 5 careers for Female:
Sample 1: Architecture and engineering occupations
Sample 2: Sales and related occupations
Sample 3: Healthcare support occupations
Sample 4: Management occupations
Sample 5: Business and financial operations occupations


In [101]:
import os
import json
import pandas as pd
from tqdm import tqdm
import us
from get_dist.get_sex_race_age_dist import load_structured_data, get_sex_dist, get_age_dist, get_race_dist, sample_demographics
from get_dist.get_household_dist import sample_household_type, sample_relationship
from get_dist.get_marital_status_dist import sample_marital_status_by_gender
from get_dist.get_veteran_dist import sample_veteran_status
from get_dist.get_language_dist import sample_languages
from get_dist.get_edu_dist import sample_education_level
from get_dist.get_birth_dist import sample_birth_and_citizenship_multiple
from get_dist.get_employment_dist import sample_labor_force_and_employment

def generate_personas_for_state(state):
    # Load data
    dp05 = f"/user/al4263/Simulate/Persona/data/DP05/structured_data/{state.abbr.lower()}_structured_data.json"
    dp02 = f"/user/al4263/Simulate/Persona/data/DP02/structured_data/{state.abbr.lower()}_dp02_structured.json"

    s2301 = f"/user/al4263/Simulate/Persona/data/s2301/structured_data/{state.abbr.lower()}_s2301_structured.json"

    dp05_data = load_structured_data(dp05)
    dp02_data = load_structured_data(dp02)
    s2301_data = load_structured_data(s2301)

    # Get distributions
    age_distributions = get_age_dist(dp05_data)
    sex_distributions = get_sex_dist(dp05_data)
    race_distributions = get_race_dist(dp05_data)

    # Sample demographics
    num_samples = 1000
    samples = [sample_demographics(age_distributions, sex_distributions, race_distributions) for _ in range(num_samples)]

    # Create DataFrame
    df = pd.DataFrame(samples)

    # Add HOUSEHOLD_RELATIONSHIP
    df['HOUSEHOLD_RELATIONSHIP'] = df.apply(lambda row: sample_relationship(dp02_data, dp05_data), axis=1)

    # Sample household type for primary householders
    def sample_household_type_for_primary(row):
        if row['HOUSEHOLD_RELATIONSHIP'] == 'Primary Householder':
            return sample_household_type(dp02_data, row['Sex'])
        return None

    df['HOUSEHOLD_TYPE'] = df.apply(sample_household_type_for_primary, axis=1)

    # Sample additional attributes
    df['MARITAL_STATUS'] = df.apply(lambda row: sample_marital_status_by_gender(dp02_data, row['Sex']), axis=1)
    df['VETERAN_STATUS'] = df.apply(lambda row: sample_veteran_status(dp02_data), axis=1)
    df['LANGUAGE'], df['ENGLISH_PROFICIENCY'] = zip(*df.apply(lambda row: sample_languages(dp02_data), axis=1))
    df['EDUCATION'] = df.apply(lambda row: sample_education_level(dp02_data), axis=1)
    df['BIRTH_PLACE'], df['CITIZENSHIP'], df['BIRTH_DETAIL'] = zip(*df.apply(lambda row: sample_birth_and_citizenship_multiple(dp02_data), axis=1))

    # Sample employment status
    df['Labor Force Status'], df['Employment Status'] = zip(*df['Age'].apply(lambda x: sample_labor_force_and_employment(x, s2301_data)))

    # Add state information
    df['STATE_NAME'] = state.name
    df['STATE_ABBR'] = state.abbr

    # upper case all the column names
    df.columns = df.columns.str.upper()

    # replace all null with "NOT APPLICABLE"
    df.fillna("Not Applicable", inplace=True)

    return df
state = us.states.CO
df = generate_personas_for_state(state)

In [110]:
def generate_career(row, occupation_data):
    if row['EMPLOYMENT STATUS'] == 'Employed':
        return sample_career(occupation_data[row['SEX']])
    else:
        return None

# Apply the generate_career function to each row in the DataFrame
df['CAREER'] = df.apply(lambda row: generate_career(row, occupation_data), axis=1)

# Display some results
print(df[['SEX', 'LABOR FORCE STATUS', 'EMPLOYMENT STATUS', 'CAREER']].head(10))

# Optional: Count of employed vs not employed
print("\nEmployment summary:")
print(df['CAREER'].value_counts())


      SEX  LABOR FORCE STATUS EMPLOYMENT STATUS  \
0    Male      In Labor Force          Employed   
1  Female      In Labor Force          Employed   
2  Female      In Labor Force          Employed   
3  Female      In Labor Force          Employed   
4    Male      In Labor Force          Employed   
5    Male      In Labor Force          Employed   
6    Male      In Labor Force          Employed   
7  Female      In Labor Force          Employed   
8  Female  Not in Labor Force    Not Applicable   
9  Female      In Labor Force          Employed   

                                             CAREER  
0                            Management occupations  
1                            Management occupations  
2  Food preparation and serving related occupations  
3                                 Legal occupations  
4  Food preparation and serving related occupations  
5             Computer and mathematical occupations  
6             Computer and mathematical occupations  
7     