In [22]:
import requests
import pandas as pd

# Fetch the S2301 group from the 2023 ACS 1-year subject tables for the
# geography selected by ucgid=0400000US36.
data_url = "https://api.census.gov/data/2023/acs/acs1/subject?get=group(S2301)&ucgid=0400000US36"
response = requests.get(data_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error body.
response.raise_for_status()
raw_data = response.json()

# NOTE(review): the labels are read from the 2016 ACS 5-year metadata while the
# values above come from the 2023 ACS 1-year data; confirm that the variable
# codes mean the same thing in both vintages.
metadata_url = "https://api.census.gov/data/2016/acs/acs5/subject/groups/S2301.json"
metadata_response = requests.get(metadata_url, timeout=60)
metadata_response.raise_for_status()
variables = metadata_response.json()['variables']

# Map each S2301 variable code to its label.
s2301_mapping = {}
for var_code, var_info in variables.items():
    if var_code.startswith('S2301'):  # Only S2301 group variables
        s2301_mapping[var_code] = var_info['label']

# Row 0 of the response is used as the variable codes, row 1 as their values.
variable_codes = raw_data[0]
values = raw_data[1]

data_df = pd.DataFrame({
    "Variable Code": variable_codes,
    "Value": values
})

data_df['Label'] = data_df['Variable Code'].map(s2301_mapping)

# Keep only rows that have both a known label and a value.
data_df = data_df[['Label', 'Value']].dropna()

In [39]:
import requests
import pandas as pd
import os
import us

def get_state_data(state_fips):
    """Download the S2301 subject-table group for one state.

    Args:
        state_fips: State FIPS code interpolated into the ``for=state:`` query
            parameter of the request URL.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within the timeout.
    """
    data_url = f"https://api.census.gov/data/2023/acs/acs1/subject?get=group(S2301)&for=state:{state_fips}"
    response = requests.get(data_url, timeout=60)
    # Surface HTTP failures here rather than as an opaque JSON decode error.
    response.raise_for_status()
    return response.json()

# NOTE(review): the labels are read from the 2016 ACS 5-year metadata while
# get_state_data() queries the 2023 ACS 1-year data; confirm that the variable
# codes mean the same thing in both vintages.
metadata_url = "https://api.census.gov/data/2016/acs/acs5/subject/groups/S2301.json"
metadata_response = requests.get(metadata_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error body.
metadata_response.raise_for_status()
variables = metadata_response.json()['variables']

# Map each S2301 variable code to its label.
s2301_mapping = {}
for var_code, var_info in variables.items():
    if var_code.startswith('S2301'):
        s2301_mapping[var_code] = var_info['label']

# Ensure the raw data directory exists
raw_data_dir = "/user/al4263/Simulate/Persona/data/s2301/raw_data"
os.makedirs(raw_data_dir, exist_ok=True)

# Download, label and save one CSV per state.
for state in us.states.STATES:
    print(f"Processing {state.name}...")

    raw_data = get_state_data(state.fips)

    # Row 0 of the response is used as the variable codes, row 1 as their values.
    variable_codes = raw_data[0]
    values = raw_data[1]

    data_df = pd.DataFrame({
        "Variable Code": variable_codes,
        "Value": values
    })

    data_df['Label'] = data_df['Variable Code'].map(s2301_mapping)
    # Keep only rows that have both a known label and a value.
    data_df = data_df[['Label', 'Value']].dropna()

    # Save the data with state-specific filename
    filename = f"{state.abbr.lower()}_s2301.csv"
    data_df.to_csv(os.path.join(raw_data_dir, filename), index=False)
    print(f"Saved {filename}")

print("All states processed successfully!")

Processing Alabama...
Saved al_s2301.csv
Processing Alaska...
Saved ak_s2301.csv
Processing Arizona...
Saved az_s2301.csv
Processing Arkansas...
Saved ar_s2301.csv
Processing California...
Saved ca_s2301.csv
Processing Colorado...
Saved co_s2301.csv
Processing Connecticut...
Saved ct_s2301.csv
Processing Delaware...
Saved de_s2301.csv
Processing Florida...
Saved fl_s2301.csv
Processing Georgia...
Saved ga_s2301.csv
Processing Hawaii...
Saved hi_s2301.csv
Processing Idaho...
Saved id_s2301.csv
Processing Illinois...
Saved il_s2301.csv
Processing Indiana...
Saved in_s2301.csv
Processing Iowa...
Saved ia_s2301.csv
Processing Kansas...
Saved ks_s2301.csv
Processing Kentucky...
Saved ky_s2301.csv
Processing Louisiana...
Saved la_s2301.csv
Processing Maine...
Saved me_s2301.csv
Processing Maryland...
Saved md_s2301.csv
Processing Massachusetts...
Saved ma_s2301.csv
Processing Michigan...
Saved mi_s2301.csv
Processing Minnesota...
Saved mn_s2301.csv
Processing Mississippi...
Saved ms_s2301.cs

In [40]:
import pandas as pd

def create_employment_statistics(df):
    """Reshape a label/value table of S2301 estimates into a nested dictionary.

    Args:
        df: DataFrame with a 'Label' column and a 'Value' column. When a label
            occurs more than once, the first row wins.

    Returns:
        dict: ``{"EMPLOYMENT STATISTICS": {...}}`` holding one entry for
        "Population 16 years and over" and one entry per age group under
        "AGE". Every entry carries "Total" (int) plus "Labor Force
        Participation Rate", "Employment/Population Ratio" and
        "Unemployment rate" (float). A statistic is ``None`` when its label is
        absent, when its value is missing (None, NaN or a Census placeholder
        code), or when it cannot be converted.
    """
    # Placeholder codes the Census API returns in place of a figure that is not
    # available. They must never be mistaken for real counts or percentages.
    missing_codes = frozenset({
        -999999999, -888888888, -666666666,
        -555555555, -333333333, -222222222,
    })

    # Index the table once instead of filtering the whole frame for every label.
    first_value_by_label = {}
    for label, value in zip(df['Label'], df['Value']):
        first_value_by_label.setdefault(label, value)

    def is_missing(value):
        """Return True for None, NaN and Census placeholder codes."""
        if value is None:
            return True
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            # Not numeric at all; leave the decision to safe_convert.
            return False
        # NaN is the only float that does not compare equal to itself.
        return number != number or number in missing_codes

    def get_value(label):
        """Return the first usable value stored for ``label``, else None."""
        value = first_value_by_label.get(label)
        return None if is_missing(value) else value

    def safe_convert(value, convert_func, default=None):
        """Apply ``convert_func`` and fall back to ``default`` on failure."""
        if value is None:
            return default
        try:
            return convert_func(value)
        except (ValueError, TypeError):
            return default

    def summarize(label_suffix):
        """Collect the four statistics whose labels end with ``label_suffix``."""
        return {
            "Total": safe_convert(get_value(f"Total!!Estimate!!{label_suffix}"), int),
            "Labor Force Participation Rate": safe_convert(get_value(f"Labor Force Participation Rate!!Estimate!!{label_suffix}"), float),
            "Employment/Population Ratio": safe_convert(get_value(f"Employment/Population Ratio!!Estimate!!{label_suffix}"), float),
            "Unemployment rate": safe_convert(get_value(f"Unemployment rate!!Estimate!!{label_suffix}"), float),
        }

    age_groups = [
        "16 to 19 years", "20 to 24 years", "25 to 29 years", "30 to 34 years",
        "35 to 44 years", "45 to 54 years", "55 to 59 years", "60 to 64 years",
        "65 to 74 years", "75 years and over",
    ]

    return {
        "EMPLOYMENT STATISTICS": {
            "Population 16 years and over": summarize("Population 16 years and over"),
            "AGE": {
                age_group: summarize(f"AGE!!{age_group}")
                for age_group in age_groups
            },
        }
    }

# NOTE(review): no CSV is read in this cell; `data_df` is whatever an earlier
# cell left in the kernel.

# Create the structured data
employment_data = create_employment_statistics(data_df)

# You can now use the 'employment_data' dictionary for further processing or analysis

In [41]:
# Display the nested statistics dictionary.
employment_data

{'EMPLOYMENT STATISTICS': {'Population 16 years and over': {'Total': 469355,
   'Labor Force Participation Rate': 63.5,
   'Employment/Population Ratio': 60.7,
   'Unemployment rate': 3.2},
  'AGE': {'16 to 19 years': {'Total': 30932,
    'Labor Force Participation Rate': 49.5,
    'Employment/Population Ratio': 40.9,
    'Unemployment rate': 16.7},
   '20 to 24 years': {'Total': 34570,
    'Labor Force Participation Rate': 82.1,
    'Employment/Population Ratio': 76.7,
    'Unemployment rate': 2.9},
   '25 to 29 years': {'Total': 36374,
    'Labor Force Participation Rate': 87.0,
    'Employment/Population Ratio': 80.7,
    'Unemployment rate': 4.4},
   '30 to 34 years': {'Total': 40084,
    'Labor Force Participation Rate': 84.1,
    'Employment/Population Ratio': 80.7,
    'Unemployment rate': 2.9},
   '35 to 44 years': {'Total': 82098,
    'Labor Force Participation Rate': 87.5,
    'Employment/Population Ratio': 84.6,
    'Unemployment rate': 1.9},
   '45 to 54 years': {'Total': 6

In [42]:
# Do this for all states
import pandas as pd
import json
import os

# Where the per-state CSV files were written
raw_data_dir = "/user/al4263/Simulate/Persona/data/s2301/raw_data"

# Where the structured JSON files go
output_dir = "/user/al4263/Simulate/Persona/data/s2301/structured_data"

# Create the destination folder if it is not there yet
os.makedirs(output_dir, exist_ok=True)

# Turn every "<prefix>_s2301.csv" file into "<prefix>_s2301_structured.json"
for filename in os.listdir(raw_data_dir):
    if not filename.endswith("_s2301.csv"):
        continue  # not one of the per-state CSV files

    # The prefix before the first underscore identifies the state
    state = filename.split("_")[0]

    # Load the label/value table and reshape it
    data_df = pd.read_csv(os.path.join(raw_data_dir, filename))
    structured_data = create_employment_statistics(data_df)

    # Write the nested dictionary next to the other structured files
    output_filename = f"{state}_s2301_structured.json"
    with open(os.path.join(output_dir, output_filename), 'w') as f:
        json.dump(structured_data, f, indent=2)

    print(f"Processed and saved structured data for {state}")

print("All states processed successfully!")

Processed and saved structured data for wy
Processed and saved structured data for id
Processed and saved structured data for wv
Processed and saved structured data for il
Processed and saved structured data for me
Processed and saved structured data for va
Processed and saved structured data for tn
Processed and saved structured data for nd
Processed and saved structured data for md
Processed and saved structured data for ca
Processed and saved structured data for oh
Processed and saved structured data for mi
Processed and saved structured data for ut
Processed and saved structured data for wa
Processed and saved structured data for ri
Processed and saved structured data for sd
Processed and saved structured data for vt
Processed and saved structured data for nm
Processed and saved structured data for de
Processed and saved structured data for in
Processed and saved structured data for ny
Processed and saved structured data for az
Processed and saved structured data for tx
Processed a

## Household

## Employment

In [43]:
import numpy as np
import torch
from collections import namedtuple

# Pairs an ordered list of category labels with the distribution object used to
# draw an index into that list.
CategoricalDistribution = namedtuple(
    'CategoricalDistribution',
    ('categories', 'distribution'),
)

def create_labor_force_distribution(stats):
    """Build the two-outcome labor-force distribution for one group.

    Args:
        stats: Mapping whose 'Labor Force Participation Rate' entry is a
            percentage (it is divided by 100 here).

    Returns:
        CategoricalDistribution: index 0 is 'In Labor Force', index 1 is
        'Not in Labor Force'.
    """
    participation = stats['Labor Force Participation Rate'] / 100
    weights = torch.tensor([participation, 1 - participation])
    sampler = torch.distributions.Categorical(probs=weights)
    return CategoricalDistribution(
        categories=['In Labor Force', 'Not in Labor Force'],
        distribution=sampler,
    )

def create_employment_distribution(stats):
    """Build the two-outcome employment distribution for one group.

    Args:
        stats: Mapping whose 'Unemployment rate' entry is a percentage (it is
            divided by 100 here).

    Returns:
        CategoricalDistribution: index 0 is 'Employed', index 1 is
        'Unemployed'.
    """
    employed_share = 1 - (stats['Unemployment rate'] / 100)
    weights = torch.tensor([employed_share, 1 - employed_share])
    sampler = torch.distributions.Categorical(probs=weights)
    return CategoricalDistribution(
        categories=['Employed', 'Unemployed'],
        distribution=sampler,
    )

def sample_labor_force_and_employment(age_group, data):
    """Sample a labor-force status and an employment status for one person.

    Args:
        age_group: Age bracket label of the person. Brackets listed in the
            mapping below are translated to the brackets used in ``data``;
            any other label is looked up unchanged.
        data: Nested dictionary with per-bracket statistics under
            ``data['EMPLOYMENT STATISTICS']['AGE']``.

    Returns:
        tuple: ``(labor_force_status, employment_status)``. The employment
        status is 'Not Applicable' when the person is sampled as not being in
        the labor force.

    Raises:
        KeyError: If the (mapped) age group has no entry in ``data``.
    """
    # Map the age groups from df to the ones in employment_data
    age_group_mapping = {
        '18 to 19 years': '16 to 19 years',
        '20 to 24 years': '20 to 24 years',
        '25 to 34 years': '25 to 29 years',  # Use the first of the two relevant groups
        '35 to 44 years': '35 to 44 years',
        '45 to 54 years': '45 to 54 years',
        '55 to 59 years': '55 to 59 years',
        '60 to 64 years': '60 to 64 years',
        '65 to 74 years': '65 to 74 years',
        '75 to 84 years': '75 years and over',
        '85 years and over': '75 years and over'
    }

    mapped_age_group = age_group_mapping.get(age_group, age_group)
    stats = data['EMPLOYMENT STATISTICS']['AGE'][mapped_age_group]

    labor_force_dist = create_labor_force_distribution(stats)
    labor_force_status = labor_force_dist.categories[labor_force_dist.distribution.sample()]

    if labor_force_status != 'In Labor Force':
        return labor_force_status, 'Not Applicable'

    # Build the employment distribution only when it is actually sampled: a
    # bracket may have no usable unemployment rate, and building it eagerly
    # would fail even for people who are outside the labor force.
    employment_dist = create_employment_distribution(stats)
    employment_status = employment_dist.categories[employment_dist.distribution.sample()]

    return labor_force_status, employment_status

# Apply the sampling to the DataFrame

In [52]:
from get_dist.get_sex_race_age_dist import load_structured_data, get_sex_dist, get_age_dist, get_race_dist, sample_demographics

# NOTE(review): `state` and `df` are not created in this cell; both must already
# exist in the kernel. `state` has to expose `.abbr`, yet the JSON-export loop
# earlier in this notebook rebinds `state` to a plain string, so the result
# depends on which cells ran last.
s2301 = f"/user/al4263/Simulate/Persona/data/s2301/structured_data/{state.abbr.lower()}_s2301_structured.json"

# load_structured_data is imported from get_dist.get_sex_race_age_dist;
# presumably it parses the JSON file at this path — confirm.
s2301_data = load_structured_data(s2301)

# Sample one (labor force status, employment status) pair per row from the row's
# 'Age' value and store the two parts as new columns of `df`.
df['Labor Force Status'], df['Employment Status'] = zip(*df['Age'].apply(lambda x: sample_labor_force_and_employment(x, s2301_data)))

df

Unnamed: 0,Age,Sex,Race,Ethnicity,HOUSEHOLD_RELATIONSHIP,HOUSEHOLD_TYPE,MARITAL_STATUS,VETERAN_STATUS,LANGUAGE,ENGLISH_PROFICIENCY,EDUCATION,BIRTH_PLACE,CITIZENSHIP,BIRTH_DETAIL,STATE_NAME,STATE_ABBR,Labor Force Status,Employment Status
0,65 to 74 years,Female,Hispanic,Cuban,Other Relative of Householder,,"Now married, except separated",Non-Veteran,English only,Speak English well,"Some college, no degree",US Born,US Citizen,State of residence,Florida,FL,In Labor Force,Employed
1,20 to 24 years,Female,White,,Primary Householder,Married-couple Without kids,Widowed,Non-Veteran,English only,Speak English well,High school graduate,US Born,US Citizen,Different state,Florida,FL,In Labor Force,Employed
2,55 to 59 years,Female,Hispanic,Other Hispanic or Latino,Other Relative of Householder,,Divorced,Non-Veteran,English only,Speak English well,High school graduate,US Born,US Citizen,Different state,Florida,FL,Not in Labor Force,Not Applicable
3,55 to 59 years,Female,Some Other Race,,Child living with Parents,,"Now married, except separated",Non-Veteran,English only,Speak English well,High school graduate,US Born,US Citizen,State of residence,Florida,FL,In Labor Force,Employed
4,20 to 24 years,Female,Hispanic,Other Hispanic or Latino,Primary Householder,Single Female Without kids,"Now married, except separated",Non-Veteran,Spanish,Speak English less than very well,Bachelor's degree,US Born,US Citizen,State of residence,Florida,FL,Not in Labor Force,Not Applicable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,25 to 34 years,Male,White,,Primary Householder,Single Male Without kids,Divorced,Non-Veteran,Other Indo-European languages,Speak English less than very well,Bachelor's degree,US Born,US Citizen,State of residence,Florida,FL,In Labor Force,Employed
996,60 to 64 years,Female,Black or African American,,Child living with Parents,,Never married,Non-Veteran,Other Indo-European languages,Speak English less than very well,High school graduate,Foreign Born,Not a U.S. citizen,Latin America,Florida,FL,In Labor Force,Employed
997,55 to 59 years,Male,White,,Other Relative of Householder,,Never married,Veteran,English only,Speak English well,"Some college, no degree",US Born,US Citizen,State of residence,Florida,FL,In Labor Force,Employed
998,75 to 84 years,Male,White,,Spouse of Householder,,Never married,Non-Veteran,English only,Speak English well,Less than 9th grade,Foreign Born,US Citizen,Latin America,Florida,FL,Not in Labor Force,Not Applicable
