## Code for Generating ACS Metadata

In [1]:
import pandas as pd
import numpy as np
import pygris
from pygris.data import get_census
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Record raw variable names for Spielman et al.'s data and our own.
# `acs_variables` holds the Census API variable IDs requested for this reproduction.
# The order matters: the list is paired position-by-position with
# `spielman_acs_variables` and with the manually written alias list.
acs_variables = [
    "B01002_001E", # median age
    "B03002_001E", # total population of respondents to race/ethnicity
    "B03002_004E", # total Black population
    "B03002_005E", # total Native American population
    "B03002_006E", # total Asian population
    "B03002_012E", # total Latinx population
    "B06001_002E", # total population under 5 years of age
    "B09020_001E", # total population over 65 years of age
    "B01003_001E", # total population
    "B25008_001E", # total population in occupied housing units
    "B25002_002E", # total occupied housing units
    "B25003_003E", # total renter occupied housing units
    "B25002_001E", # total housing units for which occupancy status is known
    "B09020_021E", # total 65+ living in group quarters
    "B01001_026E", # total female population
    "B11001_006E", # total female-headed family households
    "B11001_001E", # total households (for which household type is known)
    "B25002_003E", # total vacant housing units
    "B19025_001E", # aggregate household income
    "B23022_025E", # total males unemployed for last 12 months
    "B23022_049E", # total females unemployed for last 12 months
    "B23022_001E", # total population in the unemployment-by-sex cross-tabulation
    "B17021_002E", # total population below poverty level
    "B17021_001E", # total population for which poverty information is available
    "B25024_010E", # number of mobile home housing units in structure
    "B25024_001E", # total housing units in structure
    "C24010_038E", # total female employed
    "C24010_001E", # total population for which sex and occupation are known
    "B19055_002E", # total households with social security income
    "B19055_001E", # total households for which social security income status is known
    "B09002_002E", # total children in married couple families
    "B09002_001E", # total children for which family type and age are known
    "B19001_017E", # total households with over 200k income
    "B06007_005E", # total Spanish-speakers who speak English less than very well
    "B06007_008E", # total speakers of another language who speak English less than very well
    "B06007_001E", # total population with known language spoken at home and English ability
    "B16010_002E", # total population with less than a high school graduate education
    "B16010_001E", # total for which education, employment, language at home are known
    "C24050_002E", # total population in extractive industries
    "C24050_001E", # total population for which industry is known
    "C24050_029E", # total people in service occupations
    "B08201_002E", # total households with no available vehicle
    "B08201_001E", # total households for which vehicle status and family size are known
    "B25064_001E", # median gross rent
    "B25077_001E"  # median home value
]
    
# Labels used for the same variables in Spielman et al.'s data, listed in the
# same order as `acs_variables` (the two lists are paired by position).
spielman_acs_variables = [
    "ACS12_5yr_B01002001", # median age
    "ACS12_5yr_B03002001", # total population of respondents to race/ethnicity
    "ACS12_5yr_B03002004", # total Black population
    "ACS12_5yr_B03002005", # total Native American population
    "ACS12_5yr_B03002006", # total Asian population
    "ACS12_5yr_B03002012", # total Latinx population
    "ACS12_5yr_B06001002", # total population under 5 years of age
    "ACS12_5yr_B09020001", # total population over 65 years of age
    "ACS12_5yr_B01003001", # total population
    "ACS12_5yr_B25008001", # total population in occupied housing units
    "ACS12_5yr_B25002002", # total occupied housing units
    "ACS12_5yr_B25003003", # total renter occupied housing units
    "ACS12_5yr_B25002001", # total housing units for which occupancy status is known
    "ACS12_5yr_B09020021", # total 65+ living in group quarters
    "ACS12_5yr_B01001026", # total female population
    "ACS12_5yr_B11001006", # total female-headed family households
    "ACS12_5yr_B11001001", # total households (for which household type is known)
    "ACS12_5yr_B25002003", # total vacant housing units
    "ACS12_5yr_B19025001", # aggregate household income
    "ACS12_5yr_B23022025", # total males unemployed for last 12 months
    "ACS12_5yr_B23022049", # total females unemployed for last 12 months
    "ACS12_5yr_B23022001", # total population in the unemployment-by-sex cross-tabulation
    "ACS12_5yr_B17021002", # total population below poverty level
    "ACS12_5yr_B17021001", # total population for which poverty information is available
    "ACS12_5yr_B25024010", # number of mobile home housing units in structure
    "ACS12_5yr_B25024001", # total housing units in structure
    "ACS12_5yr_C24010038", # total female employed
    "ACS12_5yr_C24010001", # total population for which sex and occupation are known
    "ACS12_5yr_B19055002", # total households with social security income
    "ACS12_5yr_B19055001", # total households for which social security income status is known
    "ACS12_5yr_B09002002", # total children in married couple families
    "ACS12_5yr_B09002001", # total children for which family type and age are known
    "ACS12_5yr_B19001017", # total households with over 200k income
    "ACS12_5yr_B06007005", # total Spanish-speakers who speak English less than very well
    "ACS12_5yr_B06007008", # total speakers of another language who speak English less than very well
    "ACS12_5yr_B06007001", # total population with known language spoken at home and English ability
    "ACS12_5yr_B16010002", # total population with less than a high school graduate education
    "ACS12_5yr_B16010001", # total for which education, employment, language at home are known
    "ACS12_5yr_C24050002", # total population in extractive industries
    "ACS12_5yr_C24050001", # total population for which industry is known
    "ACS12_5yr_C24050029", # total people in service occupations
    "ACS12_5yr_B08201002", # total households with no available vehicle
    "ACS12_5yr_B08201001", # total households for which vehicle status and family size are known
    "ACS12_5yr_B25064001", # median gross rent
    "ACS12_5yr_B25077001", # median home value
]

In [3]:
# Download the requested ACS variables for every county from the Census API.
# Available datasets are listed at https://api.census.gov/data.html; the IDs in
# `acs_variables` come from the 2012 ACS 5-year variable list at
# https://api.census.gov/data/2012/acs/acs5/variables.html
counties_detailed = get_census(
    dataset="acs/acs5",          # name of the dataset on the Census API
    variables=acs_variables,     # list of variable IDs to request
    year=2012,                   # end-year of the 5-year ACS sample
    params={"for": "county:*"},  # query parameters sent to the API
    guess_dtypes=True,
    return_geoid=True,
)

In [4]:
# Build the ACS data dictionary: one row per requested variable, combining the
# Census Bureau's published label/concept, Spielman et al.'s label, a
# hand-written alias, and summary statistics computed from `counties_detailed`.

# Document metadata: read the variable table published for the 2012 ACS 5-year API
var_list = pd.read_html("http://api.census.gov/data/2012/acs/acs5/variables.html")[0]

# Convert list to DataFrame
acs_meta = pd.DataFrame( {"Name": acs_variables} )

# Create spielman labels (paired with acs_variables by position)
acs_meta["Spielman Label"] = spielman_acs_variables

# Manually create alias column (same order as acs_variables)
acs_meta["Alias"] = ["median age",
                      "total population of respondents to race/ethnicity",
                      "total Black population",
                      "total Native American population",
                      "total Asian population",
                      "total Latinx population",
                      "total population under 5 years of age",
                      "total population over 65 years of age",
                      "total population",
                      "total population in occupied housing units",
                      "total occupied housing units",
                      "total renter occupied housing units",
                      "total housing units for which occupancy status is known",
                      "total 65+ living in group quarters",
                      "total female population",
                      "total female-headed family households",
                      "total households for which household type is known",
                      "total vacant housing units",
                      "aggregate household income",
                      "total males unemployed for last 12 months",
                      "total females unemployed for last 12 months",
                      "total population for which unemployment and sex cross-tabulations known",
                      "total population below poverty level",
                      "total population for which poverty information available",
                      "number of mobile home housing units in structure",
                      "total housing units in structure",
                      "total female employed",
                      "total population for which sex and occupation known",
                      "total households with social security income",
                      "total households for which social security income status known",
                      "total children in married couple families",
                      "total children for which family type and age are known",
                      "total households with over 200k income",
                      "total Spanish-speakers who speak english less than very well",
                      "total people who speak another language and speak English less than very well",
                      "total population with known language spoken at home and English ability",
                      "total population with less than a high school graduate education",
                      "total for which education, employment, language at home known",
                      "total population in extractive industries",
                      "total population for which industry known",
                      "total people in service occupations",
                      "total households with no available vehicle",
                      "total households for which vehicle status and family size known",
                      "median gross rent",
                      "median home value"]

# Join metadata from census website. A left join keeps every requested variable
# (in its original order) even if the Census table has no entry for it, instead
# of silently dropping the row.
acs_meta = acs_meta.merge(var_list, on = "Name", how = "left")[["Name", "Spielman Label", "Label", "Concept", "Alias"]]

# Merge information from two columns into one
acs_meta["Definition"] = acs_meta["Concept"] + ": " + acs_meta["Label"]
acs_meta = acs_meta.drop( ["Concept", "Label"], axis = 1 )

# Rename name to label
acs_meta = acs_meta.rename(columns={"Name": "Reproduction Label"})


def _format_bound(value):
    """Render a minimum/maximum as text, dropping the ".0" from whole numbers.

    Non-integral values and NaN (whose remainder is never 0) are passed to
    str() unchanged.
    """
    if value % 1 == 0:
        return str(int(value))
    return str(value)


# The columns below are built in one pass each and assigned as whole columns.
# Element-wise chained assignment (frame[col][i] = ...) is avoided because it
# does not update the frame under pandas Copy-on-Write.

# Create data type column: dtype pandas holds each downloaded variable in
acs_meta["Type"] = [
    str(counties_detailed[name].dtype) for name in acs_meta["Reproduction Label"]
]

# Create domain column: "min - max" of the observed county values
acs_meta["Domain"] = [
    _format_bound(counties_detailed[name].min())
    + " - "
    + _format_bound(counties_detailed[name].max())
    for name in acs_meta["Reproduction Label"]
]

# Create missing data column (left empty; np.nan rather than the np.NaN alias,
# which was removed in NumPy 2.0)
acs_meta["Missing Data Value(s)"] = np.nan

# Create missing data frequency: number of missing values per variable, stored
# as integers. isna() works for every dtype, unlike np.isnan.
acs_meta["Missing Data Frequency"] = [
    int(counties_detailed[name].isna().sum()) for name in acs_meta["Reproduction Label"]
]

In [5]:
# Data-dictionary row for the GEOID identifier column, which is not one of the
# ACS variables. Each value is keyed by its column name rather than by the
# position of acs_meta's columns, so a change in column order cannot silently
# mislabel a value.
geoid = pd.DataFrame([{
    "Reproduction Label": "GEOID",
    "Spielman Label": 'Geo_FIPS',
    "Alias": 'FIPS code unique identifier',
    "Definition": 'Unique code for every county and county-equivalent in USA',
    "Type": 'string',
    "Domain": '01001 - 56045',
    "Missing Data Value(s)": 'None',
    "Missing Data Frequency": 0,
}])

In [6]:
# Put the GEOID row first, followed by the ACS variable rows. ignore_index
# renumbers the rows 0..n-1; without it both frames contribute a row labelled 0
# and the duplicated label ends up in the exported index column.
acs_meta = pd.concat([geoid, acs_meta], ignore_index=True)

In [7]:
# Save the finished data dictionary as a CSV file (relative path, so it lands
# in the current working directory).
acs_meta.to_csv(path_or_buf='ACS_2012_data_dictionary.csv')