In [1]:
"""
# ACS Data Pull
Last Updated: 12/21/22
@original author: Emanuel Lucban
@revised: Michael Ma
"""

'\n# ACS Data Pull\nLast Updated: 12/21/22\n@original author: Emanuel Lucban\n@revised: Michael Ma\n'

In [2]:
import pandas as pd
import numpy as np
import json
import re 
import os
import requests
from IPython.display import display, HTML

## ACS Detailed Summary File (B, C Table) 2010-2021

In [None]:
def clean_header(yr_id, header, acs_vars=None):
    """Translate ACS detailed-table variable ids into short labels.

    Args:
        yr_id: Year of the ACS release, e.g. 2021.
        header: List of ACS variable ids, e.g. ['B01001_001E'].
        acs_vars: Optional, already-parsed ``variables.json`` payload (a dict
            whose 'variables' key maps variable id -> definition). When
            omitted, the ACS 5-year definitions for ``yr_id`` are downloaded
            from https://api.census.gov/data/{yr_id}/acs/acs5/variables.json.
            Pass a cached payload to avoid re-downloading the full catalogue
            on every call.
    Returns:
        A list with one entry per element of ``header``, in the same order.
        Each entry is the last '!!'-separated segment of the variable's label
        (e.g. 'Estimate!!Total:' -> 'Total:'). Ids that have no definition
        are returned unchanged.
    Raises:
        requests.HTTPError: If the definitions have to be downloaded and the
            Census API answers with an error status.
    """
    if acs_vars is None:
        var_url = f'https://api.census.gov/data/{yr_id}/acs/acs5/variables.json'
        # Without a timeout requests waits forever on a stalled connection.
        response = requests.get(var_url, timeout=60)
        # Fail with the HTTP status instead of an opaque JSON decoding error.
        response.raise_for_status()
        acs_vars = response.json()

    fixed_header = []
    for var_id in header:
        try:
            label = acs_vars['variables'][var_id]['label']
        except KeyError:
            # Unknown id (or payload without definitions): keep the raw name.
            label = var_id

        # Labels are hierarchical paths joined by '!!'; only the leaf is kept.
        # A label without '!!' splits into a single element, i.e. itself.
        fixed_header.append(label.split('!!')[-1])
    return fixed_header



''' Example call (examine each element used inside the clean_header function)
# clean_header(2021,['B01001_001E'])
# var_url = 'https://api.census.gov/data/2021/acs/acs5/variables.json'
# acs_vars = requests.get(var_url).json()
# acs_vars['variables']['B01001_001E']['label']
'''

In [3]:
def acs_data(yr_id, table, est_type, summary_level, state, county, api_key):
    """Download one ACS detailed (B/C) table from the Census API in long format.

    Args:
        yr_id: Year of the ACS release, e.g. 2021.
        table: ACS table (group) id, e.g. 'B01001'.
        est_type: Release type, one of '1Y', '5Y' or '1YS'. It selects the
            ``acs1``, ``acs5`` or ``acsse`` endpoint respectively.
        summary_level: Three-character summary level code; must be a key of
            the ``summary_levels`` mapping below (e.g. '140' for tracts).
        state: State FIPS code, used by the state-scoped summary levels.
        county: County FIPS code, used by summary levels '140' and '150' only.
        api_key: Census API key. When empty, the request is sent without one.
    Returns:
        DataFrame with one row per geography and table line. Columns are the
        geography columns of the summary level, 'estimate', 'moe', 'geoid',
        'line_desc', 'line_number', 'yr', 'summary_level', 'release_type_id'
        and 'subject_table_name'. 'estimate' and 'moe' are kept exactly as
        delivered by the API (no numeric cast is applied here).
    Raises:
        KeyError: If ``est_type`` or ``summary_level`` is not supported.
        requests.HTTPError: If the Census API answers with an error status.
    """
    est_types = {'1Y': '1', '5Y': '5', '1YS': 'se'}
    # Geography clause of the API call for each supported summary level.
    summary_levels = {'010': '&for=us:*',
                      '040': f'&for=state:{state}',
                      '050': f'&for=county:*&in=state:{state}',
                      '060': f'&for=county%20subdivision:*&in=state:{state}%20county:*',
                      '140': f'&for=tract:*&in=state:{state}%20county:{county}',
                      '150': f'&for=block%20group:*&in=state:{state}%20county:{county}%20tract:*',
                      '160': f'&for=place:*&in=state:{state}',
                      '310': '&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area:*',
                      '330': '&for=combined%20statistical%20area:*',
                      '795': f'&for=public%20use%20microdata%20area:*&in=state:{state}',
                      '860': '&for=zip%20code%20tabulation%20area:*',
                      '950': f'&for=school%20district%20(elementary):*&in=state:{state}',
                      '960': f'&for=school%20district%20(secondary):*&in=state:{state}',
                      '970': f'&for=school%20district%20(unified):*&in=state:{state}'}
    # Geography columns of the response that are kept for each summary level.
    # NOTE(review): '060' queries county:* but does not keep a 'county'
    # column - confirm that dropping it is intended.
    geo_col = {'010': ['us'],
               '040': ['state'],
               '050': ['state', 'county'],
               '060': ['state', 'county subdivision'],
               '140': ['state', 'county', 'tract'],
               '150': ['state', 'county', 'tract', 'block group'],
               '160': ['state', 'place'],
               '310': ['metropolitan statistical area/micropolitan statistical area'],
               '330': ['combined statistical area'],
               '795': ['state', 'public use microdata area'],
               '860': ['zip code tabulation area'],
               '950': ['state', 'school district (elementary)'],
               '960': ['state', 'school district (secondary)'],
               '970': ['state', 'school district (unified)']}

    # Example call to examine the json structure:
    # https://api.census.gov/data/2021/acs/acs5?get=NAME,group(B01001)&for=tract:*&in=state:06%20county:073
    api_url = (f'https://api.census.gov/data/{yr_id}/acs/acs{est_types[est_type]}'
               f'?get=NAME,group({table})' + summary_levels[summary_level])
    if api_key:
        # Only send a key when one was supplied instead of an empty '&key='.
        api_url += '&key=' + api_key

    # Without a timeout requests waits forever on a stalled connection.
    response = requests.get(api_url, timeout=60)
    # Fail with the HTTP status instead of an opaque JSON decoding error.
    response.raise_for_status()
    acs_json = response.json()

    # First row is the header, the geographies start from the second row.
    acs_df = pd.DataFrame(acs_json[1:], columns=acs_json[0])

    # Keep only estimate (E) and margin-of-error (M) columns, e.g.
    # B01001_003E / B01001_003M. The table part is matched with \w+ rather
    # than a fixed width so ids longer than six characters (e.g. B01001A)
    # are kept too. Annotation columns (...EA / ...MA) do not match.
    value_col = re.compile(r'^\w+_\d{3}[EM]$')
    filtered_cols = [col for col in acs_json[0] if value_col.match(col)]

    id_cols = geo_col[summary_level] + ['GEO_ID']
    acs_df = acs_df[id_cols + filtered_cols]

    # Wide -> long: one row per geography and variable column.
    melted = acs_df.melt(id_vars=id_cols)
    # The last character is the value type (E or M); the rest is the variable.
    melted['var_type'] = melted['variable'].str[-1:]
    melted['var'] = melted['variable'].str[:-1]
    melted = melted.drop(columns='variable')

    # Move var_type (last index level) into the columns so every row carries
    # both E and M, then flatten the column index and give friendly names.
    acs_unpivot = melted.set_index(['var'] + id_cols + ['var_type']).unstack(level=-1)
    acs_unpivot.columns = acs_unpivot.columns.droplevel(0)
    acs_unpivot.columns.name = None
    acs_unpivot = acs_unpivot.reset_index(drop=False).rename(columns={'E': 'estimate', 'M': 'moe'})

    # clean_header returns one description per id, in the order given, so the
    # ids and their descriptions can be paired positionally.
    var_ids = acs_unpivot['var'].unique().tolist()
    dim_line = clean_header(yr_id, [var_id + 'E' for var_id in var_ids])
    tab_line = pd.DataFrame({'var': var_ids, 'line_desc': dim_line})

    # The API's GEO_ID starts with a 7-character summary level code, the geo
    # file uses a 5-character one: rebuild it as <summary_level>00 + the rest.
    acs_unpivot['geoid'] = summary_level + '00' + acs_unpivot['GEO_ID'].str[7:]

    # Attach the line descriptions and derive the line number from the last
    # three digits of the variable name.
    acs_unpivot = pd.merge(acs_unpivot, tab_line, how='left', on='var').sort_values(geo_col[summary_level] + ['var'])
    acs_unpivot['line_number'] = acs_unpivot['var'].str[-3:].astype('float')
    acs_unpivot['yr'] = yr_id
    acs_unpivot['summary_level'] = summary_level
    acs_unpivot['release_type_id'] = est_type
    acs_unpivot['subject_table_name'] = table
    acs_unpivot.drop(columns=['var', 'GEO_ID'], inplace=True)

    return acs_unpivot

## ACS Subject Summary File (S Table) Michael's attempt

In [None]:
def clean_header_s_table(yr_id, header, acs_vars=None):
    """Translate ACS subject-table variable ids into short labels.

    Args:
        yr_id: Year of the ACS release, e.g. 2021.
        header: List of subject-table variable ids, e.g. ['S1601_C05_023E'].
        acs_vars: Optional, already-parsed ``variables.json`` payload (a dict
            whose 'variables' key maps variable id -> definition). When
            omitted, the ACS 5-year subject definitions for ``yr_id`` are
            downloaded from
            https://api.census.gov/data/{yr_id}/acs/acs5/subject/variables.json.
            Pass a cached payload to avoid re-downloading the full catalogue
            on every call.
    Returns:
        A list with one entry per element of ``header``, in the same order.
        Each entry is the last '!!'-separated segment of the variable's
        label. Ids that have no definition are returned unchanged.
    Raises:
        requests.HTTPError: If the definitions have to be downloaded and the
            Census API answers with an error status.
    """
    if acs_vars is None:
        var_url_s = f'https://api.census.gov/data/{yr_id}/acs/acs5/subject/variables.json'
        # Without a timeout requests waits forever on a stalled connection.
        response = requests.get(var_url_s, timeout=60)
        # Fail with the HTTP status instead of an opaque JSON decoding error.
        response.raise_for_status()
        acs_vars = response.json()

    fixed_header = []
    for var_id in header:
        try:
            label = acs_vars['variables'][var_id]['label']
        except KeyError:
            # Unknown id (or payload without definitions): keep the raw name.
            label = var_id

        # The descriptions are separated by '!!'; the last element is used as
        # the field header. A label without '!!' is returned as-is.
        fixed_header.append(label.split('!!')[-1])

    return fixed_header



''' Example call (examine each element used inside the clean_header_s_table function)
# clean_header_s_table(2021,['S1601_C05_023E'])
# var_url = 'https://api.census.gov/data/2021/acs/acs5/subject/variables.json'
# acs_vars = requests.get(var_url).json()
# acs_vars['variables']['S1601_C05_023E']['label']
'''

In [89]:
def acs_s_data(yr_id, table, est_type, summary_level, state, county, api_key):
    """Download one ACS subject (S) table from the Census API in long format.

    Args:
        yr_id: Year of the ACS release, e.g. 2021.
        table: ACS subject table (group) id, e.g. 'S1601'.
        est_type: Release type, one of '1Y', '5Y' or '1YS'. It selects the
            ``acs1/subject``, ``acs5/subject`` or ``acsse/subject`` endpoint.
            NOTE(review): confirm the '1YS' endpoint exists for subject tables.
        summary_level: Three-character summary level code; must be a key of
            the ``summary_levels`` mapping below (e.g. '140' for tracts).
        state: State FIPS code, used by the state-scoped summary levels.
        county: County FIPS code, used by summary levels '140' and '150' only.
        api_key: Census API key. When empty, the request is sent without one.
    Returns:
        DataFrame with one row per geography and table variable. Columns are
        the geography columns of the summary level, 'estimate', 'moe',
        'geoid', 'line_desc', 'line_number', 'yr', 'summary_level',
        'release_type_id' and 'subject_table_name'. 'estimate' and 'moe' are
        kept exactly as delivered by the API (no numeric cast is applied).
    Raises:
        KeyError: If ``est_type`` or ``summary_level`` is not supported.
        requests.HTTPError: If the Census API answers with an error status.
    """
    est_types = {'1Y': '1', '5Y': '5', '1YS': 'se'}
    # Geography clause of the API call for each supported summary level.
    summary_levels = {'010': '&for=us:*',
                      '040': f'&for=state:{state}',
                      '050': f'&for=county:*&in=state:{state}',
                      '060': f'&for=county%20subdivision:*&in=state:{state}%20county:*',
                      '140': f'&for=tract:*&in=state:{state}%20county:{county}',
                      '150': f'&for=block%20group:*&in=state:{state}%20county:{county}%20tract:*',
                      '160': f'&for=place:*&in=state:{state}',
                      '310': '&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area:*',
                      '330': '&for=combined%20statistical%20area:*',
                      '795': f'&for=public%20use%20microdata%20area:*&in=state:{state}',
                      '860': '&for=zip%20code%20tabulation%20area:*',
                      '950': f'&for=school%20district%20(elementary):*&in=state:{state}',
                      '960': f'&for=school%20district%20(secondary):*&in=state:{state}',
                      '970': f'&for=school%20district%20(unified):*&in=state:{state}'}
    # Geography columns of the response that are kept for each summary level.
    # NOTE(review): '060' queries county:* but does not keep a 'county'
    # column - confirm that dropping it is intended.
    geo_col = {'010': ['us'],
               '040': ['state'],
               '050': ['state', 'county'],
               '060': ['state', 'county subdivision'],
               '140': ['state', 'county', 'tract'],
               '150': ['state', 'county', 'tract', 'block group'],
               '160': ['state', 'place'],
               '310': ['metropolitan statistical area/micropolitan statistical area'],
               '330': ['combined statistical area'],
               '795': ['state', 'public use microdata area'],
               '860': ['zip code tabulation area'],
               '950': ['state', 'school district (elementary)'],
               '960': ['state', 'school district (secondary)'],
               '970': ['state', 'school district (unified)']}

    # Example call to examine the json structure:
    # https://api.census.gov/data/2021/acs/acs5/subject?get=NAME,group(S1601)&for=tract:*&in=state:06%20county:073
    api_url = (f'https://api.census.gov/data/{yr_id}/acs/acs{est_types[est_type]}/subject'
               f'?get=NAME,group({table})' + summary_levels[summary_level])
    if api_key:
        # Only send a key when one was supplied instead of an empty '&key='.
        api_url += '&key=' + api_key

    # Without a timeout requests waits forever on a stalled connection.
    response = requests.get(api_url, timeout=60)
    # Fail with the HTTP status instead of an opaque JSON decoding error.
    response.raise_for_status()
    acs_json = response.json()

    # First row is the header, the geographies start from the second row.
    acs_df = pd.DataFrame(acs_json[1:], columns=acs_json[0])

    # Keep only estimate (E) and margin-of-error (M) columns, e.g.
    # S1601_C05_023E / S1601_C05_023M. The table part is matched with \w+
    # rather than a fixed width so ids longer than five characters are kept
    # too. Annotation columns (...EA / ...MA) do not match.
    value_col = re.compile(r'^\w+_\w{3}_\d{3}[EM]$')
    filtered_cols = [col for col in acs_json[0] if value_col.match(col)]

    id_cols = geo_col[summary_level] + ['GEO_ID']
    acs_df = acs_df[id_cols + filtered_cols]

    # Wide -> long: one row per geography and variable column.
    melted = acs_df.melt(id_vars=id_cols)
    # The last character is the value type (E or M); the rest is the variable.
    melted['var_type'] = melted['variable'].str[-1:]
    melted['var'] = melted['variable'].str[:-1]
    melted = melted.drop(columns='variable')

    # Move var_type (last index level) into the columns so every row carries
    # both E and M, then flatten the column index and give friendly names.
    acs_unpivot = melted.set_index(['var'] + id_cols + ['var_type']).unstack(level=-1)
    acs_unpivot.columns = acs_unpivot.columns.droplevel(0)
    acs_unpivot.columns.name = None
    acs_unpivot = acs_unpivot.reset_index(drop=False).rename(columns={'E': 'estimate', 'M': 'moe'})

    # clean_header_s_table returns one description per id, in the order
    # given, so the ids and their descriptions can be paired positionally.
    var_ids = acs_unpivot['var'].unique().tolist()
    dim_line = clean_header_s_table(yr_id, [var_id + 'E' for var_id in var_ids])
    tab_line = pd.DataFrame({'var': var_ids, 'line_desc': dim_line})

    # The API's GEO_ID starts with a 7-character summary level code, the geo
    # file uses a 5-character one: rebuild it as <summary_level>00 + the rest.
    acs_unpivot['geoid'] = summary_level + '00' + acs_unpivot['GEO_ID'].str[7:]

    # Attach the line descriptions and derive the line number from the last
    # three digits of the variable name.
    # NOTE(review): subject variables also carry a column id (the 'C05' in
    # S1601_C05_023). It is not part of line_number and 'var' is dropped
    # below, so rows that differ only by column id may be indistinguishable
    # in the result - confirm whether a column identifier should be kept.
    acs_unpivot = pd.merge(acs_unpivot, tab_line, how='left', on='var').sort_values(geo_col[summary_level] + ['var'])
    acs_unpivot['line_number'] = acs_unpivot['var'].str[-3:].astype('float')
    acs_unpivot['yr'] = yr_id
    acs_unpivot['summary_level'] = summary_level
    acs_unpivot['release_type_id'] = est_type
    acs_unpivot['subject_table_name'] = table
    acs_unpivot.drop(columns=['var', 'GEO_ID'], inplace=True)

    return acs_unpivot


'''
test = acs_s_data(2021, 'S1601', '5Y', '140', '06', '073', '')
'''