# Downloading Census Data for Transportation, Infrastructure, and Housing

I wanted to use the "censusdis" and/or "census" packages to download Census data 
through the API, but we don't have access to them in EDL. I asked Brock Webb about it.

Brock said, "That can be done via a remedy request to install the package. 
The bad thing is... even if you do that, I didn't think EDL had open Internet access, 
so it doesn't work. That was a struggle I had with the python packages. They made it 
very easy to get the data as demonstrated today. However, I couldn't do that, so I 
had to get it manually from data.census.gov and figure out how to get what I wanted 
into a table so I could import the file."

So instead of using a package, I'm using the following code.

If there is time, these are the things I would add to the code:

1. Full distributions for home value and time of departure
2. Where 1-year data are available, use 1-year instead of 5-year (applies to county and tribal areas)
3. Add block group data
4. Add more "loops" and "if-else" code so I don't need as many functions
5. Make adjustments for geo changes over time
6. Make adjustments for Puerto Rico
7. Make adjustments for tribal area name
8. QC

In [1]:
# Setting up libraries
import requests
import pandas as pd
import numpy as np

#Create a blank dataframe for appending data
# Two independent, empty accumulators: every download function appends its rows to one of them
house, internet = pd.DataFrame(), pd.DataFrame()

# year loop for different geos
# Every list is newest-first. `vina` is the full 2022-2015 span; the others are carved out of it.
vina = [str(yr) for yr in range(2022, 2014, -1)]
vin = [yr for yr in vina if yr != "2020"]   # full span without 2020
vinb = vina[:4]                             # 2022-2019
vinc = vina[4:]                             # 2018-2015
vind = vin[:5]                              # 2022-2017 without 2020
vine = vina[4:6]                            # 2018-2017

# variables for data profiles
# Variables requested from the data-profile endpoints, in the same order as the
# labels in col_names_dp: each measure is an estimate followed by its margin of error.
get_vars_dp = [
    "NAME",
    "DP04_0001E", "DP04_0001M",    # total HUs
    "DP04_0089E", "DP04_0089M",    # median home value
    "DP04_0134E", "DP04_0134M",    # rent
    "DP03_0019PE", "DP03_0019PM",  # drove alone
    "DP03_0020PE", "DP03_0020PM",  # carpooled
    "DP03_0021PE", "DP03_0021PM",  # public transport
    "DP03_0022PE", "DP03_0022PM",  # walked
    # average commute: the second entry used to repeat the estimate (DP03_0025E),
    # so the "_moe" column was filled with the estimate instead of the margin of error
    "DP03_0025E", "DP03_0025M",
    "DP04_0077PE", "DP04_0077PM",  # 1 or less
    "DP04_0078PE", "DP04_0078PM",  # 1 to 1.5
    "DP04_0079PE", "DP04_0079PM",  # 1.51 or more
    "DP04_0063PE", "DP04_0063PM",  # utility gas
    "DP04_0064PE", "DP04_0064PM",  # tank gas
    "DP04_0065PE", "DP04_0065PM",  # electricity
    "DP04_0066PE", "DP04_0066PM",  # kerosene
    "DP04_0069PE", "DP04_0069PM",  # solar
    "DP04_0071PE", "DP04_0071PM",  # no fuel
    "DP03_0034E", "DP03_0034M",    # construction
    "DP04_0003PE", "DP04_0003PM",  # vacancy
    "GEO_ID",
]

# variable names for subject tables
# Variables requested from the subject-table endpoints, in the same order as the
# labels in col_names_sub: each measure is an estimate followed by its margin of error.
get_vars_sub = [
    "NAME",
    "S2801_C02_017E", "S2801_C02_017M",  # broadband
    "S2801_C02_015E", "S2801_C02_015M",  # cellular
    # satellite: the second entry used to repeat the estimate (S2801_C02_018E),
    # so the "_moe" column was filled with the estimate instead of the margin of error
    "S2801_C02_018E", "S2801_C02_018M",
    "GEO_ID",
]
    
# column names for data profiles
# Output labels for the data-profile pull: the geography name, then for every
# measure its estimate column followed by the matching "<measure>_moe" column.
col_names_dp = ["Geo_name"] + [
    measure + suffix
    for measure in (
        "Total_housing_units",
        "Median_home_value",
        "Median_gross_rent",
        "Percent_drove_alone",
        "Percent_carpooled",
        "Percent_public_transportation",
        "Percent_walked",
        "Average_commute_time",
        "Percent_with_1.00_occupent_or_less",
        "Percent_with_1.01_to_1.50_occupents",
        "Percent_with_1.51_or_more_occupents",
        "Percent_with_utility_gas",
        "Percent_with_bottled_tank_lp_gas",
        "Percent_with_electricity",
        "Percent_with_oil_kerosene_etc",
        "Percent_with_solar",
        "Percent_with_no_fuel",
        "Number_employed_construction",
        "Percent_vacant",
    )
    for suffix in ("", "_moe")
]

# column names for subject tables
# Output labels for the subject-table pull: the geography name, then for every
# measure its estimate column followed by the matching "<measure>_moe" column.
col_names_sub = ['Geo_name'] + [
    measure + suffix
    for measure in (
        'Percent_with_broadband_cable_fiber_esl',
        'Percent_with_cellular_data_plan',
        'Percent_with_satellite_internet',
    )
    for suffix in ('', '_moe')
]

In [2]:
#This is the function for the nation for data profiles

def api_us(year, dataset):
    """Download national data-profile values for one year and append them to ``house``.

    Args:
        year: Vintage as a string (e.g. ``"2022"``); used as a URL path segment
            and stored in the ``year`` column.
        dataset: Dataset path below the API host, e.g. ``"acs/acs1/profile?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``house`` DataFrame with the new rows appended.
    """
    global house

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_dp),
        "for": "us:*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position, with the two trailing columns named GEO_ID and fips.
    col_names = col_names_dp + ['GEO_ID', "fips"]
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "national"
    df["state_fips"] = ""
    df["county_fips"] = ""
    df["tract_fips"] = ""
    df["GEO_ID"] = "us"
    df.drop(columns="fips", inplace=True)

    # Append to the running housing table
    house = pd.concat([house, df], ignore_index=True)

In [3]:
# National pull from the 1-year profile for each vintage in `vin` (which omits 2020)
for yr in vin:
    api_us(yr, "acs/acs1/profile?")

# 2020 comes from the 5-year profile instead
# (could make vin loop a dictionary and not have to run this part...)
api_us("2020", "acs/acs5/profile?")

In [4]:
#This is the function for region and division for data profiles

def api_rd(year, dataset, geo_for, area_type, geo_id):
    """Download region- or division-level data-profile values and append them to ``house``.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs1/profile?"``.
        geo_for: Value of the API ``for`` predicate, e.g. ``"region:*"``.
        area_type: Label written to the ``area_type`` column.
        geo_id: Column selector (the callers in this file pass ``["fips"]``)
            whose values are copied into ``GEO_ID``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``house`` DataFrame with the new rows appended.
    """
    global house

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_dp),
        "for": geo_for,
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position, with the two trailing columns named GEO_ID and fips.
    col_names = col_names_dp + ['GEO_ID', "fips"]
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = area_type
    df["state_fips"] = ""
    df["county_fips"] = ""
    df["tract_fips"] = ""
    # Replace the API's GEO_ID with the column(s) chosen by the caller
    df["GEO_ID"] = df[geo_id]
    df.drop(columns="fips", inplace=True)

    # Append to the running housing table
    house = pd.concat([house, df], ignore_index=True)

In [5]:
# Region-level pull: arguments that are the same for every call
region_args = dict(geo_for="region:*", area_type="region", geo_id=["fips"])

# 1-year profile for each vintage in `vin` (which omits 2020)
for yr in vin:
    api_rd(year=yr, dataset="acs/acs1/profile?", **region_args)

# 2020 comes from the 5-year profile instead
# (could make vin loop a dictionary and not have to run this part...)
api_rd(year="2020", dataset="acs/acs5/profile?", **region_args)

In [6]:
# Division-level pull: arguments that are the same for every call
division_args = dict(geo_for="division:*", area_type="division", geo_id=["fips"])

# 1-year profile for each vintage in `vin` (which omits 2020)
for yr in vin:
    api_rd(year=yr, dataset="acs/acs1/profile?", **division_args)

# 2020 comes from the 5-year profile instead
# (could make vin loop a dictionary and not have to run this part...)
api_rd(year="2020", dataset="acs/acs5/profile?", **division_args)

In [7]:
#This is the function for state for data profiles

def api_st(year, dataset):
    """Download state-level data-profile values for one year and append them to ``house``.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs1/profile?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``house`` DataFrame with the new rows appended.
    """
    global house

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_dp),
        "for": "state:*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position.
    # NOTE(review): the column labelled "fips" is sliced at [9:11] below, so it
    # presumably holds the long geographic identifier string - confirm against a response.
    col_names = col_names_dp + ["fips", 'GEO_ID']
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Characters 9-10 of the identifier are used as the state code
    state_code = df['fips'].str[9:11]

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "state"
    df["state_fips"] = state_code
    df["county_fips"] = ""
    df["tract_fips"] = ""
    df["GEO_ID"] = state_code
    df.drop(columns="fips", inplace=True)

    # Append to the running housing table
    house = pd.concat([house, df], ignore_index=True)

In [8]:
# State pull from the 1-year profile for each vintage in `vin` (which omits 2020)
for yr in vin:
    api_st(yr, "acs/acs1/profile?")

# 2020 comes from the 5-year profile instead
# (could make vin loop a dictionary and not have to run this part...)
api_st("2020", "acs/acs5/profile?")

In [9]:
#This is the function for county for data profiles

def api_cty(year, dataset):
    """Download county-level data-profile values for one year and append them to ``house``.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs5/profile?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``house`` DataFrame with the new rows appended.
    """
    global house

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_dp),
        "for": "county:*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position, with the trailing columns named GEO_ID, state_fips, county_fips.
    col_names = col_names_dp + ['GEO_ID', "state_fips", "county_fips"]
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "county"
    df["tract_fips"] = ""
    # Keep characters 9-13 of the identifier (presumably state + county code)
    df["GEO_ID"] = df['GEO_ID'].str[9:14]

    # Append to the running housing table
    house = pd.concat([house, df], ignore_index=True)

In [10]:
# County pull from the 5-year profile for every vintage in `vina`
for yr in vina:
    api_cty(yr, "acs/acs5/profile?")

In [11]:
#This is the function for tribal areas for data profiles (2022 - 2019)

def api_aian(year, dataset):
    """Download tribal-area data-profile values for one year and append them to ``house``.

    This variant expects exactly two trailing columns after the requested variables.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs5/profile?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``house`` DataFrame with the new rows appended.
    """
    global house

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_dp),
        "for": "american indian area/alaska native area (reservation or statistical entity only):*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position.
    # NOTE(review): the column labelled "fips" is sliced at [9:14] below, so it
    # presumably holds the long geographic identifier string - confirm against a response.
    col_names = col_names_dp + ['fips', 'GEO_ID']
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "tribal_area"
    df["state_fips"] = ""
    df["county_fips"] = ""
    df["tract_fips"] = ""
    # Keep the characters from position 9 (up to five) as the area code
    df["GEO_ID"] = df['fips'].str[9:14]
    df.drop(columns="fips", inplace=True)

    # Append to the running housing table
    house = pd.concat([house, df], ignore_index=True)

In [12]:
# Tribal-area pull from the 5-year profile for the vintages in `vinb` (2022 - 2019)
for yr in vinb:
    api_aian(yr, "acs/acs5/profile?")

In [13]:
#This is the function for tribal areas for data profiles (2015 - 2018)

def api_aian(year, dataset):
    """Download tribal-area data-profile values for one year and append them to ``house``.

    This variant expects three trailing columns after the requested variables;
    the extra one is labelled ``R`` and discarded.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs5/profile?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``house`` DataFrame with the new rows appended.
    """
    global house

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_dp),
        "for": "american indian area/alaska native area (reservation or statistical entity only):*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position.
    # NOTE(review): the column labelled "fips" is sliced at [9:14] below, so it
    # presumably holds the long geographic identifier string - confirm against a response.
    col_names = col_names_dp + ['fips', 'GEO_ID', 'R']
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "tribal_area"
    df["state_fips"] = ""
    df["county_fips"] = ""
    df["tract_fips"] = ""
    # Keep the characters from position 9 (up to five) as the area code
    df["GEO_ID"] = df['fips'].str[9:14]
    # Drop the helper columns that are not part of the output layout
    df.drop(columns=['fips', 'R'], inplace=True)

    # Append to the running housing table
    house = pd.concat([house, df], ignore_index=True)

In [14]:
# Tribal-area pull from the 5-year profile for the vintages in `vinc` (2018 - 2015)
for yr in vinc:
    api_aian(yr, "acs/acs5/profile?")

In [15]:
#This is the function for tracts for data profiles

def api_tract(year, dataset):
    """Download tract-level data-profile values for one year and append them to ``house``.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs5/profile?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``house`` DataFrame with the new rows appended.
    """
    global house

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, for every tract within the
    # explicitly listed state codes
    predicates = {
        "get": ",".join(get_vars_dp),
        "for": "tract:*",
        "in": "state:01,02,04,05,06,08,09,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,53,54,55,56,72",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position, with the trailing columns named GEO_ID, state_fips,
    # county_fips and tract_fips.
    col_names = col_names_dp + ['GEO_ID', 'state_fips', 'county_fips', 'tract_fips']
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "tract"
    # Keep characters 9-19 of the identifier (presumably state + county + tract code)
    df["GEO_ID"] = df['GEO_ID'].str[9:20]

    # Append to the running housing table
    house = pd.concat([house, df], ignore_index=True)

In [16]:
# Tract pull from the 5-year profile for every vintage in `vina`
for yr in vina:
    api_tract(yr, "acs/acs5/profile?")

The following functions are for internet data (pulled only back to 2017).

In [17]:
#This is the function for the nation for internet

def api_us(year, dataset):
    """Download national subject-table (internet) values for one year and append them to ``internet``.

    Args:
        year: Vintage as a string (e.g. ``"2022"``); used as a URL path segment
            and stored in the ``year`` column.
        dataset: Dataset path below the API host, e.g. ``"acs/acs1/subject?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``internet`` DataFrame with the new rows appended.
    """
    global internet

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_sub),
        "for": "us:*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position, with the two trailing columns named GEO_ID and fips.
    col_names = col_names_sub + ['GEO_ID', "fips"]
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "national"
    df["state_fips"] = ""
    df["county_fips"] = ""
    df["tract_fips"] = ""
    df["GEO_ID"] = "us"
    df.drop(columns="fips", inplace=True)

    # Append to the running internet table
    internet = pd.concat([internet, df], ignore_index=True)

In [18]:
# National pull from the 1-year subject tables for each vintage in `vind` (which omits 2020)
for yr in vind:
    api_us(yr, "acs/acs1/subject?")

# 2020 comes from the 5-year subject tables instead
api_us("2020", "acs/acs5/subject?")

In [19]:
#This is the function for region and division for internet

def api_rd(year, dataset, geo_for, area_type, geo_id):
    """Download region- or division-level subject-table (internet) values and append them to ``internet``.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs1/subject?"``.
        geo_for: Value of the API ``for`` predicate, e.g. ``"region:*"``.
        area_type: Label written to the ``area_type`` column.
        geo_id: Column selector (the callers in this file pass ``["fips"]``)
            whose values are copied into ``GEO_ID``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``internet`` DataFrame with the new rows appended.
    """
    global internet

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_sub),
        "for": geo_for,
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position, with the two trailing columns named GEO_ID and fips.
    col_names = col_names_sub + ['GEO_ID', "fips"]
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = area_type
    df["state_fips"] = ""
    df["county_fips"] = ""
    df["tract_fips"] = ""
    # Replace the API's GEO_ID with the column(s) chosen by the caller
    df["GEO_ID"] = df[geo_id]
    df.drop(columns="fips", inplace=True)

    # Append to the running internet table
    internet = pd.concat([internet, df], ignore_index=True)

In [20]:
# Region-level pull: arguments that are the same for every call
region_args = dict(geo_for="region:*", area_type="region", geo_id=["fips"])

# 1-year subject tables for each vintage in `vind` (which omits 2020)
for yr in vind:
    api_rd(year=yr, dataset="acs/acs1/subject?", **region_args)

# 2020 comes from the 5-year subject tables instead
api_rd(year="2020", dataset="acs/acs5/subject?", **region_args)

In [21]:
# Division-level pull: arguments that are the same for every call
division_args = dict(geo_for="division:*", area_type="division", geo_id=["fips"])

# 1-year subject tables for each vintage in `vind` (which omits 2020)
for yr in vind:
    api_rd(year=yr, dataset="acs/acs1/subject?", **division_args)

# 2020 comes from the 5-year subject tables instead
api_rd(year="2020", dataset="acs/acs5/subject?", **division_args)

In [22]:
#This is the function for the state for internet

def api_st(year, dataset):
    """Download state-level subject-table (internet) values for one year and append them to ``internet``.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs1/subject?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``internet`` DataFrame with the new rows appended.
    """
    global internet

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_sub),
        "for": "state:*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position.
    # NOTE(review): the column labelled "fips" is sliced at [9:11] below, so it
    # presumably holds the long geographic identifier string - confirm against a response.
    col_names = col_names_sub + ["fips", 'GEO_ID']
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Characters 9-10 of the identifier are used as the state code
    state_code = df['fips'].str[9:11]

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "state"
    df["state_fips"] = state_code
    df["county_fips"] = ""
    df["tract_fips"] = ""
    df["GEO_ID"] = state_code
    df.drop(columns="fips", inplace=True)

    # Append to the running internet table
    internet = pd.concat([internet, df], ignore_index=True)

In [23]:
# State pull from the 1-year subject tables for each vintage in `vind` (which omits 2020)
for yr in vind:
    api_st(yr, "acs/acs1/subject?")

# 2020 comes from the 5-year subject tables instead
api_st("2020", "acs/acs5/subject?")

In [24]:
#This is the function for counties for internet

def api_cty(year, dataset):
    """Download county-level subject-table (internet) values for one year and append them to ``internet``.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs5/subject?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``internet`` DataFrame with the new rows appended.
    """
    global internet

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_sub),
        "for": "county:*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position, with the trailing columns named GEO_ID, state_fips, county_fips.
    col_names = col_names_sub + ['GEO_ID', "state_fips", "county_fips"]
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "county"
    df["tract_fips"] = ""
    # Keep characters 9-13 of the identifier (presumably state + county code)
    df["GEO_ID"] = df['GEO_ID'].str[9:14]

    # Append to the running internet table
    internet = pd.concat([internet, df], ignore_index=True)

In [25]:
# Vintages used for the 5-year internet pulls, newest first (2022 - 2017)
vinf = [str(yr) for yr in range(2022, 2016, -1)]

# County pull from the 5-year subject tables for every vintage in `vinf`
for yr in vinf:
    api_cty(yr, "acs/acs5/subject?")

In [26]:
#This is the function for tribal areas for internet (2019 - 2022)

def api_aian(year, dataset):
    """Download tribal-area subject-table (internet) values for one year and append them to ``internet``.

    This variant expects exactly two trailing columns after the requested variables.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs5/subject?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``internet`` DataFrame with the new rows appended.
    """
    global internet

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_sub),
        "for": "american indian area/alaska native area (reservation or statistical entity only):*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position.
    # NOTE(review): the column labelled "fips" is sliced at [9:14] below, so it
    # presumably holds the long geographic identifier string - confirm against a response.
    col_names = col_names_sub + ["fips", 'GEO_ID']
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "tribal_area"
    df["state_fips"] = ""
    df["county_fips"] = ""
    df["tract_fips"] = ""
    # Keep the characters from position 9 (up to five) as the area code
    df["GEO_ID"] = df['fips'].str[9:14]
    df.drop(columns="fips", inplace=True)

    # Append to the running internet table
    internet = pd.concat([internet, df], ignore_index=True)

In [27]:
# Tribal-area pull from the 5-year subject tables for the vintages in `vinb` (2022 - 2019)
for yr in vinb:
    api_aian(yr, "acs/acs5/subject?")

In [28]:
#This is the function for tribal areas for internet (2017 - 2018)

def api_aian(year, dataset):
    """Download tribal-area subject-table (internet) values for one year and append them to ``internet``.

    This variant expects three trailing columns after the requested variables;
    the extra one is labelled ``R`` and discarded.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs5/subject?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``internet`` DataFrame with the new rows appended.
    """
    global internet

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, and for which geography
    predicates = {
        "get": ",".join(get_vars_sub),
        "for": "american indian area/alaska native area (reservation or statistical entity only):*",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position.
    # NOTE(review): the column labelled "fips" is sliced at [9:14] below, so it
    # presumably holds the long geographic identifier string - confirm against a response.
    col_names = col_names_sub + ["fips", 'GEO_ID', "R"]
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "tribal_area"
    df["state_fips"] = ""
    df["county_fips"] = ""
    df["tract_fips"] = ""
    # Keep the characters from position 9 (up to five) as the area code
    df["GEO_ID"] = df['fips'].str[9:14]
    # Drop the helper columns that are not part of the output layout
    df.drop(columns=['fips', 'R'], inplace=True)

    # Append to the running internet table
    internet = pd.concat([internet, df], ignore_index=True)

In [29]:
# Tribal-area pull from the 5-year subject tables for the vintages in `vine` (2018 - 2017)
for yr in vine:
    api_aian(yr, "acs/acs5/subject?")

In [30]:
#This is the function for tracts

def api_tract(year, dataset):
    """Download tract-level subject-table (internet) values for one year and append them to ``internet``.

    Args:
        year: Vintage as a string; used as a URL path segment and stored in ``year``.
        dataset: Dataset path below the API host, e.g. ``"acs/acs5/subject?"``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.

    Side effects:
        Rebinds the module-level ``internet`` DataFrame with the new rows appended.
    """
    global internet

    # Endpoint URL: <host>/<year>/<dataset>
    HOST = "https://api.census.gov/data"
    base_url = "/".join([HOST, year, dataset])

    # Query parameters: which variables to fetch, for every tract within the
    # explicitly listed state codes
    predicates = {
        "get": ",".join(get_vars_sub),
        "for": "tract:*",
        "in": "state:01,02,04,05,06,08,09,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,53,54,55,56,72",
    }

    r = requests.get(base_url, params=predicates)
    # Surface an API error as an HTTP error instead of an opaque JSON/column failure below
    r.raise_for_status()

    # Row 0 of the payload is skipped (presumably the header row); columns are
    # labelled by position, with the trailing columns named GEO_ID, state_fips,
    # county_fips and tract_fips.
    col_names = col_names_sub + ['GEO_ID', 'state_fips', 'county_fips', 'tract_fips']
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # Bookkeeping columns shared by every geography level in the output file
    df["year"] = year
    df["area_type"] = "tract"
    # Keep characters 9-19 of the identifier (presumably state + county + tract code)
    df["GEO_ID"] = df['GEO_ID'].str[9:20]

    # Append to the running internet table
    internet = pd.concat([internet, df], ignore_index=True)

In [31]:
# Tract pull from the 5-year subject tables for every vintage in `vinf`
for yr in vinf:
    api_tract(yr, "acs/acs5/subject?")

In [32]:
# Merge housing and internet data

# Columns that identify one geography in one year; both tables carry all of them.
merge_keys = ['Geo_name', 'GEO_ID', 'year', 'area_type', 'state_fips', 'county_fips', 'tract_fips']

# Left join: keep every housing row and attach internet measures where the keys match
Housing_trans_infra_measures = house.merge(
    internet,
    how='left',
    left_on=merge_keys,
    right_on=merge_keys,
)

In [33]:
# Write the merged table to the shared data folder as a CSV (header row, no index column)
output_csv = "/data/discover/Data/Infrastructure and Transportation/Housing_trans_infra_measures.csv"
Housing_trans_infra_measures.to_csv(
    output_csv,
    header=True,
    index=False,
)