In [None]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from IPython.display import display


In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2


### Load official statistics:

**2018 ACS 1-Year Estimates Subject Tables :**

- [`S0101`](https://api.census.gov/data/2018/acs/acs1/subject/groups/S0101.html):
"Total population : SELECTED AGE CATEGORIES".

- [`S1901`](https://api.census.gov/data/2018/acs/acs1/subject/groups/S1901.html):
"Households : Total".

- [`S2301`](https://api.census.gov/data/2018/acs/acs1/subject/groups/S2301.html): 
"Labor Force Participation Rate".

- [`S2801`](https://api.census.gov/data/2018/acs/acs1/subject/groups/S2801.html):
"Has one or more types of computing devices : Smartphone".

**ACS Cartographic Boundary Shapefile:**

- [`cb_2018_us_county_500k.gdb`](https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.2018.html):
"Land area by county".

**2011-2015 5-Year ACS Commuting Flows (from Micro-data):**

- [`Table 1`](https://www.census.gov/data/tables/2015/demo/metro-micro/commuting-flows-2015.html): 
"Residence County to Workplace County Commuting Flows for the United States and Puerto Rico Sorted by Residence Geography: 5-Year ACS, 2011-2015".

**National Center for Health Statistics (NCHS) Urban-Rural Classification:**

- [`NCHSurbruralcodes`](https://www.cdc.gov/nchs/data_access/urban_rural.htm):
"NCHS Urban-Rural Classification Scheme for Counties : 2013 codes".

**Federal Highway Administration:**

[`National Household Travel Survey`](https://nhts.ornl.gov/person-miles):
"Person Miles of Travel"

**Pew Research:**

[`Mobile Phone Factsheet`](https://www.pewresearch.org/internet/fact-sheet/mobile/)


In [None]:
# Directory holding the cleaned census CSV extracts. File names are appended
# by plain string concatenation, so the trailing slash is required.
CENSUS_DATA_ROOT = './data/clean_census_data/'


In [None]:
# Each cleaned extract is read into its own frame; all paths are relative to
# CENSUS_DATA_ROOT (which already ends with a path separator).

# Population pyramid:
census_population = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSST1Y2018_S0101_population.csv")

# Employment:
census_employment = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSST1Y2018_S2301_employment.csv")

# Income:
census_income = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSST1Y2018_S2503_income.csv")

# Phone ownership:
census_internet = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSST1Y2018_S2801_internet.csv")

# County geography (land area):
census_geography = pd.read_csv(f"{CENSUS_DATA_ROOT}cb_2018_us_county_500k.csv")

# Commuting flows:
census_commuting = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSCommutingFlows.csv")

# Urban/rural classification:
census_urban_rural = pd.read_csv(f"{CENSUS_DATA_ROOT}NCHSURCodes2013_urbanrural.csv")


In [None]:
# Assemble one row per county by inner-joining every census extract on the
# shared `id` key. Because every join is an inner join, a county missing from
# any one source is dropped from `data`.

# Household-income bracket columns at or above $50,000 (the brackets below
# $50,000 are intentionally left out). Defined once so that the merge and the
# wealth metric below always use the same set of columns.
income_brackets_above_50k = [
    '$50,000 to $74,999',
    '$75,000 to $99,999',
    '$100,000 to $149,999',
    '$150,000 to $199,999',
    '$200,000 or more',
]

data = (
    # Get geography data:
    census_geography[['AFFGEOID', 'NAME', 'ALAND']]
    .rename(columns={
        'AFFGEOID': 'id',
        'NAME': 'county_name',
        'ALAND': 'area_square_meters',
    })
    # Get population data:
    .merge(census_population[['id', 'Total Population']], on='id')
    # Get employment data (all columns of the employment extract are kept):
    .merge(census_employment, on='id')
    # Get urban/rural data:
    .merge(
        census_urban_rural.rename(columns={
            '2013 code': 'urbanrural_code_2013',
            'type_2013': 'urbanrural_type_2013',
        })[['id', 'urbanrural_code_2013', 'urbanrural_type_2013']],
        on='id',
    )
    # Add commuting data:
    .merge(
        census_commuting[[
            'id',
            'State Name',
            'County Name',
            'number_work_in_county',
            'number_work_out_of_county',
        ]],
        on='id',
    )
    # Add income data:
    .merge(
        census_income[
            ['id']
            + income_brackets_above_50k
            + ['Median income (dollars)', 'Mean income (dollars)']
        ],
        on='id',
    )
    # Add phone ownership:
    .merge(census_internet[['id', 'smartphone_ownership']], on='id')
)

# Compute metrics:

# Sum the income brackets from the merged frame itself. Assigning a Series
# computed on `census_income` would be aligned on that frame's row index,
# which no longer corresponds to the rows of `data` after the inner joins
# above, so counties would silently receive another county's value (or NaN).
data['pct_above_income_50000'] = data[income_brackets_above_50k].sum(axis=1)

# Residents per square kilometre (area is stored in square metres).
data['pop_density'] = data['Total Population'] / (data['area_square_meters'] / 1e6)

# Share of workers whose workplace is outside their county of residence,
# expressed as a fraction in [0, 1]. The counts already form a ratio, so no
# further division by 100 is applied.
data['pct_work_out_of_county'] = data['number_work_out_of_county'] / (
    data['number_work_in_county'] + data['number_work_out_of_county']
)

# 'N' entries are treated as missing; the remaining values are divided by 100
# (presumably converting a published percentage into a fraction -- confirm
# against the source table).
data['Labor Force Participation Rate'] = (
    data['Labor Force Participation Rate'].replace('N', np.nan).astype(float) / 100
)

# Codes 1-3 are labelled 'urban'; every other code is labelled 'rural'.
data['urban_rural'] = np.where(
    data['urbanrural_code_2013'].isin([1, 2, 3]), 'urban', 'rural'
)

# Rename columns:
data = data.rename(columns={'id': 'county_id'})

data


### Build `location_attributes` table:


- For `population`, we used total population in age/gender breakdown (from ACS). 

- For `density`, we classified counties as `urban` if they had code 1, 2, or 3 in the NCHS data and `rural` if they had code 4, 5, or 6 (NCHS classification).

- For `wealth_rate`, we took the proportion of the population in households with an income of $50,000 or more (from ACS).

- For `employment_rate`, we used labor force participation rate (from ACS).


In [None]:
# Map the county-level column names in `data` onto the `location_attributes`
# schema. `location_county` is taken from the 'County Name' column rather than
# from 'county_name'.
column_renames = {
    'county_id': 'location_id',
    'County Name': 'location_county',
    'State Name': 'location_state',
    'Geographic Area Name': 'location_name',
    'Total Population': 'population',
    'urban_rural': 'density',
    'Labor Force Participation Rate': 'employment_rate',
    'pct_above_income_50000': 'wealth_rate',
}

# Output columns, in the order they appear in the final table.
location_columns = [
    'location_id',
    'location_county',
    'location_state',
    'location_name',
    'population',
    'density',
    'employment_rate',
    'wealth_rate',
]

# `rename` returns a new frame, so `data` itself is left untouched.
location_attributes = data.rename(columns=column_renames)[location_columns]
location_attributes


### Build `location_attributes` table:

- We calculated population density by using the county boundary shapefiles and used it to calibrate parameters of work/social/grocery travel distances.
