In [1]:
import pandas as pd
import regex as re
import numpy as np
import censusdata as cd

In [4]:
def get_fips(row):
    """Build an identifier string from the geography stored in ``row['index']``.

    The object in ``row['index']`` must expose ``params()``, returning a
    sequence of pairs. The second element of each of the first three pairs is
    concatenated, in order, into one string.

    NOTE(review): presumably the three pairs are the state / county / tract
    levels of a ``censusgeo`` object, giving a FIPS code - confirm. This helper
    is not referenced in the cells visible here.

    Raises:
        IndexError: if ``params()`` yields fewer than three pairs.
    """
    pieces = []
    for level in range(3):
        # Element [1] of each (level_name, code) pair is the code itself.
        pieces.append(row['index'].params()[level][1])
    return ''.join(pieces)

# Mapping from ACS variable code (key) to the working column name (value).
# The keys are the variables requested from the Census API; the values are
# applied through DataFrame.rename right after the download. Entries are grouped
# by ACS table (the part of the code before the underscore).
req_vars = {
    # --- B02009 / B01003 / B01002: population and age ---
    "B02009_001E": "cnt_black",
    "B01003_001E": "pop",
    # --- B19001: household income brackets ---
    "B19001_016E": "150-200_income",
    # NOTE(review): in the ACS B19001 table, _017E appears to be the
    # "$200,000 or more" bracket, so the "250-*" label looks like a typo.
    # The string is kept as-is because later code indexes the column by it.
    "B19001_017E": "250-*_income",
    "B01002_001E": "med_age",
    # --- B25010: average household size (all / owner / renter) ---
    "B25010_001E": "ave_hh_size",
    "B25010_002E": "ave_owner_hh_size",
    "B25010_003E": "ave_renter_hh_size",
    # --- B25032: housing-unit counts (all / owner / renter) ---
    "B25032_001E": "tot_units_1",
    "B25032_002E": "tot_owner_units",
    "B25032_013E": "tot_renter_units",
    # --- B25008: population in occupied units (all / owner / renter) ---
    "B25008_001E": "tot_pop_occup",
    "B25008_002E": "owner_pop_occup",
    "B25008_003E": "renter_pop_occup",
    # --- B06011 / B19013 / B25031: medians (income, household income, rent) ---
    "B06011_001E": "med_income",
    "B19013_001E": "med_hh_income",
    "B25031_001E": "med_gross_rent",
    # --- B25041: second housing-unit total and its "5 or more" category ---
    # NOTE(review): B25041 is presumably the ACS "Bedrooms" table, in which
    # case "geq_5rooms" counts units with 5+ bedrooms - confirm.
    "B25041_001E": "tot_units_2",
    "B25041_007E": "geq_5rooms",
}

# Sentinel ("jam") values the Census API can return in place of an ACS figure
# that is unavailable or could not be computed. Previously only -666666666 was
# converted, so any of the other codes would have entered the medians and ratios
# below as if it were a real (hugely negative) number.
ACS_MISSING_CODES = [-666666666, -999999999, -888888888,
                     -555555555, -333333333, -222222222]

# Download the requested ACS 2019 5-year variables for every ZIP code tabulation
# area (wildcard '*', i.e. not restricted to a single state), rename them to the
# working names in req_vars, and move the censusgeo index into a regular column
# named 'index'.
cdata = cd.download('acs5', 2019,
                    cd.censusgeo([('zip code tabulation area', '*')]),
                    list(req_vars.keys())) \
    .rename(columns = req_vars) \
    .reset_index()

# Replace all missing-value sentinels with NaN. Only the downloaded value
# columns are touched, so the censusgeo objects held in 'index' are never
# compared against numbers.
value_cols = list(req_vars.values())
cdata[value_cols] = cdata[value_cols].replace(ACS_MISSING_CODES, np.nan)

# Keep only ZIPs with a strictly positive population; rows whose population is
# zero or NaN fail the comparison and are dropped as well.
cdata = cdata[cdata['pop'] > 0].reset_index(drop = True)

# The ZIP code is taken as the last five characters of the censusgeo name.
cdata['zip'] = cdata['index'].apply(lambda x: x.name).astype(str).str[-5:]

# Rename and calculate the final census metrics (all prefixed with 'cns')
cdata['cns_median_age'] = cdata['med_age']
cdata['cns_median_income'] = cdata['med_income']
cdata['cns_median_hh_inc'] = cdata['med_hh_income']
cdata['cns_median_rent'] = cdata['med_gross_rent']
cdata['cns_pop'] = cdata['pop']

# Share of 'geq_5rooms' in 'tot_units_2'.
# NOTE(review): the output name says "leq_5_units" while the numerator is the
# "geq_5" category - the column name is kept because it is written to the
# .dta file, but confirm which one is intended.
cdata['cns_leq_5_units'] = cdata['geq_5rooms']/cdata['tot_units_2']
# A positive numerator over a zero denominator yields +inf; treat it as missing.
cdata.loc[cdata['cns_leq_5_units'] == np.inf, 'cns_leq_5_units'] = np.nan

cdata['cns_black_ratio'] = cdata['cnt_black']/cdata['pop']
# NOTE(review): the numerator sums the two top B19001 income-bracket columns
# (presumably household counts) while the denominator is total population -
# confirm that a per-person denominator is intended.
cdata['cns_rich_ratio'] = (cdata['150-200_income'] + cdata['250-*_income'])/cdata['pop']
cdata['cns_renter_ratio'] = cdata['tot_renter_units']/cdata['tot_units_1']

# Select every column whose name contains 'cns', plus 'zip', and store to .dta
cdata = cdata[[col for col in cdata.columns if 'cns' in col] + ['zip']]
cdata.to_stata(r"..\Data\Intermediate\census.dta", version = 119)

In [54]:
# Download the total population (B01003_001E) of every ZIP code tabulation area
# from two ACS 5-year releases: 2014 and 2019. Only these two endpoints are
# fetched, not the years in between.
pop_frames = []
running_size = 0
for year in [2014, 2019]:
    pop_temp = cd.download('acs5', year,
                        cd.censusgeo([('zip code tabulation area', '*')]),
                        ['B01003_001E']) \
        .rename(columns = {'B01003_001E': 'population'}) \
        .reset_index()
    pop_temp['year'] = year

    # Collect the yearly frames and concatenate once after the loop:
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # accumulated frame on every iteration.
    pop_frames.append(pop_temp)
    running_size += len(pop_temp)

    # Same progress lines as before: "New" for the first year, "Append" after.
    tag = "New:\t" if len(pop_frames) == 1 else "Append:\t"
    print(tag, year, "\t\tSize: ", running_size)

# Stack the yearly frames in loop order (2014 rows first, then 2019 rows)
pop_df = pd.concat(pop_frames, ignore_index = True)

# Replace all missings
pop_df = pop_df.replace(-666666666.0, np.nan)

# Keep only ZIPs with a strictly positive population; zero and NaN populations
# fail the comparison and are dropped too (this also keeps np.log finite).
pop_df = pop_df[pop_df['population'] > 0].reset_index(drop = True)

# The ZIP code is taken as the last five characters of the censusgeo name.
pop_df['zip'] = pop_df['index'].apply(lambda x: x.name).astype(str).str[-5:]

# Calculate the change in log population between the ACS5-2014 and the ACS5-2019.
# Within each ZIP the rows are in ascending year order (concatenation order), so
# the grouped first difference is log(pop_2019) - log(pop_2014) on the 2019 row
# and NaN on the 2014 row. GroupBy.diff keeps the original row index, whereas
# groupby(...).apply(...) returns a group-keyed MultiIndex on pandas >= 2.0 and
# can no longer be assigned back as a column.
pop_df['log_chg'] = np.log(pop_df['population']).groupby(pop_df['zip']).diff()

# Keep only the relevant variables: one row per ZIP that has both years
pop_df \
    .loc[pop_df['log_chg'].notna(), ['zip', 'log_chg']] \
    .to_stata(r"..\Data\Intermediate\secular_pop_growth.dta", version = 119, write_index = False)

New:	 2014 		Size:  33120
Append:	 2019 		Size:  66240
