###### Imports and Settings

In [7]:
import pandas as pd
import numpy as np
import requests
import pickle
from collections import deque
from functools import reduce
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 150)
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [8]:
import sys
sys.path.append("../../../Functions and Dictionaries") # Adds higher directory to python modules path
import geodict
GNRC = geodict.GNRC
KY = geodict.KY
censusplaces = geodict.censusplaces

# American Community Survey 2017-2021 5 Year Estimates Census Tracts Only

In [9]:
#read in API key
with open('api_keys.pkl', 'rb') as keys_file:
        keys_dict_2 = pickle.load(keys_file)

In [10]:
#variable containing Census API key
api_key = keys_dict_2['CENSUS']

## Read In Data Guide

In [11]:
dataguide = pd.read_csv('../../Data Guides/DATA GUIDE ACS 2020_2021 5YR.csv', dtype = str)
dataguide['ID'] = dataguide['ID'].astype(int)

In [12]:
dg1 = dataguide[dataguide['ID'].between(1, 46)]
dg2 = dataguide[dataguide['ID'].between(47, 92)]
dg3 = dataguide[dataguide['ID'].between(93, 138)]
dg4 = dataguide[dataguide['ID'].between(139, 184)]
dg5 = dataguide[dataguide['ID'].between(185, 230)]
dg6 = dataguide[dataguide['ID'].between(231, 276)]
dg7 = dataguide[dataguide['ID'].between(277, 322)]
dg8 = dataguide[dataguide['ID'].between(323, 368)]
dg9 = dataguide[dataguide['ID'].between(369, 414)]
dg10 = dataguide[dataguide['ID'].between(415, 460)]
dg11 = dataguide[dataguide['ID'].between(461, 506)]
dg12 = dataguide[dataguide['ID'].between(507, 552)]
dg13 = dataguide[dataguide['ID'].between(553, 598)]
dg14 = dataguide[dataguide['ID'].between(599, 644)]
dg15 = dataguide[dataguide['ID'].between(645, 690)]
dg16 = dataguide[dataguide['ID'].between(691, 736)]
dg17 = dataguide[dataguide['ID'].between(737, 782)]
dg18 = dataguide[dataguide['ID'].between(783, 828)]
dg19 = dataguide[dataguide['ID'].between(829, 874)]
dg20 = dataguide[dataguide['ID'].between(875, 920)]
dg21 = dataguide[dataguide['ID'].between(921, 966)]
dg22 = dataguide[dataguide['ID'].between(967, 1012)]
dg23 = dataguide[dataguide['ID'].between(1013, 1058)]
dg24 = dataguide[dataguide['ID'].between(1059, 1104)]
dg25 = dataguide[dataguide['ID'].between(1105, 1150)]
dg26 = dataguide[dataguide['ID'].between(1151, 1196)]
dg27 = dataguide[dataguide['ID'].between(1197, 1242)]
dg28 = dataguide[dataguide['ID'].between(1243, 1287)]
dg29 = dataguide[dataguide['ID'].between(1288, 1332)]
dg30 = dataguide[dataguide['ID'].between(1333, 1377)]
dg31 = dataguide[dataguide['ID'].between(1378, 1422)]
dg32 = dataguide[dataguide['ID'].between(1423, 1467)]
dg33 = dataguide[dataguide['ID'].between(1468, 1512)]
dg34 = dataguide[dataguide['ID'].between(1513, 1557)]
dg35 = dataguide[dataguide['ID'].between(1558, 1602)]
dg36 = dataguide[dataguide['ID'].between(1603, 1647)]
dg37 = dataguide[dataguide['ID'].between(1648, 1692)]
dg38 = dataguide[dataguide['ID'].between(1693, 1737)]
dg39 = dataguide[dataguide['ID'].between(1738, 1782)]
dg40 = dataguide[dataguide['ID'].between(1783, 1827)]
dg41 = dataguide[dataguide['ID'].between(1828, 1872)]
dg42 = dataguide[dataguide['ID'].between(1873, 1917)]
dg43 = dataguide[dataguide['ID'].between(1918, 1962)]
dg44 = dataguide[dataguide['ID'].between(1963, 2007)]
dg45 = dataguide[dataguide['ID'].between(2008, 2052)]
dg46 = dataguide[dataguide['ID'].between(2053, 2097)]
dg47 = dataguide[dataguide['ID'].between(2098, 2142)]
dg48 = dataguide[dataguide['ID'].between(2143, 2187)]
dg49 = dataguide[dataguide['ID'].between(2188, 2232)]
dg50 = dataguide[dataguide['ID'].between(2233, 2277)]
dg51 = dataguide[dataguide['ID'].between(2278, 2322)]
dg52 = dataguide[dataguide['ID'].between(2323, 2367)]
dg53 = dataguide[dataguide['ID'].between(2368, 2412)]
dg54 = dataguide[dataguide['ID'].between(2413, 2457)]
dg55 = dataguide[dataguide['ID'].between(2458, 2502)]
dg56 = dataguide[dataguide['ID'].between(2503, 2547)]
dg57 = dataguide[dataguide['ID'].between(2548, 2592)]
dg58 = dataguide[dataguide['ID'].between(2593, 2637)]
dg59 = dataguide[dataguide['ID'].between(2638, 2682)]
dg60 = dataguide[dataguide['ID'].between(2683, 2727)]
dg61 = dataguide[dataguide['ID'].between(2728, 2772)]
dg62 = dataguide[dataguide['ID'].between(2773, 2817)]
dg63 = dataguide[dataguide['ID'].between(2818, 2862)]
dg64 = dataguide[dataguide['ID'].between(2863, 2907)]
dg65 = dataguide[dataguide['ID'].between(2908, 2952)]
dg66 = dataguide[dataguide['ID'].between(2953, 2997)]
dg67 = dataguide[dataguide['ID'].between(2998, 3042)]

In [22]:
dfs = [dg7, dg8]

In [43]:
#url string and list parameters for column head and tail
url_str= 'https://api.census.gov/data/2021/acs/acs5?key='+api_key
head1 = 'NAME' 
head2 = 'GEO_ID'
tail_cols1 = 'StateFIPS'
tail_cols2 = 'CountyFIPS'
tail_cols3 = 'CensusTract'
tail_cols4 = 'BlockGroup'

In [44]:
#tract
results = []
for df in dfs:
    dataguide = df
    for col_name, col_data in df.items():
        var_list = list(dataguide['Variable']) #make variables list
        var_list = deque(var_list)
        var_list.appendleft(head2)
        var_list.appendleft(head1)
        col_list = list(dataguide['Column Name']) #make columns list
        col_list.append(tail_cols1)
        col_list.append(tail_cols2)
        col_list.append(tail_cols3)
        #col_list.append(tail_cols4)
        col_list = deque(col_list)
        col_list.appendleft(head2)
        col_list.appendleft(head1)
        predicates= {} #block groups Tennessee
        get_vars= var_list
        predicates["get"]= ",". join(get_vars)
        predicates["for"]= "tract:*"
        predicates["in"]= "state:47, county:*"
        data = requests.get(url_str, params = predicates)
        col_names = col_list
        df = pd.DataFrame(columns=col_names, data=data.json()[1:], dtype=str)
        results.append(df)
new_df = pd.concat(results, axis = 1)
new_df = new_df.transpose().reset_index(drop = False).drop_duplicates()
new_df.columns = new_df.iloc[0]
new_df = new_df.set_index('NAME').transpose().drop(columns = ['StateFIPS','CountyFIPS', 'CensusTract']).reset_index(drop = True)
new_df = new_df.rename_axis(None, axis = 1)
print('Okay Finished')

Okay Finished


In [45]:
new_df.head(2)

Unnamed: 0,NAME,GEO_ID,housingcost_%ownercost40to49.9_wmortgage,housingcost_%ownercost50+_wmortgage,housingcost_total%ownercostwomortgage_series,housingcost_%ownercost30to34.9_womortgage,housingcost_%ownercost35to39.9_womortgage,housingcost_%ownercost40to49.9_womortgage,housingcost_%ownercost50+_womortgage,housingcost_total_rent%hhincome_series,housingcost_%rentercost30to34.9,housingcost_%rentercost35to39.9,housingcost_%rentercost40to49.9,housingcost_%rentercost50+,housing_medianvalue,housingcost_medvalue_ownerocc,housingcost_medvalue_ownerocc_wmortgage,housingcost_medvalue_ownerocc_womortgage,housingcost_mediangrossrent_renteroccupied,structures_total_yearbuilt_series,structures_built2014orlater,structures_built2010to2013,structures_built2000to2009,structures_built1990to1999,structures_built1980to1989,structures_built1970to1979,structures_built1960to1969,structures_built1950to1959,structures_built1940to1949,structures_built1939orearlier,structures_medianyearbuilt,commute_total_meansoftransportationtowork_series,commute_cartruckvan,commute_cartruckvan_drovealone,commute_cartruckvan_carpooled,commute_cartruckvan_carpooled_2ppl,commute_cartruckvan_carpooled_3ppl,commute_cartruckvan_carpooled_4ormoreppl,commute_publictransportation,commute_publictransportation_bus,commute_publictransportation_subwayorelevatedrail,commute_publictransportation_longdistancetrainorcommuterrail,commute_publictransportation_lightrailstreetcarortrolley,commute_publictransportation_ferryboat,commute_bicycle,commute_walk,commute_taxicabmotorcycleother,commute_workedfromhome,vehicles_total_series2,vehicles_none,vehicles_one,vehicles_two,vehicles_three,vehicles_four,vehicles_fiveormore,foreignborn_total,fb_europe,fb_eur_northern,fb_eur_n_denmark,fb_eur_n_ireland,fb_eur_n_norway,fb_eur_n_sweden,fb_eur_n_uk,fb_eur_n_excluding england and scotland,fb_eur_n_england,fb_eur_n_scotland,fb_eur_n_other,fb_eur_western,fb_eur_w_austria,fb_eur_w_belgium,fb_eur_w_france,fb_eur_w_germany,fb_eur_w_netherlands,fb_eur_w_switzerland,fb_eur_w_other,fb_eur_southern,fb_eur_s_greece,fb_eur_s_italy,fb_eur_s_portugal,fb_eur_s_azoresislands,fb_eur_s_spain,fb_eur_s_other,fb_eur_eastern,fb_eur_e_albania,fb_eur_e_belarus,fb_eur_e_bosniaandherzegovina,fb_eur_e_bulgaria,fb_eur_e_croatia,fb_eur_e_czechoslovakia,fb_eur_e_hungary,fb_eur_e_latvia,fb_eur_e_lithuania,fb_eur_e_moldova,fb_eur_e_macedonia
0,"Census Tract 201, Anderson County, Tennessee",1400000US47001020100,17,12,183,0,6,0,0,1144,75,115,58,250,208000,208000,234700,159700,825,1839,45,37,164,126,187,229,133,254,648,16,1960,1516,1346,1154,192,192,0,0,0,0,0,0,0,0,0,24,27,119,1516,98,731,473,157,27,30,163,55,18,0,0,0,18,0,0,0,0,0,37,0,0,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Census Tract 202.01, Anderson County, Tennessee",1400000US47001020201,33,15,620,16,0,27,8,714,38,9,7,107,312400,312400,324400,297900,1096,2069,0,448,341,388,518,334,33,7,0,0,1994,2137,1807,1791,16,16,0,0,0,0,0,0,0,0,0,9,9,312,2137,0,367,707,745,231,87,659,148,8,0,8,0,0,0,0,0,0,0,108,0,0,0,92,0,16,0,0,0,0,0,0,0,0,32,0,0,0,0,0,0,18,0,0,0,0


In [46]:
data = new_df

In [47]:
data = data[['NAME', 'GEO_ID', 'commute_total_meansoftransportationtowork_series', 'commute_workedfromhome']]

In [48]:
data.set_index(['NAME', 'GEO_ID'], inplace = True)

In [49]:
cols = data.columns
data[cols] = data[cols].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[cols] = data[cols].astype(float)


In [50]:
data = data.reset_index(drop = False)

In [51]:
data.head()

Unnamed: 0,NAME,GEO_ID,commute_total_meansoftransportationtowork_series,commute_workedfromhome
0,"Census Tract 201, Anderson County, Tennessee",1400000US47001020100,1516.0,119.0
1,"Census Tract 202.01, Anderson County, Tennessee",1400000US47001020201,2137.0,312.0
2,"Census Tract 202.02, Anderson County, Tennessee",1400000US47001020202,1552.0,127.0
3,"Census Tract 203, Anderson County, Tennessee",1400000US47001020300,1680.0,51.0
4,"Census Tract 204, Anderson County, Tennessee",1400000US47001020400,1866.0,81.0


In [32]:
#url string and list parameters for column head and tail
url_str= 'https://api.census.gov/data/2017/acs/acs5?key='+api_key
head1 = 'NAME' 
head2 = 'GEO_ID'
tail_cols1 = 'StateFIPS'
tail_cols2 = 'CountyFIPS'
tail_cols3 = 'CensusTract'
tail_cols4 = 'BlockGroup'
#tract
results = []
for df in dfs:
    dataguide = df
    for col_name, col_data in df.items():
        var_list = list(dataguide['Variable']) #make variables list
        var_list = deque(var_list)
        var_list.appendleft(head2)
        var_list.appendleft(head1)
        col_list = list(dataguide['Column Name']) #make columns list
        col_list.append(tail_cols1)
        col_list.append(tail_cols2)
        col_list.append(tail_cols3)
        #col_list.append(tail_cols4)
        col_list = deque(col_list)
        col_list.appendleft(head2)
        col_list.appendleft(head1)
        predicates= {} #block groups Tennessee
        get_vars= var_list
        predicates["get"]= ",". join(get_vars)
        predicates["for"]= "tract:*"
        predicates["in"]= "state:47, county:*"
        data = requests.get(url_str, params = predicates)
        col_names = col_list
        df = pd.DataFrame(columns=col_names, data=data.json()[1:], dtype=str)
        results.append(df)
new_df = pd.concat(results, axis = 1)
new_df = new_df.transpose().reset_index(drop = False).drop_duplicates()
new_df.columns = new_df.iloc[0]
new_df = new_df.set_index('NAME').transpose().drop(columns = ['StateFIPS','CountyFIPS', 'CensusTract']).reset_index(drop = True)
new_df = new_df.rename_axis(None, axis = 1)
print('Okay Finished')

Okay Finished


In [34]:
seventeen = new_df

In [35]:
seventeen = seventeen[['NAME', 'GEO_ID', 'commute_total_meansoftransportationtowork_series', 'commute_workedfromhome']]

In [39]:
seventeen.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,commute_total_meansoftransportationtowork_series Seventeen,commute_workedfromhome Seventeen
NAME,GEO_ID,Unnamed: 2_level_1,Unnamed: 3_level_1
"Census Tract 105.02, Hamilton County, Tennessee",1400000US47065010502,1645,60
"Census Tract 117, Hamilton County, Tennessee",1400000US47065011700,2425,203
"Census Tract 104.32, Hamilton County, Tennessee",1400000US47065010432,3194,127
"Census Tract 104.12, Hamilton County, Tennessee",1400000US47065010412,2502,87
"Census Tract 114.42, Hamilton County, Tennessee",1400000US47065011442,1388,87


In [37]:
seventeen.set_index(['NAME', 'GEO_ID'], inplace = True)

In [38]:
seventeen = seventeen.add_suffix(' Seventeen')

In [40]:
seventeen = seventeen.reset_index(drop = False)

In [64]:
df = data.merge(seventeen, on = ['NAME', 'GEO_ID'])

In [65]:
df.set_index(['NAME', 'GEO_ID'], inplace = True)
cols = df.columns
df[cols] = df[cols].astype(float)

In [66]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,commute_total_meansoftransportationtowork_series,commute_workedfromhome,commute_total_meansoftransportationtowork_series Seventeen,commute_workedfromhome Seventeen
NAME,GEO_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Census Tract 201, Anderson County, Tennessee",1400000US47001020100,1516.0,119.0,1463.0,23.0
"Census Tract 202.01, Anderson County, Tennessee",1400000US47001020201,2137.0,312.0,1951.0,95.0
"Census Tract 202.02, Anderson County, Tennessee",1400000US47001020202,1552.0,127.0,1814.0,76.0
"Census Tract 203, Anderson County, Tennessee",1400000US47001020300,1680.0,51.0,1859.0,79.0
"Census Tract 204, Anderson County, Tennessee",1400000US47001020400,1866.0,81.0,1657.0,91.0


In [67]:
df['Percent Change in WFH'] = ((df['commute_workedfromhome'] - df['commute_workedfromhome Seventeen'])/df['commute_workedfromhome Seventeen'])*100

In [68]:
df['Real Change in WFH'] = df['commute_workedfromhome'] - df['commute_workedfromhome Seventeen']

In [69]:
df = df[['Percent Change in WFH', 'Real Change in WFH', 'commute_workedfromhome']]

In [70]:
df.reset_index(drop = False, inplace = True)

In [73]:
df.head()

Unnamed: 0,NAME,GEO_ID,Percent Change in WFH,Real Change in WFH,commute_workedfromhome
0,"Census Tract 201, Anderson County, Tennessee",47001020100,417.391304,96.0,119.0
1,"Census Tract 202.01, Anderson County, Tennessee",47001020201,228.421053,217.0,312.0
2,"Census Tract 202.02, Anderson County, Tennessee",47001020202,67.105263,51.0,127.0
3,"Census Tract 203, Anderson County, Tennessee",47001020300,-35.443038,-28.0,51.0
4,"Census Tract 204, Anderson County, Tennessee",47001020400,-10.989011,-10.0,81.0


In [72]:
df['GEO_ID'] = df['GEO_ID'].str[9:]

In [74]:
df.to_csv('wfh17_21.csv', index = False)