### Explore how to extract data from the uscensus api

In [1]:
import requests
import pandas as pd
import json
import os

In [3]:
%load_ext dotenv
%dotenv

In [4]:
api_key=os.environ['API_KEY']

### Groups used: (demographics, social, economics, housing)
```
demo_groups = ['B01001', 'B05003', 'B02015', 'DP05', 'B03001']

soci_groups = ['B00002', 'B11002', 'B04006', 'B11009', 'B05005', 
                'C16001', 'B05006', 'DP02', 'B07204', 'S1810']
                
econ_groups = ['B00001', 'B19025', 'B00002', 'B19101', 'B01001', 
                'B19201', 'B08013', 'B19313', 'B17010', 'B20005', 
                'B17024', 'DP03', 'B19001', 'S1701']
                
hous_group = ['B00002', 'B25070', 'B25004', 'B25075', 
                'B25008', 'DP04', 'B25063']
```

In [5]:
demo_groups = ['B01001', 'B05003', 'B02015', 'DP05', 'B03001']

soci_groups = ['B00002', 'B11002', 'B04006', 'B11009', 'B05005', 
                'C16001', 'B05006', 'DP02', 'B07204', 'S1810']
                
econ_groups = ['B00001', 'B19025', 'B00002', 'B19101', 'B01001', 
                'B19201', 'B08013', 'B19313', 'B17010', 'B20005', 
                'B17024', 'DP03', 'B19001', 'S1701']
                
hous_groups = ['B00002', 'B25070', 'B25004', 'B25075', 'B25008',
                'DP04', 'B25063']

all_groups = demo_groups + soci_groups + econ_groups + hous_groups

In [6]:
groups = json.loads(requests.get('https://api.census.gov/data/2017/acs/acs5/groups.json').content)
profile_groups = json.loads(requests.get('https://api.census.gov/data/2017/acs/acs5/profile/groups.json').content)
cprofile_groups = json.loads(requests.get('https://api.census.gov/data/2017/acs/acs5/cprofile/groups.json').content)
subject_groups = json.loads(requests.get('https://api.census.gov/data/2017/acs/acs5/subject/groups.json').content)

In [7]:
meta = pd.DataFrame(groups['groups']+profile_groups['groups']+cprofile_groups['groups']+subject_groups['groups'])

In [8]:
meta['description'] = meta['description'].apply(lambda x: x.lower())

In [9]:
meta = meta[meta['name'].isin(all_groups)]

In [10]:
endpoint_lookup = {
    'B':'',
    'C':'', 
    'S':'/subject', 
    'D':'/profile'
}
meta['endpoint'] = meta['name'].apply(lambda x: endpoint_lookup.get(x.strip()[0]))

In [11]:
meta

Unnamed: 0,name,description,variables,endpoint
21,B17010,poverty status in the past 12 months of famili...,https://api.census.gov/data/2017/acs/acs5/grou...,
37,B07204,geographical mobility in the past year for cur...,https://api.census.gov/data/2017/acs/acs5/grou...,
68,B05003,sex by age by nativity and citizenship status,https://api.census.gov/data/2017/acs/acs5/grou...,
70,B05006,place of birth for the foreign-born population...,https://api.census.gov/data/2017/acs/acs5/grou...,
72,B05005,period of entry by nativity and citizenship st...,https://api.census.gov/data/2017/acs/acs5/grou...,
197,B19201,nonfamily household income in the past 12 mont...,https://api.census.gov/data/2017/acs/acs5/grou...,
200,B17024,age by ratio of income to poverty level in the...,https://api.census.gov/data/2017/acs/acs5/grou...,
218,B03001,hispanic or latino origin by specific origin,https://api.census.gov/data/2017/acs/acs5/grou...,
278,B25008,total population in occupied housing units by ...,https://api.census.gov/data/2017/acs/acs5/grou...,
284,B25004,vacancy status,https://api.census.gov/data/2017/acs/acs5/grou...,


#### Extract each group of data for different geographic levels:
Idea: given group and endpoint, extract all variables assosiated with that group

In [12]:
def get_tract(group, endpoint):
    frames = []
    for county in ['081', '085', '005', '047', '061']:
        url = f'https://api.census.gov/data/2017/acs/acs5{endpoint}?get=group({group})&for=tract:*&in=state:36&in=county:{county}&key={api_key}'
        resp = requests.request('GET', url).content
        df = pd.DataFrame(json.loads(resp)[1:])
        df.columns = json.loads(resp)[0]
        frames.append(df)
    return pd.concat(frames)

def get_puma(group, endpoint):
    url = f'https://api.census.gov/data/2017/acs/acs5{endpoint}?get=group({group})&for=Public Use Microdata Area:*&in=state:36&key={api_key}'
    resp = requests.request('GET', url).content
    df = pd.DataFrame(json.loads(resp)[1:])
    df.columns = json.loads(resp)[0]
    return df[df['NAME'].str.startswith('NYC')]


def get_borough(group, endpoint):
    url = f'https://api.census.gov/data/2017/acs/acs5{endpoint}?get=group({group})&for=county:*&in=state:36&key={api_key}'
    try:
        resp = requests.request('GET', url).content
        df = pd.DataFrame(json.loads(resp)[1:])
        df.columns = json.loads(resp)[0]
        return df
    except: 
        print(url)

def get_city(group,endpoint):
    url = f'https://api.census.gov/data/2017/acs/acs5{endpoint}?get=group({group})&for=place:51000&in=state:36&key={api_key}'
    resp = requests.request('GET', url).content
    df = pd.DataFrame(json.loads(resp)[1:])
    df.columns = json.loads(resp)[0]
    return df

In [45]:
get_tract('B19025', '').head()

Unnamed: 0,GEO_ID,B19025_001E,B19025_001M,NAME,B19025_001EA,B19025_001MA,state,county,tract
0,1400000US36081003700,-666666666,-222222222,"Census Tract 37, Queens County, New York",-,**,36,81,3700
1,1400000US36081009900,-666666666,-222222222,"Census Tract 99, Queens County, New York",-,**,36,81,9900
2,1400000US36081010701,-666666666,-222222222,"Census Tract 107.01, Queens County, New York",-,**,36,81,10701
3,1400000US36081017100,-666666666,-222222222,"Census Tract 171, Queens County, New York",-,**,36,81,17100
4,1400000US36081022900,-666666666,-222222222,"Census Tract 229, Queens County, New York",-,**,36,81,22900


In [46]:
get_city('B19025', '').head()

Unnamed: 0,GEO_ID,B19025_001E,B19025_001M,NAME,B19025_001EA,B19025_001MA,state,place
0,1600000US3651000,292860702600,2269561153,"New York city, New York",,,36,51000


In [47]:
get_puma('B19025', '').head()

Unnamed: 0,GEO_ID,B19025_001E,B19025_001M,NAME,B19025_001EA,B19025_001MA,state,public use microdata area
1,7950000US3604104,4359443900,130746875,"NYC-Queens Community District 11--Bayside, Dou...",,,36,4104
2,7950000US3604103,6459349600,168313957,"NYC-Queens Community District 7--Flushing, Mur...",,,36,4103
3,7950000US3604114,2967368300,105584826,NYC-Queens Community District 14--Far Rockaway...,,,36,4114
8,7950000US3603810,17200671100,560477049,NYC-Manhattan Community District 1 & 2--Batter...,,,36,3810
32,7950000US3604110,5169001200,103566191,"NYC-Queens Community District 5--Ridgewood, Gl...",,,36,4110


#### Given category (demographics, social, economics, housing), and method, create one big table (contianing all groups) for each geographic level

In [13]:
def create_table(group, getter):
    frames = []
    for i in group:
        df = getter(i, meta.loc[meta['name']==i]['endpoint'].values[0])
        frames.append(df)
    df = frames[0]
    df = df[df.columns.difference(['place', 'tract', 'state', 'county', 'public use microdata area'])]
    for i in frames[1:]:
        df = pd.merge(df, i[i.columns.difference(['state', 'county', 'tract', 'NAME', 'place', 'public use microdata area'])], 
                      left_on='GEO_ID', right_on='GEO_ID')
    df = df[[i for i in list(df.columns) if i[-1:] != 'A']]
    return df

In [49]:
create_table(hous_groups, get_borough).head()

Unnamed: 0,B00002_001E,GEO_ID,NAME,B25070_001E,B25070_001M,B25070_002E,B25070_002M,B25070_003E,B25070_003M,B25070_004E,...,B25063_023E,B25063_023M,B25063_024E,B25063_024M,B25063_025E,B25063_025M,B25063_026E,B25063_026M,B25063_027E,B25063_027M
0,2794,0500000US36095,"Schoharie County, New York",3068,262,140,66,247,78,301,...,7,12,14,19,0,24,0,24,375,90
1,32864,0500000US36005,"Bronx County, New York",397698,2130,11373,865,24163,1247,34584,...,12766,741,1812,288,701,191,433,135,10065,799
2,18944,0500000US36067,"Onondaga County, New York",65234,1115,2751,309,5631,584,8026,...,736,165,201,85,151,77,137,96,2027,253
3,2835,0500000US36035,"Fulton County, New York",6450,482,210,106,439,134,665,...,0,27,0,27,0,27,33,49,676,175
4,3319,0500000US36019,"Clinton County, New York",10138,424,508,170,1077,216,1031,...,26,26,46,47,23,20,0,27,720,175


#### Combine different levels of data, including city, borough, puma and census tracts

In [14]:
def create_big_table(group):
    frames = []
    frames.append(create_table(group, get_borough))
    frames.append(create_table(group, get_tract))
    frames.append(create_table(group, get_city))
    frames.append(create_table(group, get_puma))
    return pd.concat(frames)

In [15]:
demo_df = create_big_table(demo_groups)

In [16]:
demo_df.head()

Unnamed: 0,B01001_001E,B01001_001M,B01001_002E,B01001_002M,B01001_003E,B01001_003M,B01001_004E,B01001_004M,B01001_005E,B01001_005M,...,B03001_027E,B03001_027M,B03001_028E,B03001_028M,B03001_029E,B03001_029M,B03001_030E,B03001_030M,B03001_031E,B03001_031M
0,31611,-555555555,15700,61,711,25,843,84,777,85,...,68,33,31,23,13,14,0,24,24,20
1,1455846,-555555555,685636,195,55036,142,51974,1319,49368,1298,...,18154,1629,1698,383,1041,421,111,120,15304,1395
2,467669,-555555555,225685,101,13830,72,14245,516,14537,517,...,1833,321,649,223,412,229,0,27,772,209
3,53955,-555555555,26899,112,1474,59,1649,153,1626,168,...,172,111,24,24,39,33,0,27,109,104
4,81224,-555555555,41648,104,2056,79,2112,200,2062,196,...,287,83,71,57,4,6,0,27,212,65


In [17]:
demo_df.shape

(2285, 614)

In [18]:
hous_df = create_big_table(hous_groups)

In [19]:
hous_df.shape

(2285, 727)

In [20]:
soci_df = create_big_table(soci_groups)

In [21]:
soci_df.shape

(2285, 1761)

In [22]:
econ_df = create_big_table(econ_groups)

In [23]:
econ_df.shape

(2285, 1662)

In [24]:
demo_df.to_csv('data/demo.csv', index=False)
hous_df.to_csv('data/hous.csv', index=False)
soci_df.to_csv('data/soci.csv', index=False)
econ_df.to_csv('data/econ.csv', index=False)