In [60]:
import requests
import pandas as pd
import numpy as np
import json
import os

In [61]:
# Load the dotenv IPython extension, then run its %dotenv magic so that
# variables from a local .env file are available to the os.environ lookup below.
# NOTE(review): re-running this cell on a live kernel prints an
# "already loaded" notice; that message is harmless.
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [62]:
# Read the API key from the environment; raises KeyError if API_KEY is unset.
# NOTE(review): presumably a Census API key - it is not referenced by any of
# the requests in the visible cells; confirm whether it is still needed.
api_key=os.environ['API_KEY']

In [63]:
# ACS table ("group") identifiers to pull, organised by theme
# (presumably demographic / social / economic / housing, going by the names).
# Note: 'B01001' is listed under both demo_groups and econ_groups, so it
# appears twice in all_groups.
demo_groups = [
    'B01001', 'B05003', 'B02015', 'DP05', 'B03001',
]

soci_groups = [
    'B11002', 'B04006', 'B11009', 'B05005',
    'B05006', 'DP02', 'B07204', 'S1810',
]

econ_groups = [
    'B19025', 'B19101', 'B01001', 'B19201',
    'B08013', 'B19313', 'B17010', 'B20005',
    'B17024', 'DP03', 'B19001', 'S1701',
]

hous_groups = [
    'B25070', 'B25004', 'B25075', 'B25008',
    'DP04', 'B25063',
]

# Flat list of every requested group, in theme order.
all_groups = [
    *demo_groups,
    *soci_groups,
    *econ_groups,
    *hous_groups,
]

In [7]:
# Download the group (table) catalogues of the 2017 ACS 5-year endpoints:
# detailed tables, data profiles, comparison profiles and subject tables.
ACS5_2017_URL = 'https://api.census.gov/data/2017/acs/acs5'


def _fetch_json(url, timeout=30):
    """GET `url` and return its decoded JSON body.

    Raises requests.HTTPError on a non-2xx response instead of letting an
    error page surface later as a confusing JSON/KeyError, and bounds the
    wait with `timeout` (seconds) so a stalled connection cannot hang the
    notebook.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


groups = _fetch_json(f'{ACS5_2017_URL}/groups.json')
profile_groups = _fetch_json(f'{ACS5_2017_URL}/profile/groups.json')
cprofile_groups = _fetch_json(f'{ACS5_2017_URL}/cprofile/groups.json')
subject_groups = _fetch_json(f'{ACS5_2017_URL}/subject/groups.json')

In [8]:
# Stack the 'groups' records of all four catalogues into one table,
# keeping the catalogue order (detailed, profile, cprofile, subject).
catalogues = (groups, profile_groups, cprofile_groups, subject_groups)
meta = pd.DataFrame(
    [record for catalogue in catalogues for record in catalogue['groups']]
)

In [9]:
# Keep only the catalogue rows whose group name was requested in all_groups.
is_requested = meta['name'].isin(all_groups)
meta = meta.loc[is_requested]

In [11]:
# Preview the first rows of the filtered group catalogue.
meta.head()

Unnamed: 0,name,description,variables
21,B17010,POVERTY STATUS IN THE PAST 12 MONTHS OF FAMILI...,https://api.census.gov/data/2017/acs/acs5/grou...
37,B07204,GEOGRAPHICAL MOBILITY IN THE PAST YEAR FOR CUR...,https://api.census.gov/data/2017/acs/acs5/grou...
68,B05003,SEX BY AGE BY NATIVITY AND CITIZENSHIP STATUS,https://api.census.gov/data/2017/acs/acs5/grou...
70,B05006,PLACE OF BIRTH FOR THE FOREIGN-BORN POPULATION...,https://api.census.gov/data/2017/acs/acs5/grou...
72,B05005,PERIOD OF ENTRY BY NATIVITY AND CITIZENSHIP ST...,https://api.census.gov/data/2017/acs/acs5/grou...


#### Get variable definitions for each column

In [27]:
# Collect one record per variable across every selected group.
# Each row of `meta` carries a URL (column 'variables') whose JSON body has a
# 'variables' mapping of variable name -> definition dict.
all_var = []
for variables_url in meta['variables']:
    # Bound the wait and fail loudly on HTTP errors rather than trying to
    # parse an error page as JSON.
    response = requests.get(variables_url, timeout=30)
    response.raise_for_status()
    definitions = response.json()['variables']
    for var_name, definition in definitions.items():
        # Store the variable name alongside its definition; build a new dict
        # instead of mutating the parsed response in place.
        all_var.append({**definition, 'variable': var_name})

In [30]:
# One row per variable definition.
df_meta = pd.DataFrame(all_var)

# Drop every variable whose name ends in 'A'
# (presumably the annotation columns - TODO confirm).
ends_with_a = df_meta['variable'].str.endswith('A')
df_meta = df_meta.loc[~ends_with_a]

In [31]:
# Preview the first rows of the variable-definition table.
df_meta.head()

Unnamed: 0,label,predicateType,group,limit,predicateOnly,variable,concept
26,Margin of Error!!Total!!Income in the past 12 ...,int,B17010,0,True,B17010_020M,POVERTY STATUS IN THE PAST 12 MONTHS OF FAMILI...
27,Estimate!!Total!!Income in the past 12 months ...,int,B17010,0,True,B17010_020E,POVERTY STATUS IN THE PAST 12 MONTHS OF FAMILI...
28,Margin of Error!!Total!!Income in the past 12 ...,int,B17010,0,True,B17010_022M,POVERTY STATUS IN THE PAST 12 MONTHS OF FAMILI...
29,Estimate!!Total!!Income in the past 12 months ...,int,B17010,0,True,B17010_022E,POVERTY STATUS IN THE PAST 12 MONTHS OF FAMILI...
30,Margin of Error!!Total!!Income in the past 12 ...,int,B17010,0,True,B17010_021M,POVERTY STATUS IN THE PAST 12 MONTHS OF FAMILI...


In [40]:
# Derive `label1`: the label with the 'Margin of Error' / 'Estimate' prefixes
# and every '!' separator removed, so the estimate and margin rows of the same
# measure share one label.
# - `assign` returns a new frame, which avoids the SettingWithCopyWarning that
#   column assignment on a filtered frame can raise.
# - The vectorised `.str.replace(..., regex=False)` does literal replacement
#   and leaves a missing label as NaN instead of raising.
df_meta = df_meta.assign(
    label1=df_meta['label']
    .str.replace('Margin of Error', '', regex=False)
    .str.replace('Estimate', '', regex=False)
    .str.replace('!', '', regex=False)
)

In [43]:
# Persist the variable metadata (without the DataFrame index) to data/meta.csv.
# NOTE(review): presumably this file is the starting point for the manually
# edited data/meta_lookup.csv read in the next section - confirm.
df_meta.to_csv('data/meta.csv', index=False)

## Process manual lookup inputs

In [98]:
# Load the lookup table from data/meta_lookup.csv; the cells below use its
# 'variable', 'group', 'schema', 'schema2' and 'schema3' columns.
df = pd.read_csv('data/meta_lookup.csv')

In [99]:
# Discard the rows belonging to groups that are not part of the lookup.
excluded_groups = ['B00001', 'B00002', 'C16001']
df = df.loc[~df['group'].isin(excluded_groups)]

In [100]:
# Keep only the identifier columns and the three schema-label columns.
lookup_columns = ['variable', 'group', 'schema', 'schema2', 'schema3']
df = df[lookup_columns]

In [101]:
# Preview the trimmed lookup table.
df.head()

Unnamed: 0,variable,group,schema,schema2,schema3
0,B17010_020M,B17010,,,
2,B17010_022M,B17010,,,
4,B17010_021M,B17010,,,
5,B01001_001E,B01001,MdPop_3,Pop,Pop_6
6,B17010_024M,B17010,,,


In [102]:
# Split the lookup table into one frame per theme.
# A group listed under several themes (e.g. 'B01001', which is in both
# demo_groups and econ_groups) contributes its rows to each of them.
df_demo, df_econ, df_hous, df_soci = (
    df.loc[df['group'].isin(theme_groups)]
    for theme_groups in (demo_groups, econ_groups, hous_groups, soci_groups)
)

In [104]:
# Preview the rows selected for the demo_groups theme.
df_demo.head()

Unnamed: 0,variable,group,schema,schema2,schema3
5,B01001_001E,B01001,MdPop_3,Pop,Pop_6
7,B01001_002E,B01001,-,,
9,B01001_003E,B01001,MdPop0t4,MPop0t5,
11,B01001_004E,B01001,MdPop5t9,MPop5t9,
13,B01001_005E,B01001,MdPop10t14,MPop10t14,


In [126]:
def _collect_schema_lookup(df):
    """Map each schema label in `df` to the base names of its variables.

    A variable belongs to a label when the label appears in any of its
    'schema', 'schema2' or 'schema3' cells. Missing cells (NaN) and the '-'
    marker are ignored. The base name is the variable name without its final
    character (the E / M suffix in names such as 'B01001_001E').

    Returns a dict whose keys and value lists are both sorted, so the result
    is identical from run to run.
    """
    members = {}
    for column in ('schema', 'schema2', 'schema3'):
        for label, variable in zip(df[column], df['variable']):
            # pd.isna catches every NaN cell, including the distinct float
            # objects produced by an all-NaN column.
            if pd.isna(label) or label == '-':
                continue
            # A set de-duplicates base names shared by the E and M variables.
            members.setdefault(label, set()).add(variable[:-1])
    return {label: sorted(members[label]) for label in sorted(members, key=str)}


def create_lookup(df, name):
    """Write the schema-label -> variable lookup for `df` as JSON.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'variable', 'schema', 'schema2', 'schema3'.
    name : str
        Prefix of the output file, written to 'data/{name}_meta_lookup.json'.

    Returns
    -------
    None
        The result is only written to disk (UTF-8, 4-space indent).
    """
    lookup = _collect_schema_lookup(df)

    out_path = f'data/{name}_meta_lookup.json'
    # Create the output folder on a fresh checkout instead of failing in open().
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(lookup, f, ensure_ascii=False, indent=4)

In [128]:
# Write one lookup file per theme (data/<prefix>_meta_lookup.json).
theme_frames = (
    (df_demo, 'demo'),
    (df_hous, 'hous'),
    (df_soci, 'soci'),
    (df_econ, 'econ'),
)
for theme_frame, prefix in theme_frames:
    create_lookup(theme_frame, prefix)

In [52]:
# Distinct labels found in any of the three schema columns of the full table.
schema_columns = (df.schema, df.schema2, df.schema3)
all_schema = list(
    {label for column in schema_columns for label in column.tolist()}
)

In [53]:
# Drop the '-' marker and every missing value from the label list.
# A filter is used instead of list.remove() because remove() raises ValueError
# when the value is absent (so the cell could not be re-run) and only removes
# a single NaN object, leaving any other NaN instances behind.
all_schema = [label for label in all_schema if pd.notna(label) and label != '-']

In [54]:
# Peek at a few labels. all_schema was built from a set, so which labels show
# up in this slice is arbitrary and may differ between runs.
all_schema[1:5]

['OcHU4', 'EA_GrdPfD', 'SEAsia', 'C65plDVsn']

In [55]:
# Build the lookup for the full table: schema label -> base variable names.
# Labels and names are sorted so the resulting JSON is identical between runs.
lookup = {}
for schema_label in sorted(all_schema, key=str):
    # A variable belongs to the label if any of its three schema cells match.
    has_label = (
        (df['schema'] == schema_label)
        | (df['schema2'] == schema_label)
        | (df['schema3'] == schema_label)
    )
    # Drop the final character (the E / M suffix in names like 'B01001_001E')
    # and de-duplicate afterwards, so an estimate/margin pair yields one entry.
    lookup[schema_label] = sorted(
        {var_name[:-1] for var_name in df.loc[has_label, 'variable']}
    )

In [56]:
# Save the combined lookup as pretty-printed UTF-8 JSON
# (non-ASCII characters are written as-is rather than escaped).
with open('data/meta_lookup.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(lookup, ensure_ascii=False, indent=4))