In [52]:
import requests
import pandas as pd
import numpy as np
import functools 
import math
import json

In [53]:
# Load the economic data table. index_col=False tells pandas not to use the first
# column as the index, so the file's unnamed first column appears as 'Unnamed: 0'.
df = pd.read_csv('data/econ.csv', index_col=False)

In [54]:
# Preview the first five rows of the loaded table.
df.head()

Unnamed: 0,B19025_001E,B19025_001M,GEO_ID,NAME,B19101_001E,B19101_001M,B19101_002E,B19101_002M,B19101_003E,B19101_003M,...,S1701_C03_057E,S1701_C03_057M,S1701_C03_058E,S1701_C03_058M,S1701_C03_059E,S1701_C03_059M,S1701_C03_060E,S1701_C03_060M,S1701_C03_061E,S1701_C03_061M
0,819507400.0,33260066.0,0500000US36095,"Schoharie County, New York",7928,268,277,64,293,106,...,16.7,5.0,-888888888,-888888888,4.6,2.0,41.8,7.0,36.4,5.4
1,26615270000.0,281519926.0,0500000US36005,"Bronx County, New York",326296,2480,35225,1291,23720,1264,...,35.9,1.9,-888888888,-888888888,5.3,0.5,42.3,1.7,64.4,1.1
2,14232430000.0,188307367.0,0500000US36067,"Onondaga County, New York",113541,1115,5098,462,3249,353,...,15.5,1.8,-888888888,-888888888,2.6,0.5,39.2,2.3,43.9,1.6
3,1395835000.0,55793102.0,0500000US36035,"Fulton County, New York",14438,413,754,183,433,132,...,13.3,5.3,-888888888,-888888888,1.8,1.2,35.3,6.6,41.2,4.2
4,2072981000.0,57860345.0,0500000US36019,"Clinton County, New York",20084,563,973,212,828,234,...,11.8,4.2,-888888888,-888888888,4.2,2.4,49.6,6.0,41.2,4.0


In [55]:
# Load the lookup whose keys are output variable names and whose values are the
# lists of source variable codes combined by the aggregation loop further down.
with open('data/econ_meta_lookup.json', 'r') as f:
    meta_lookup = json.load(f)

In [56]:
# Peek at the first five keys of the lookup.
list(meta_lookup.keys())[:5]

['Pop25t29', 'F16pl', 'FPop0t5', 'CvNIPop2', 'MPop35t39']

### Formula:
The aggregation helpers below follow Chapter 8 of the 2018 ACS General Handbook:
https://www.census.gov/content/dam/Census/library/publications/2018/acs/acs_general_handbook_2018_ch08.pdf

In [42]:
# e --> estimate, m --> margin of error (MOE)
def get_e(e):
    """Return the aggregated estimate: the plain sum of the component estimates in ``e``.

    An empty sequence yields 0.
    """
    total = 0
    for estimate in e:
        total = total + estimate
    return total

def get_m(m):
    """Combine component margins of error into a single margin of error.

    The result is the square root of the sum of the squared values in ``m``.
    An empty sequence yields 0.0.
    """
    squared = [moe ** 2 for moe in m]
    return math.sqrt(sum(squared))

def get_c(e, m):
    """Return the coefficient of variation, in percent, for estimate ``e`` and margin of error ``m``.

    Returns an empty string when the estimate is zero, because the ratio is undefined.
    """
    if e != 0:
        # NOTE(review): 1.645 is presumably the 90 percent confidence z-score that
        # turns a margin of error into a standard error -- confirm against the
        # ACS handbook linked in this notebook.
        return m/1.645/e*100
    return ''

def get_p(e, agg_e):
    """Return estimate ``e`` as a percentage of the total estimate ``agg_e``.

    Returns an empty string when the total is zero, because the share is undefined.
    """
    if agg_e != 0:
        return e/agg_e*100
    return ''

def get_z(e, m, p, agg_e, agg_m):
    """Return the margin of error, in percentage points, of the share ``e / agg_e``.

    Parameters:
        e, m: estimate and margin of error of the part.
        p: the already-computed percentage of the part within the total.
        agg_e, agg_m: estimate and margin of error of the total.

    Returns an empty string when ``p`` is 0 or 100 or when ``agg_e`` is 0.
    Otherwise the value is sqrt(m^2 - (e*agg_m/agg_e)^2) / agg_e * 100; when the
    term under the root would be negative, the two squares are added instead.
    """
    if p == 0 or p == 100 or agg_e == 0:
        return ''

    part_sq = m**2
    scaled_total_sq = (e*agg_m/agg_e)**2

    radicand = part_sq - scaled_total_sq
    if radicand < 0:
        # Subtraction went negative: fall back to the additive form.
        radicand = part_sq + scaled_total_sq
    return math.sqrt(radicand)/agg_e*100

In [43]:
def find_total(variable, stat='E'):
    """Build the column name of the total that ``variable`` is a part of.

    The rule depends on the first character of ``variable``:
      * 'B' or 'C': first underscore-separated piece + '_001' + ``stat``
      * 'D':        first underscore-separated piece + '_0001' + ``stat``
      * otherwise:  first two pieces joined by '_' + '_001' + ``stat``
        (e.g. S1810_C01_001M)

    ``stat`` is the suffix appended to the name; it defaults to 'E'.
    """
    prefix = variable[0]
    parts = variable.split('_')
    if prefix in ('B', 'C'):
        return f"{parts[0]}_001{stat}"
    if prefix == 'D':
        return f"{parts[0]}_0001{stat}"
    return f"{'_'.join(parts[:2])}_001{stat}"

In [None]:
%%time
# For every output variable in the lookup, derive five columns on df:
#   <key>E  summed estimate            <key>M  combined margin of error
#   <key>C  coefficient of variation   <key>P  percent of the total
#   <key>Z  margin of error of the percent
for i, variables in meta_lookup.items():
    e_variables = [f'{name}E' for name in variables]
    m_variables = [f'{name}M' for name in variables]
    total_e = find_total(variables[0], 'E')
    total_m = find_total(variables[0], 'M')

    est_col, moe_col, cv_col = f'{i}E', f'{i}M', f'{i}C'
    pct_col, pct_moe_col = f'{i}P', f'{i}Z'

    df.loc[:, est_col] = df.apply(lambda row: get_e(row[e_variables].tolist()), axis=1)
    df.loc[:, moe_col] = df.apply(lambda row: get_m(row[m_variables].tolist()), axis=1)
    df.loc[:, cv_col] = df.apply(lambda row: get_c(row[est_col], row[moe_col]), axis=1)

    def row_percent(row):
        # Share of this aggregate within its table total, for one row.
        return get_p(row[est_col], row[total_e])

    def row_percent_moe(row):
        # Margin of error of that share, for one row.
        return get_z(row[est_col],
                     row[moe_col],
                     row[pct_col],
                     row[total_e],
                     row[total_m])

    is_single = len(variables) == 1

    published_pe = f'{variables[0]}PE'
    if is_single and published_pe in df.columns:
        # Where PE is already present for a record, take it directly;
        # calculate the percentage only for the records where it is missing.
        pe_missing = df[published_pe].isna()
        df.loc[pe_missing, pct_col] = df.loc[pe_missing, :].apply(row_percent, axis=1)
        df.loc[~pe_missing, pct_col] = df.loc[~pe_missing, published_pe]
    else:
        df.loc[:, pct_col] = df.apply(row_percent, axis=1)

    published_pm = f'{variables[0]}PM'
    if is_single and published_pm in df.columns:
        # Where PM is already present for a record, take it directly;
        # calculate the percent MOE only for the records where it is missing.
        pm_missing = df[published_pm].isna()
        df.loc[pm_missing, pct_moe_col] = df.loc[pm_missing, :].apply(row_percent_moe, axis=1)
        df.loc[~pm_missing, pct_moe_col] = df.loc[~pm_missing, published_pm]
    else:
        df.loc[:, pct_moe_col] = df.apply(row_percent_moe, axis=1)

In [38]:
# Columns to export: for each lookup key its E, M, P, Z and C columns (in that
# order), followed by the two identifier columns.
output_cols = [
    key + suffix
    for key in meta_lookup.keys()
    for suffix in ('E', 'M', 'P', 'Z', 'C')
] + ['GEO_ID', 'NAME']

Unnamed: 0,B01001_001E,B01001_001M,B01001_002E,B01001_002M,B01001_003E,B01001_003M,B01001_004E,B01001_004M,B01001_005E,B01001_005M,...,HspColE,HspColM,HspColC,HspColP,HspColZ,MPop85plE,MPop85plM,MPop85plC,MPop85plP,MPop85plZ
0,32127.0,1246.122386,14961.0,712.721545,694.0,178.555314,937.0,224.926655,1020.0,250.339769,...,,,,,,,,,,
1,57718.0,1775.077463,27570.0,1175.599422,2768.0,516.036820,1932.0,331.407604,1993.0,323.620148,...,,,,,,,,,,
2,29990.0,1432.393452,13655.0,964.579701,869.0,237.491052,1232.0,321.619340,1003.0,315.680218,...,,,,,,,,,,
3,38083.0,1482.243570,18710.0,1032.483898,1660.0,380.988189,1240.0,297.465964,1169.0,288.263768,...,,,,,,,,,,
4,35856.0,1580.290163,15986.0,1231.871341,1703.0,411.582313,1559.0,310.049996,1327.0,271.239746,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475,110156.0,2105.000000,49872.0,1306.000000,3231.0,406.000000,3187.0,414.000000,3103.0,397.000000,...,,,,,,,,,,
2476,165028.0,3738.000000,78678.0,2396.000000,5668.0,688.000000,4796.0,608.000000,4047.0,412.000000,...,,,,,,,,,,
2477,244302.0,3220.000000,114364.0,1921.000000,8609.0,519.000000,7039.0,586.000000,8091.0,615.000000,...,,,,,,,,,,
2478,138868.0,3101.000000,67901.0,1953.000000,4287.0,587.000000,3429.0,481.000000,4395.0,448.000000,...,,,,,,,,,,


#### Notes: 
For the following group (DP05), the API already provides a Percent Margin of Error (e.g. `DP05_0089PM`), so we may not need to calculate it ourselves:
```json
"DP05_0089PM": {
    "label": "Percent Margin of Error!!CITIZEN VOTING AGE POPULATION!!Citizen 18 and over population!!Female",
    "concept": "ACS DEMOGRAPHIC AND HOUSING ESTIMATES",
    "predicateType": "float",
    "group": "DP05",
    "limit": 0,
    "predicateOnly": true
},
```
https://api.census.gov/data/2017/acs/acs5/profile/groups/DP05.json

In [46]:
# Export only the derived columns plus the identifiers. The column list built
# earlier in this notebook is named `output_cols`; the previous `output` was
# never defined here and raised NameError on a fresh kernel.
# NOTE(review): the input is 'data/econ.csv' but the target is 'demo_final.csv' --
# confirm this path is intended and does not overwrite another notebook's output.
df[output_cols].to_csv('data/demo_final.csv', index=False)