In [1]:
import requests
import pandas as pd
import numpy as np
import functools 
import math
import json

In [2]:
# Load the economic data table. index_col=False tells pandas not to use the
# first CSV column as the index, so the file's unnamed leading column is kept
# as an ordinary "Unnamed: 0" column (visible in the preview output).
df = pd.read_csv('../data/econ.csv', index_col=False)

In [3]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,B00001_001E,GEO_ID,NAME,B19025_001E,B19025_001M,B00002_001E,B19101_001E,B19101_001M,B19101_002E,B19101_002M,...,S1701_C03_057E,S1701_C03_057M,S1701_C03_058E,S1701_C03_058M,S1701_C03_059E,S1701_C03_059M,S1701_C03_060E,S1701_C03_060M,S1701_C03_061E,S1701_C03_061M
0,178502.0,0500000US36081,"Queens County, New York",63270660000.0,563876300.0,66895.0,525106,3428,21011,1074,...,27.3,1.2,-888888888,-888888888,3.4,0.3,34.5,1.2,50.0,1.0
1,34049.0,0500000US36085,"Richmond County, New York",15931290000.0,229209800.0,12571.0,123292,1275,6557,565,...,27.5,3.9,-888888888,-888888888,2.3,0.8,29.7,3.9,47.5,2.0
2,88079.0,0500000US36005,"Bronx County, New York",26615270000.0,281519900.0,32864.0,326296,2480,35225,1291,...,35.9,1.9,-888888888,-888888888,5.3,0.5,42.3,1.7,64.4,1.1
3,204504.0,0500000US36047,"Kings County, New York",76310580000.0,705539800.0,79792.0,585611,3176,37298,1130,...,42.6,1.3,-888888888,-888888888,3.1,0.2,32.1,0.9,62.9,0.7
4,79095.0,0500000US36061,"New York County, New York",110732900000.0,1685630000.0,39216.0,320700,3617,17678,1226,...,26.8,1.4,-888888888,-888888888,2.0,0.2,26.3,1.1,53.6,1.1


In [4]:
# Load the lookup that maps each derived indicator name to the list of
# source variable codes it is built from.
with open('../data/econ_meta_lookup.json', 'r') as f:
    meta_lookup = json.load(f)

In [5]:
# Peek at a few of the indicator names available in the lookup.
list(meta_lookup.keys())[:5]

['Pop25t29', 'F16pl', 'FPop0t5', 'CvNIPop2', 'MPop35t39']

In [6]:
# Example entry: an indicator maps to a list of base variable codes; the
# estimate / margin-of-error columns in df carry an extra "E" / "M" suffix.
meta_lookup['MdPop67t69']

['B01001_045', 'B01001_021']

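### Formulas
The functions below (summed estimates, combined margins of error, coefficients of variation, and percentages with their margins of error) follow Chapter 8 of the 2018 ACS General Handbook:
https://www.census.gov/content/dam/Census/library/publications/2018/acs/acs_general_handbook_2018_ch08.pdf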

In [17]:
# e --> estimate, m --> moe
def get_e(e):
    """Return the aggregated estimate for a group of component estimates.

    Parameters
    ----------
    e : iterable of numbers
        Component estimates to add together.

    Returns
    -------
    number
        The plain sum of the components (0 for an empty iterable).
    """
    aggregate = sum(e)
    return aggregate

def get_m(m):
    """Return the margin of error of a sum of estimates.

    The component margins of error are combined as the square root of the
    sum of their squares.

    Parameters
    ----------
    m : iterable of numbers
        Margins of error of the component estimates.

    Returns
    -------
    float
        The combined margin of error (0.0 for an empty iterable).
    """
    squared = [moe ** 2 for moe in m]
    return math.sqrt(sum(squared))

def get_c(e, m):
    """Return the coefficient of variation of an estimate, in percent.

    The margin of error is divided by 1.645 to obtain a standard error,
    which is then expressed as a percentage of the estimate.

    Parameters
    ----------
    e : number
        The estimate.
    m : number
        The margin of error of the estimate.

    Returns
    -------
    float or str
        ``m / 1.645 / e * 100``, or the empty string when ``e`` is zero
        (the ratio is undefined there).
    """
    return '' if e == 0 else m / 1.645 / e * 100

def get_p(e, agg_e):
    """Return an estimate as a percentage of its total.

    Parameters
    ----------
    e : number
        The estimate (numerator).
    agg_e : number
        The total estimate (denominator).

    Returns
    -------
    float or str
        ``e / agg_e * 100``, or the empty string when the total is zero.
    """
    if agg_e == 0:
        return ''
    share = e / agg_e
    return share * 100

def get_z(e, m, p, agg_e, agg_m):
    """Return the margin of error of a percentage, in percentage points.

    Parameters
    ----------
    e : number
        The estimate (numerator).
    m : number
        Margin of error of ``e``.
    p : number or str
        The percentage ``e`` represents of ``agg_e``; only compared against
        0 and 100.
    agg_e : number
        The total estimate (denominator).
    agg_m : number
        Margin of error of ``agg_e``.

    Returns
    -------
    float or str
        The empty string when ``p`` is 0 or 100 or when ``agg_e`` is 0.
        Otherwise ``sqrt(m**2 - (e*agg_m/agg_e)**2) / agg_e * 100``; when
        the value under the root would be negative, the two terms are added
        instead of subtracted.
    """
    if p == 0 or p == 100 or agg_e == 0:
        return ''

    share_term = (e * agg_m / agg_e) ** 2
    radicand = m ** 2 - share_term
    if radicand < 0:
        # A negative value cannot be rooted; fall back to the additive form.
        radicand = m ** 2 + share_term
    return math.sqrt(radicand) / agg_e * 100

In [18]:
def find_total(variable, stat='E'):
    """Build the column name of the "total" variable for a variable code.

    The rule depends on the first character of ``variable``:

    * ``B`` or ``C``: first ``_``-separated part + ``_001``
    * ``D``: first ``_``-separated part + ``_0001``
    * anything else (e.g. ``S1810_C01_001M``): first two parts + ``_001``

    Parameters
    ----------
    variable : str
        Variable code, e.g. ``'B19025_001'``.
    stat : str, default 'E'
        Suffix appended to the total's code (``'E'`` or ``'M'`` in this
        notebook).

    Returns
    -------
    str
        The total column name, e.g. ``'B19025_001E'``.
    """
    family = variable[0]
    parts = variable.split('_')
    table_id = parts[0]

    if family in ('B', 'C'):
        return f"{table_id}_001{stat}"
    if family == 'D':
        return f"{table_id}_0001{stat}"
    prefix = '_'.join(parts[:2])
    return f"{prefix}_001{stat}"

In [19]:
%%time
# Indicator to build and the source variable codes that feed it.
i = 'MdEFFTWrk'
variables = meta_lookup[i]

# Positional indices of each source variable's estimate ("E") and margin of
# error ("M") column, used to slice the raw value matrix.
all_columns = list(df.columns)
e_variables = [all_columns.index(f'{code}E') for code in variables]
m_variables = [all_columns.index(f'{code}M') for code in variables]

# Names of the total columns that go with the first source variable.
total_e, total_m = (find_total(variables[0], suffix) for suffix in ('E', 'M'))

# Snapshot of the frame's values as a NumPy array.
dff = df.values

CPU times: user 79.8 ms, sys: 39.3 ms, total: 119 ms
Wall time: 121 ms


In [21]:
%%time
# Build the derived columns for indicator `i`:
#   {i}E estimate, {i}M margin of error, {i}C coefficient of variation,
#   {i}P percent of total, {i}Z margin of error of the percent.
# Relies on `i`, `variables`, `e_variables`, `m_variables`, `total_e`,
# `total_m` and `dff` from the setup cell, and on the get_* helpers.

# Row-wise aggregation over the source estimate / MOE columns.
df.loc[:, f'{i}E'] = np.apply_along_axis(get_e, 1, dff[:, e_variables])
df.loc[:, f'{i}M'] = np.apply_along_axis(get_m, 1, dff[:, m_variables])

df.loc[:, f'{i}C'] = df.apply(lambda row: get_c(row[f'{i}E'], row[f'{i}M']), axis=1)

# Percent: compute it for every row first ...
df.loc[:, f'{i}P'] = df.apply(lambda row: get_p(row[f'{i}E'], row[total_e]), axis=1)

if len(variables) == 1 and f'{variables[0]}PE' in df.columns:
    # ... then, where the source already publishes a percent (PE), prefer it.
    # The row mask is essential: assigning a partial Series through
    # `df.loc[:, col]` aligns on the index and writes NaN into every row the
    # Series does not cover, which would wipe the computed values.
    has_published_pe = df[f'{variables[0]}PE'].notna()
    df.loc[has_published_pe, f'{i}P'] = df.loc[has_published_pe, f'{variables[0]}PE']

# Percent MOE: same approach. It is computed after {i}P is final because
# get_z reads the row's percent.
df.loc[:, f'{i}Z'] = df.apply(lambda row: get_z(row[f'{i}E'],
                                                row[f'{i}M'],
                                                row[f'{i}P'],
                                                row[total_e],
                                                row[total_m]), axis=1)

if len(variables) == 1 and f'{variables[0]}PM' in df.columns:
    # Where the source already publishes a percent MOE (PM), prefer it;
    # only the masked rows are overwritten.
    has_published_pm = df[f'{variables[0]}PM'].notna()
    df.loc[has_published_pm, f'{i}Z'] = df.loc[has_published_pm, f'{variables[0]}PM']

CPU times: user 443 ms, sys: 112 ms, total: 555 ms
Wall time: 574 ms
