In [1]:
#Initialize
from factfinder.calculate import Calculate
from factfinder.median import Median
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd

# pd.options.display.float_format = "{:,.18f}".format

# Load settings (API_KEY is read below) from the .env file one directory up.
# load_dotenv() does not raise when the file is absent -- it simply returns
# False -- so check for the file explicitly rather than wrapping the call in
# a bare try/except that can never report the missing file and would hide
# unrelated errors.
env_path = "../.env"
if os.path.isfile(env_path):
    load_dotenv(dotenv_path=env_path)
else:
    print(".env file is missing ...")

# Calculator used by every cell below. os.environ["API_KEY"] raises KeyError
# if the key was neither loaded from .env nor already set in the environment.
calculate = Calculate(api_key=os.environ["API_KEY"], year=2010, source="acs", geography='2010_to_2020')

In [2]:
## INPUTS -- edit these values to inspect a different variable / geography
geotype = "CT20"
pff_variable = "mdhhinc"
census_geoid_list = ["36005043501"]

In [3]:
# Inspect the ratio rows whose 2020 tract geoid is one of the selected geoids
ratio = calculate.geo.ratio
ratio[ratio["geoid_ct2020"].isin(census_geoid_list)]


Unnamed: 0,geoid_ct2010,geoid_ct2020,ratio
349,36005043500,36005043501,1.0


In [4]:
# Get ranges and design factor from metadata for the selected pff_variable.
# `ranges` is used below both for its keys (the input variables to
# calculate) and as the first argument to Median; `design_factor` is passed
# to Median as well.
ranges = calculate.meta.median_ranges(pff_variable)
design_factor = calculate.meta.median_design_factor(pff_variable)
print(f"\nDesign factor: {design_factor}")


Design factor: 1.5


In [5]:
# Perform the e (estimate) and m (margin of error) median calculation by
# calling the calculate method for the selected variable and geotype
df = calculate.calculate_e_m_median(pff_variable, geotype)
df.head()


Unnamed: 0_level_0,census_geoid,pff_variable,geotype,e,m
census_geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
36005000100,36005000100,mdhhinc,CT20,0.0,
36005000200,36005000200,mdhhinc,CT20,59697.825301,16981.384701
36005000400,36005000400,mdhhinc,CT20,68407.334211,14254.802499
36005001600,36005001600,mdhhinc,CT20,30503.371528,5877.238287
36005001901,36005001901,mdhhinc,CT20,25164.440789,28494.309073


In [6]:
# Median result rows for the selected geoids only
df[df["census_geoid"].isin(census_geoid_list)]

Unnamed: 0_level_0,census_geoid,pff_variable,geotype,e,m
census_geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
36005043501,36005043501,mdhhinc,CT20,15208.291667,17961.681329


In [7]:
# Perform the full calculation (including cleaning/rounding) to show the
# values as they would be displayed, then keep only the selected geoids
full_calc = calculate(pff_variable, geotype)
full_calc[full_calc["census_geoid"].isin(census_geoid_list)]

Unnamed: 0_level_0,census_geoid,labs_geoid,geotype,labs_geotype,pff_variable,c,e,m,p,z
census_geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
36005043501,36005043501,2043501,CT20,CT2020,mdhhinc,71.8,15208.0,17962.0,,


In [8]:
# Calculate inputs in 2020 geogs: one calculation per key of `ranges`
# (the input variables behind the median), for the selected geotype.
# NOTE(review): this rebinds `df`, replacing the median result frame assigned
# in the earlier calculate_e_m_median cell; re-running that cell's follow-up
# filter after this one would operate on this frame instead.
df = calculate.calculate_e_m_multiprocessing(list(ranges.keys()), geotype)

In [9]:
# 3. Reshape the selected geoids into a pivot table: census_geoid is the
# index and each pff_variable becomes a column under the top-level "e"
# label, so df_pivoted.e is the frame of estimates. Values are rounded to
# 16 decimal places.
df_pivoted = (
    df[df["census_geoid"].isin(census_geoid_list)][["census_geoid", "pff_variable", "e"]]
    .pivot(index="census_geoid", columns="pff_variable", values=["e"])
    .round(16)
)
df_pivoted

Unnamed: 0_level_0,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e
pff_variable,mdhhi10t14,mdhhi15t19,mdhhi200pl,mdhhi20t24,mdhhi25t29,mdhhi30t34,mdhhi35t39,mdhhi40t44,mdhhi45t49,mdhhi50t59,mdhhi60t74,mdhhi75t99,mdhhiu10,mdhi100t124,mdhi125t149,mdhi150t199
census_geoid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
36005043501,0.0,12.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0


In [10]:
# Seed the results frame: one row per geoid in the pivot table, tagged with
# the variable and geotype being calculated
results = pd.DataFrame().assign(
    census_geoid=df_pivoted.index,
    pff_variable=pff_variable,
    geotype=geotype,
)
results

Unnamed: 0,census_geoid,pff_variable,geotype
0,36005043501,mdhhinc,CT20


In [11]:
import logging

# Let DEBUG records through the root logger so the intermediate values logged
# during the median calculation show up in the cell output. The first
# module-level logging call installs a default handler when none is
# configured, which is why the throwaway "test" record is emitted here.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

# 4. Calculate the median estimate with Median(...).median, one row per geoid.
# Rows are selected by geoid label, in `results` order, so the list assigned
# back lines up with the rows of `results`. (Comparing df_pivoted.e.index
# with the results.census_geoid Series yields a boolean Series on results'
# 0..n-1 index, which .loc cannot align with the geoid-indexed frame.)
results["e"] = (
    df_pivoted.e.loc[results["census_geoid"].to_list(), list(ranges.keys())]
    .apply(lambda row: Median(ranges, row, design_factor).median, axis=1)
    .to_list()
)

DEBUG:root:test
DEBUG:root:N/2 is in range [15000, 19999]
DEBUG:root:
C_1: Cumulative frequency up to bin below N/2: 20.0
L_2: Lower boundary of median group: 15000
F_2: Frequency within median group: 12.0
W_2: Width of median group: 4999
DEBUG:root:
MEDIAN: 15208.291666666666



In [12]:
# 5. Calculate the median MOE with Median(...).median_moe.
# Per the original note, the MOE calculation needs the median estimate, so
# the estimate from the previous step is added as an "e" column on a separate
# copy of the estimates frame (df_pivoted itself is left untouched).
e = df_pivoted.e.copy()
# Assigning a Series aligns on the index, so each geoid receives its own
# median regardless of row order.
e["e"] = results.set_index("census_geoid")["e"]
# Select rows by geoid label, in `results` order, so the list assigned back
# lines up with the rows of `results`.
results["m"] = (
    e.loc[results["census_geoid"].to_list(), list(ranges.keys()) + ["e"]]
    .apply(lambda row: Median(ranges, row, design_factor).median_moe, axis=1)
    .to_list()
)

DEBUG:root:N/2 is in range [15000, 19999]
DEBUG:root:
C_1: Cumulative frequency up to bin below N/2: 20.0
L_2: Lower boundary of median group: 15000
F_2: Frequency within median group: 12.0
W_2: Width of median group: 4999
DEBUG:root:
MEDIAN: 15208.291666666666

DEBUG:root:
            UPPER_BOUND:
            -----
            A1=20000, A2=25000, C1=78.04878048780488, C2=100.0
            
DEBUG:root:lower_bin in bottom bin
DEBUG:root:lower_bin not in bottom bin and is the first none-zero bin
DEBUG:root:
            LOWER_BOUND:
            -----
            A1=0, A2=10000, C1=0.0, C2=48.78048780487805
            
DEBUG:root:N/2 is in range [15000, 19999]
DEBUG:root:
C_1: Cumulative frequency up to bin below N/2: 20.0
L_2: Lower boundary of median group: 15000
F_2: Frequency within median group: 12.0
W_2: Width of median group: 4999
DEBUG:root:
MEDIAN: 15208.291666666666

DEBUG:root:
MEDIAN STATS:
-----
Median = 15208.291666666666
Median_MOE = 17961.681329225772
B = 41.0
se_50 = 42.6

In [13]:
# Final step-by-step result (e and m), for comparison with the
# calculate_e_m_median output shown earlier
results.head()

Unnamed: 0,census_geoid,pff_variable,geotype,e,m
0,36005043501,mdhhinc,CT20,15208.291667,17961.681329
