In [40]:
from factfinder.calculate import Calculate
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd
from factfinder.median_moe import MedianMoe


In [41]:
##INPUTS -- change here
# Median variable to reproduce; used below to look up its ranges and design factor.
pff_variable = 'mdvl'
# Geography type passed to the calculation calls below.
geotype = 'CT20'
# GEOIDs used to filter the ratio table, the pivoted inputs and the final output.
census_geoid_list = ['36005018302']

In [42]:
# Render floats with thousands separators and 18 decimal places so no digits are hidden.
pd.set_option("display.float_format", "{:,.18f}".format)

In [43]:
# Load environment variables (e.g. API_KEY) from the project-level .env file.
# load_dotenv does not raise when the file is absent, so the previous bare
# `except:` never reported a missing file and would also have hidden any real
# error. Check for the file explicitly instead.
env_path = "../.env"
if os.path.isfile(env_path):
    load_dotenv(dotenv_path=env_path)
else:
    print(".env file is missing ...")

In [44]:
# Build the calculator; the API key is read from the API_KEY environment variable.
calculate = Calculate(
    api_key=os.environ["API_KEY"],
    year=2019,
    source="acs",
    geography="2010_to_2020",
)

In [45]:
# Display the ratio rows for the selected 2020 tracts with every digit visible
ratio = calculate.geo.ratio
ratio.loc[ratio["geoid_ct2020"].isin(census_geoid_list)]


Unnamed: 0,geoid_ct2010,geoid_ct2020,ratio
125,36005018302,36005018302,1.0


In [46]:
# Get ranges and design factor from metadata
# `ranges` is used below as a mapping: its keys are the input variable names and
# each value is indexed with [0] and [1] as the two bounds of that bin.
ranges = calculate.meta.median_ranges(pff_variable)
# Passed unchanged to MedianMoe for the margin-of-error calculation.
design_factor = calculate.meta.median_design_factor(pff_variable)
print(f"\nDesign factor: {design_factor}")


Design factor: 1.4


In [47]:
# Calculate inputs in 2020 geogs
# Requests every range variable of the median for the chosen geotype. Judging by
# the displayed output, the result is long-format: one row per
# (census_geoid, pff_variable) with columns e and m
# (presumably estimate and margin of error -- TODO confirm).
df = calculate.calculate_e_m_multiprocessing(list(ranges.keys()), geotype)
df.head()

Unnamed: 0,census_geoid,pff_variable,geotype,e,m
0,36005000100,ovlu10,CT20,0.0,0.0
1,36005000200,ovlu10,CT20,15.0,24.0
2,36005000400,ovlu10,CT20,0.0,0.0
3,36005001600,ovlu10,CT20,0.0,0.0
4,36005001901,ovlu10,CT20,0.0,


In [48]:
# 3. Reshape the selected tracts from long to wide: census_geoid becomes the
# index and every pff_variable becomes a column under the top-level "e" label,
# so df_pivoted.e is the dataframe of estimates. Values are rounded to 16 places.
df_pivoted = (
    df.loc[
        df["census_geoid"].isin(census_geoid_list),
        ["census_geoid", "pff_variable", "e"],
    ]
    .pivot(index="census_geoid", columns="pff_variable", values=["e"])
    .round(16)
)
df_pivoted

Unnamed: 0_level_0,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e,e
pff_variable,ov100t124,ov125t149,ov150t174,ov150t199m,ov175t199,ov1t149m,ov200t249,ov250t299,ov2milpl,ov300t399,...,ovl25t29,ovl30t34,ovl35t39,ovl40t49,ovl50t59,ovl60t69,ovl70t79,ovl80t89,ovl90t99,ovlu10
census_geoid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
36005018302,0.0,5.0,30.0,0.0,0.0,0.0,26.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Results frame: one row per selected geoid, labelled with the variable and geotype
results = pd.DataFrame().assign(
    census_geoid=df_pivoted.index,
    pff_variable=pff_variable,
    geotype=geotype,
)
results

Unnamed: 0,census_geoid,pff_variable,geotype
0,36005018302,mdvl,CT20


In [50]:
def get_median(ranges, row):
    """
    Estimate a median from binned counts, printing every intermediate value.

    Parameters
    ----------
    ranges : mapping of bin name -> [lower, upper]
        Its iteration order is taken as the order of the bins.
    row : pandas Series (or anything indexable the same way)
        Holds one count per bin name.

    Returns
    -------
    The upper bound of the first bin when half of the total is reached there,
    0.0 when nothing was accumulated, the lower bound of the last bin when
    half of the total is only reached there, and otherwise the interpolated
    value L + (N/2 - C) * W / F for the bin where the halfway point falls.
    """
    bin_names = list(ranges.keys())
    top = len(ranges.keys()) - 1
    total = row[bin_names].sum()
    print(f"\n\nN/2: {total/2}")

    # Walk the bins in order until the running count reaches half of the total.
    cumulative = 0
    hit = -1  # index of the last bin visited; stays -1 when no bin is visited
    for position, name in enumerate(bin_names):
        if not cumulative < total / 2:
            break
        print(f"\nRange i: {position}")
        cumulative += row[name]
        print(f"Cumulative frequency C: {cumulative}")
        hit = position

    if hit == 0:
        print("N/2 is in first range")
        median = list(ranges.values())[0][1]
    elif cumulative == 0.0:
        print("Cumulative frequency is 0")
        median = 0.0
    elif hit == top:
        print("N/2 is in top range")
        median = list(ranges.values())[-1][0]
    else:
        # Interpolate linearly inside the bin that contains the halfway point.
        name = bin_names[hit]
        print(f"\nN/2 is in range {hit}")
        print(f"Range {hit}:", ranges[name])
        below = cumulative - row[name]
        print(f"C_i-1: {below}")
        lower = ranges[name][0]
        print(f"L_i: {lower}")
        count = row[name]
        print(f"F_i: {count}")
        width = ranges[name][1] - ranges[name][0]
        print(f"W_i: {width}")
        median = lower + (total / 2 - below) * width / count
    print(f"Median: {median}")
    return median

In [51]:
# 4. calculate median estimation using get_median
# get_median is applied row by row to the range columns of the estimates.
# The row mask compares the pivot index with results.census_geoid position by
# position; results.census_geoid was built from that same index, so the mask
# appears to select every row -- NOTE(review): confirm this is intended.
results["e"] = (
    df_pivoted.e.loc[
        df_pivoted.e.index == results.census_geoid, list(ranges.keys())
    ]
    .apply(lambda row: get_median(ranges, row), axis=1)
    .to_list()
)



N/2: 35.0

Range i: 0
Cumulative frequency C: 0.0

Range i: 1
Cumulative frequency C: 0.0

Range i: 2
Cumulative frequency C: 0.0

Range i: 3
Cumulative frequency C: 0.0

Range i: 4
Cumulative frequency C: 0.0

Range i: 5
Cumulative frequency C: 0.0

Range i: 6
Cumulative frequency C: 0.0

Range i: 7
Cumulative frequency C: 0.0

Range i: 8
Cumulative frequency C: 0.0

Range i: 9
Cumulative frequency C: 0.0

Range i: 10
Cumulative frequency C: 0.0

Range i: 11
Cumulative frequency C: 0.0

Range i: 12
Cumulative frequency C: 0.0

Range i: 13
Cumulative frequency C: 0.0

Range i: 14
Cumulative frequency C: 5.0

Range i: 15
Cumulative frequency C: 35.0

N/2 is in range 15
Range 15: [150000, 174999]
C_i-1: 5.0
L_i: 150000
F_i: 30.0
W_i: 24999
Median: 174999.0


In [52]:
# 5. Calculate median moe using MedianMoe
# Note that the median moe calculation needs the median estimate, so the
# estimates in df_pivoted.e are copied into a separate dataframe and the
# median computed above is appended to that copy as column "e".
e = df_pivoted.e.copy()
e["e"] = results.loc[e.index == results.census_geoid, "e"].to_list()
results["m"] = (
    e.loc[e.index == results.census_geoid, list(ranges.keys()) + ["e"]]
    .apply(lambda row: MedianMoe(ranges, row, design_factor)(), axis=1)
    .to_list()
)

In [53]:
results.head()

Unnamed: 0,census_geoid,pff_variable,geotype,e,m
0,36005018302,mdvl,CT20,174999.0,68959.59320786831


In [54]:
# Perform full calculation (including cleaning/rounding) to show display output

full_calc = calculate(pff_variable, geotype)
full_calc.loc[full_calc.census_geoid.isin(census_geoid_list),:]

Unnamed: 0,census_geoid,labs_geoid,geotype,labs_geotype,pff_variable,c,e,m,p,z
120,36005018302,2018302,CT20,CT2020,mdvl,24.0,174999.0,68960.0,,
