In [24]:
from factfinder.calculate import Calculate
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd

In [25]:
# --- Inputs -----------------------------------------------------------------
# Median variable to trace, the geography type to compute it for, and the
# geoids whose intermediate values are shown in the cells below.
pff_variable = "mdhhinc"
geotype = "CT20"
census_geoid_list = [
    "36005001901",
    "36005001902",
]

In [26]:
# Show floats with 18 decimal places so no digits of the ratios are hidden.
pd.set_option("display.float_format", "{:,.18f}".format)

In [27]:
# Load environment variables (the API key is read from os.environ further down)
# from the .env file one directory up. load_dotenv does not raise when the
# file is absent, so the file is checked explicitly; genuine errors while
# loading are no longer hidden by a bare except.
env_path = "../.env"
if os.path.isfile(env_path):
    load_dotenv(dotenv_path=env_path)
else:
    print(".env file is missing ...")

In [28]:
# Calculator used by every cell below; the API key comes from the environment.
calculate = Calculate(
    api_key=os.environ["API_KEY"],
    year=2019,
    source="acs",
    geography="2010_to_2020",
)

In [29]:
# Show the conversion ratios for the selected 2020 tracts at full precision
ratio = calculate.geo.ratio
print(ratio.dtypes)
ratio[ratio["geoid_ct2020"].isin(census_geoid_list)]


geoid_ct2010     object
geoid_ct2020     object
ratio           float64
dtype: object


Unnamed: 0,geoid_ct2010,geoid_ct2020,ratio
4,36005001900,36005001901,0.245696400625978
5,36005001900,36005001902,0.754303599374022


In [30]:
# Pull the bin boundaries and the design factor for the median variable
# from the calculator's metadata, printing each as it is retrieved.
ranges = calculate.meta.median_ranges(pff_variable)
print(ranges)

design_factor = calculate.meta.median_design_factor(pff_variable)
print(f"\nDesign factor: {design_factor}")

{'mdhhiu10': [0, 9999], 'mdhhi10t14': [10000, 14999], 'mdhhi15t19': [15000, 19999], 'mdhhi20t24': [20000, 24999], 'mdhhi25t29': [25000, 29999], 'mdhhi30t34': [30000, 34999], 'mdhhi35t39': [35000, 39999], 'mdhhi40t44': [40000, 44999], 'mdhhi45t49': [45000, 49999], 'mdhhi50t59': [50000, 59999], 'mdhhi60t74': [60000, 74999], 'mdhhi75t99': [75000, 99999], 'mdhi100t124': [100000, 124999], 'mdhi125t149': [125000, 149999], 'mdhi150t199': [150000, 199999], 'mdhhi200pl': [200000, 9999999]}

Design factor: 1.5


In [31]:
# Compute estimates/MOEs for every bin variable of the median in the
# requested geography, then show only the rows for the selected geoids.
df = calculate.calculate_e_m_multiprocessing(list(ranges), geotype)
print(df.dtypes)
print(df[df["census_geoid"].isin(census_geoid_list)])

ConnectionError: None: Max retries exceeded with url: /data/2019/acs/acs5?get=NAME%2CB19001_002E%2CB19001_002M&for=tract%3A%2A&key=<REDACTED>&in=state%3A36+county%3A005 (Caused by None)

In [None]:
# 3. Pivot the selected geoids so that census_geoid is the index and each
# pff_variable becomes a column; df_pivoted.e is then the estimate table.
df_pivoted = (
    df.loc[
        df["census_geoid"].isin(census_geoid_list),
        ["census_geoid", "pff_variable", "e"],
    ]
    .pivot(index="census_geoid", columns="pff_variable", values=["e"])
)
print(df_pivoted.dtypes)

# Round to 16 decimal places before the values are used downstream
df_pivoted = df_pivoted.round(16)
print(df_pivoted)

In [None]:
# Results frame: one row per selected geoid, labelled with the variable and
# geography type; the estimate and MOE columns are added further down.
results = pd.DataFrame().assign(
    census_geoid=df_pivoted.index,
    pff_variable=pff_variable,
    geotype=geotype,
)
results

In [None]:
def get_median(ranges, row):
    """Interpolate a median from binned counts, printing every intermediate.

    Parameters
    ----------
    ranges : dict
        Bin name -> ``[lower, upper]``. The dict's iteration order is used as
        the bin order.
    row : pandas.Series
        Counts keyed by bin name.

    Returns
    -------
    The upper bound of the first bin when accumulation stops there; ``0.0``
    when the accumulated count is zero; the lower bound of the last bin when
    accumulation reaches it; otherwise ``L + (N / 2 - C) * W / F`` for the bin
    where the running count first exceeds ``N / 2``.
    """
    ordered = list(ranges)
    bounds = list(ranges.values())
    top = len(ordered) - 1

    N = row[ordered].sum()
    print(f"\n\nN/2: {N/2}")

    # Accumulate counts bin by bin until the running total passes N/2.
    # ``i`` ends up as the index of the last bin that was added (-1 if none).
    C = 0
    i = -1
    for candidate in range(len(ordered)):
        if not C <= N / 2:
            break
        i = candidate
        print(f"\nRange i: {i}")
        C += row[ordered[i]]
        print(f"Cumulative frequency C: {C}")

    if i == 0:
        print("N/2 is in first range")
        median = bounds[0][1]
        print(f"Median: {median}")
        return median

    if C == 0.0:
        print("Cumulative frequency is 0")
        median = 0.0
        print(f"Median: {median}")
        return median

    if i == top:
        print("N/2 is in top range")
        median = bounds[-1][0]
        print(f"Median: {median}")
        return median

    # Interior bin: linear interpolation within the bin holding N/2.
    print(f"\nN/2 is in range {i}")
    print(f"Range {i}:", ranges[ordered[i]])
    bin_bounds = ranges[ordered[i]]
    F = row[ordered[i]]
    C = C - F  # cumulative count below the bin
    print(f"C_i-1: {C}")
    L = bin_bounds[0]
    print(f"L_i: {L}")
    print(f"F_i: {F}")
    W = bin_bounds[1] - bin_bounds[0]
    print(f"W_i: {W}")
    median = L + (N / 2 - C) * W / F
    print(f"Median: {median}")
    return median

In [None]:
def get_median_moe(ranges, row, DF=1.1):
    """Approximate the margin of error of a binned median, printing each step.

    The standard error of a 50 percent proportion is computed from the base
    ``B`` (sum of all bin counts) and the design factor. The bounds
    ``50 - SE`` and ``50 + SE`` are located in the cumulative percent
    distribution and linearly interpolated to values; half the width of that
    interval times 1.645 is returned.

    Parameters
    ----------
    ranges : dict
        Bin name -> ``[lower, upper]``; iteration order is the bin order.
    row : pandas.Series
        Bin counts keyed by bin name, plus the median estimate under ``"e"``.
    DF : float, default 1.1
        Design factor applied to the standard error.

    Returns
    -------
    float
        The margin of error, or ``np.nan`` when it cannot be computed: median
        at or above the top bin's lower bound, zero base, standard error of
        50 or more, no bin exceeding a bound, or lower bound in the top bin.
    """
    md = row["e"]
    print("\n\n=======")
    print(f"Median: {md}\n")
    if md >= list(ranges.values())[-1][0]:
        print("Median is above top bin lower value")
        return np.nan

    ordered = list(ranges.keys())
    B = row[ordered].sum()
    if B == 0:
        print("Size of base is zero")
        return np.nan

    cumm_dist = list(np.cumsum(row[ordered]) / B * 100)
    print(f"Cumulative dist:\n {cumm_dist}")

    # NOTE(review): the 93/7 and 2500 constants are kept exactly as found;
    # confirm them against the published design-factor formula.
    se_50 = DF * (((93 / (7 * B)) * 2500)) ** 0.5
    print(f"SE of 50%: {se_50}\n\n")
    if se_50 >= 50:
        return np.nan

    p_lower = 50 - se_50
    print(f"p_lower: {p_lower}")
    p_upper = 50 + se_50
    print(f"p_upper: {p_upper}")

    def first_bin_above(pct):
        # Index of the first bin whose cumulative percent exceeds pct.
        return next(
            (idx for idx, cum in enumerate(cumm_dist) if cum > pct), None
        )

    def cumulative_below(bin_idx):
        # Cumulative percent below a bin. Nothing lies below the first bin,
        # so return 0 there; indexing with -1 would wrap around to the last
        # cumulative value and corrupt the interpolation.
        return cumm_dist[bin_idx - 1] if bin_idx > 0 else 0.0

    lower_bin = first_bin_above(p_lower)
    upper_bin = first_bin_above(p_upper)
    if lower_bin is None or upper_bin is None:
        # No bin exceeds the bound (e.g. float rounding of the last
        # cumulative value), so there is nothing to interpolate.
        return np.nan
    print(f"Bin containing p_lower: {lower_bin}")
    print(f"Bin containing p_upper: {upper_bin}")

    last_bin = len(ordered) - 1
    if lower_bin >= last_bin:
        return np.nan

    if lower_bin == upper_bin:
        print("\nBoth bounds are in the same bin\n")
        A1 = min(ranges[ordered[lower_bin]])
        print(f"Smallest value in the bin: {A1}")
        # The bin's upper edge is taken as the next bin's lower bound.
        A2 = min(ranges[ordered[lower_bin + 1]])
        print(f"Largest value in the bin: {A2}")
        C1 = cumulative_below(lower_bin)
        print(f"Cumulative percent of units less than smallest value: {C1}")
        C2 = cumm_dist[lower_bin]
        print(f"Cumulative percent of units less than largest value: {C2}")
        lowerbound = (p_lower - C1) * (A2 - A1) / (C2 - C1) + A1
        upperbound = (p_upper - C1) * (A2 - A1) / (C2 - C1) + A1
        print(f"Confidence interval: {lowerbound} to {upperbound}")
    else:
        print("\nBounds are in different bins\n")
        A1_l = min(ranges[ordered[lower_bin]])
        print(f"Smallest value in the lower bin: {A1_l}")
        A2_l = min(ranges[ordered[lower_bin + 1]])
        print(f"Largest value in the lower bin: {A2_l}")
        C1_l = cumulative_below(lower_bin)
        print(f"Cumulative percent of units less than lower bin smallest value: {C1_l}")
        C2_l = cumm_dist[lower_bin]
        print(f"Cumulative percent of units less than lower bin largest value: {C2_l}")

        upper_in_top_bin = upper_bin + 1 > last_bin
        if upper_in_top_bin:
            print("\nUpper bound is in top bin")
        else:
            print("\nUpper bound is below top bin")
        A1_u = min(ranges[ordered[upper_bin]])
        print(f"Smallest value in the upper bin: {A1_u}")
        # The top bin has no following bin, so its lower bound is reused as
        # the upper edge (the upper bound then collapses to that value).
        A2_u = A1_u if upper_in_top_bin else min(ranges[ordered[upper_bin + 1]])
        print(f"Largest value in the upper bin: {A2_u}")
        C1_u = cumulative_below(upper_bin)
        print(f"Cumulative percent of units less than upper bin smallest value: {C1_u}")
        C2_u = cumm_dist[upper_bin]
        print(f"Cumulative percent of units less than upper bin largest value: {C2_u}")

        lowerbound = (p_lower - C1_l) * (A2_l - A1_l) / (C2_l - C1_l) + A1_l
        upperbound = (p_upper - C1_u) * (A2_u - A1_u) / (C2_u - C1_u) + A1_u
        print(f"Confidence interval: {lowerbound} to {upperbound}")

    # Half the interval width scaled by 1.645 (z-score for a 90% level).
    moe = (upperbound - lowerbound) * 1.645 / 2
    print(f"MOE: {moe}")
    return moe


In [None]:
# 4. Estimate the median of every selected geoid with get_median,
# row by row over the bin columns of the estimate table.
results["e"] = (
    df_pivoted["e"]
    .loc[df_pivoted["e"].index == results.census_geoid, list(ranges)]
    .apply(lambda row: get_median(ranges, row), axis=1)
    .tolist()
)

In [None]:
# 5. Compute the median MOE with get_median_moe.
# The MOE calculation needs the median estimate, so work on a separate copy
# of the estimate table with the median appended as an extra "e" column.
e = df_pivoted["e"].copy()
e["e"] = results.loc[e.index == results.census_geoid, "e"].tolist()
results["m"] = (
    e.loc[e.index == results.census_geoid, [*ranges, "e"]]
    .apply(lambda row: get_median_moe(ranges, row, design_factor), axis=1)
    .tolist()
)

In [None]:
# Column types, then the first rows of the step-by-step results
print(results.dtypes)
results.head(n=5)

In [None]:
# Perform the full calculation (including cleaning/rounding) to show the
# output as displayed, for comparison with the step-by-step values above.
full_calc = calculate(pff_variable, geotype)
print(full_calc.dtypes)
print(full_calc[full_calc["census_geoid"].isin(census_geoid_list)])