In [66]:
from factfinder.calculate import Calculate
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd

In [None]:
## INPUTS -- change here, then restart the kernel and re-run every cell below
# NOTE(review): the saved outputs in this notebook show tract 36005018302, not the
# GEOID listed here, so they appear to come from an earlier run -- re-run to refresh.
# pff_variable: median variable to trace (passed to the median metadata lookups).
# geotype: geography type passed to the calculation calls.
# census_geoid_list: GEOID(s) used to filter every table that is displayed.
pff_variable = 'mdvl'
geotype = 'CT20'
census_geoid_list = ['36047035301']

In [68]:
# Render floats with thousands separators and 18 decimal places so that no
# digit of the intermediate values is hidden by display rounding.
pd.set_option("display.float_format", "{:,.18f}".format)

In [69]:
# load_dotenv() does not raise when the file is absent; it reports whether any
# variable was loaded through its boolean return value. Test that value instead
# of wrapping the call in a bare `except`, which could never detect a missing
# file and would have hidden unrelated errors.
# Note: the message is also shown when the file exists but defines no variables.
env_path = "../.env"
if not load_dotenv(dotenv_path=env_path):
    print(".env file is missing ...")

In [70]:
# Build the factfinder calculator used by the cells below.
# os.environ["API_KEY"] raises KeyError when the key was not loaded, so a
# missing key surfaces here rather than in a later request.
# NOTE(review): geography='2010_to_2020' presumably converts 2010-geography
# inputs to 2020 geographies -- confirm against the factfinder documentation.
calculate = Calculate(
        api_key=os.environ["API_KEY"], year=2019, source="acs", geography='2010_to_2020'
    )

In [71]:
# See all digits of ratio
# Print the column dtypes of the calculator's ratio table, then display the
# rows whose geoid_ct2020 is one of the selected GEOIDs.
ratio = calculate.geo.ratio
print(ratio.dtypes)
ratio.loc[ratio.geoid_ct2020.isin(census_geoid_list), :]


geoid_ct2010     object
geoid_ct2020     object
ratio           float64
dtype: object


Unnamed: 0,geoid_ct2010,geoid_ct2020,ratio
125,36005018302,36005018302,1.0


In [72]:
# Get ranges and design factor from metadata
# `ranges` maps each bin variable name to its [lower, upper] bounds; the key
# order is the bin order that get_median / get_median_moe walk through.
# `design_factor` is passed to get_median_moe as its DF argument.
ranges = calculate.meta.median_ranges(pff_variable)
print(ranges)
design_factor = calculate.meta.median_design_factor(pff_variable)
print(f"\nDesign factor: {design_factor}")

{'ovlu10': [0, 9999], 'ovl10t14': [10000, 14999], 'ovl15t19': [15000, 19999], 'ovl20t24': [20000, 24999], 'ovl25t29': [25000, 29999], 'ovl30t34': [30000, 34999], 'ovl35t39': [35000, 39999], 'ovl40t49': [40000, 49999], 'ovl50t59': [50000, 59999], 'ovl60t69': [60000, 69999], 'ovl70t79': [70000, 79999], 'ovl80t89': [80000, 89999], 'ovl90t99': [90000, 99999], 'ov100t124': [100000, 124999], 'ov125t149': [125000, 149999], 'ov150t174': [150000, 174999], 'ov175t199': [175000, 199999], 'ov200t249': [200000, 249999], 'ov250t299': [250000, 299999], 'ov300t399': [300000, 399999], 'ov400t499': [400000, 499999], 'ov500t749': [500000, 749999], 'ov750t999': [750000, 999999], 'ov1t149m': [1000000, 1499999], 'ov150t199m': [1500000, 1999999], 'ov2milpl': [2000000, 5000000]}

Design factor: 1.4


In [73]:
# Calculate inputs in 2020 geogs
# Request every bin variable named in `ranges` for the chosen geotype, then
# print the dtypes and the rows belonging to the selected GEOIDs.
# NOTE(review): `e` / `m` are presumably estimate and margin of error -- confirm.
df = calculate.calculate_e_m_multiprocessing(list(ranges.keys()), geotype)
print(df.dtypes)
print(df.loc[df.census_geoid.isin(census_geoid_list), :])

census_geoid     object
pff_variable     object
geotype          object
e               float64
m               float64
dtype: object
    census_geoid pff_variable geotype                     e  \
120  36005018302       ovlu10    CT20  0.000000000000000000   
120  36005018302     ovl10t14    CT20  0.000000000000000000   
120  36005018302     ovl15t19    CT20  0.000000000000000000   
120  36005018302     ovl20t24    CT20  0.000000000000000000   
120  36005018302     ovl25t29    CT20  0.000000000000000000   
120  36005018302     ovl30t34    CT20  0.000000000000000000   
120  36005018302     ovl35t39    CT20  0.000000000000000000   
120  36005018302     ovl40t49    CT20  0.000000000000000000   
120  36005018302     ovl50t59    CT20  0.000000000000000000   
120  36005018302     ovl60t69    CT20  0.000000000000000000   
120  36005018302     ovl70t79    CT20  0.000000000000000000   
120  36005018302     ovl80t89    CT20  0.000000000000000000   
120  36005018302     ovl90t99    CT20  0.000000

In [74]:
# 3. create a pivot table with census_geoid as the index, and
# pff_variable as column names. df_pivoted.e -> the estimation dataframe
# Only the selected GEOIDs are kept before pivoting. Because `values` is a
# list, the pivoted columns are two-level: ("e", <bin variable name>).
df_pivoted = df.loc[df.census_geoid.isin(census_geoid_list), ["census_geoid", "pff_variable", "e"]].pivot(
    index="census_geoid", columns="pff_variable", values=["e"]
)
print(df_pivoted.dtypes)
# Round to 16 decimal places; note this rebinds df_pivoted, so the unrounded
# pivot is no longer available after this line.
df_pivoted = df_pivoted.round(16)
print(df_pivoted)

   pff_variable
e  ov100t124       float64
   ov125t149       float64
   ov150t174       float64
   ov150t199m      float64
   ov175t199       float64
   ov1t149m        float64
   ov200t249       float64
   ov250t299       float64
   ov2milpl        float64
   ov300t399       float64
   ov400t499       float64
   ov500t749       float64
   ov750t999       float64
   ovl10t14        float64
   ovl15t19        float64
   ovl20t24        float64
   ovl25t29        float64
   ovl30t34        float64
   ovl35t39        float64
   ovl40t49        float64
   ovl50t59        float64
   ovl60t69        float64
   ovl70t79        float64
   ovl80t89        float64
   ovl90t99        float64
   ovlu10          float64
dtype: object
                                e                                             \
pff_variable            ov100t124            ov125t149             ov150t174   
census_geoid                                                                   
36005018302  0.0000000000000

In [75]:
# Seed the results table: one row per pivoted tract, labelled with the median
# variable and the geography type. The scalar labels are broadcast to every row.
results = pd.DataFrame(
    {
        "census_geoid": df_pivoted.index,
        "pff_variable": pff_variable,
        "geotype": geotype,
    }
)
results

Unnamed: 0,census_geoid,pff_variable,geotype
0,36005018302,mdvl,CT20


In [76]:
def get_median(ranges, row):
    """Estimate a median from binned counts, printing every intermediate value.

    Parameters
    ----------
    ranges : dict
        Maps each bin name to its ``[lower, upper]`` bounds. The key order is
        the order in which the bins are accumulated.
    row : pandas.Series
        Holds a count for every bin name in ``ranges``.

    Returns
    -------
    The upper bound of the first bin when accumulation stops on bin 0,
    ``0.0`` when nothing was accumulated, the lower bound of the last bin when
    accumulation stops there, and otherwise the linear interpolation
    ``L + (N/2 - C) * W / F`` inside the bin where the running count reaches N/2.
    """
    bin_names = list(ranges.keys())
    bin_bounds = list(ranges.values())
    last_bin = len(bin_names) - 1

    N = row[bin_names].sum()
    half_total = N / 2
    print(f"\n\nN/2: {half_total}")

    # Accumulate counts bin by bin until the running total reaches N/2.
    running = 0
    position = 0
    while running < half_total and position <= last_bin:
        print(f"\nRange i: {position}")
        running += row[bin_names[position]]
        print(f"Cumulative frequency C: {running}")
        position += 1
    # The loop has stepped one past the bin it stopped on.
    stop = position - 1

    if stop == 0:
        print("N/2 is in first range")
        median = bin_bounds[0][1]
    elif running == 0.0:
        print("Cumulative frequency is 0")
        median = 0.0
    elif stop == last_bin:
        print("N/2 is in top range")
        median = bin_bounds[-1][0]
    else:
        name = bin_names[stop]
        bounds = ranges[name]
        print(f"\nN/2 is in range {stop}")
        print(f"Range {stop}:", bounds)
        # Count accumulated strictly before the stopping bin.
        below = running - row[name]
        print(f"C_i-1: {below}")
        lower = bounds[0]
        print(f"L_i: {lower}")
        count = row[name]
        print(f"F_i: {count}")
        width = bounds[1] - bounds[0]
        print(f"W_i: {width}")
        median = lower + (half_total - below) * width / count
    print(f"Median: {median}")
    return median

In [77]:
def _first_bin_above(cumm_dist, threshold):
    """Return the index of the first cumulative percentage greater than ``threshold``.

    Raises ValueError when no entry exceeds the threshold.
    """
    return min(idx for idx, pct in enumerate(cumm_dist) if pct > threshold)


def _cumulative_below(cumm_dist, bin_idx):
    """Return the cumulative percentage below bin ``bin_idx`` (0.0 for the first bin).

    Indexing ``cumm_dist[bin_idx - 1]`` directly would wrap around to the last
    entry for bin 0, so the first bin is handled explicitly.
    """
    return cumm_dist[bin_idx - 1] if bin_idx > 0 else 0.0


def get_median_moe(ranges, row, DF=1.1):
    """Compute the margin of error of a binned median, printing each step.

    Parameters
    ----------
    ranges : dict
        Maps each bin name to its ``[lower, upper]`` bounds, in bin order.
    row : pandas.Series
        Holds a count for every bin name in ``ranges`` plus the median
        estimate under the key ``"e"``.
    DF : float, default 1.1
        Design factor multiplied into the standard error of the 50% proportion.

    Returns
    -------
    float
        Half the width of the interpolated interval between the
        ``50 - SE`` and ``50 + SE`` percentiles, multiplied by 1.645.
        ``np.nan`` is returned when the median is at or above the lower bound
        of the top bin, when the bin counts sum to zero, when the standard
        error is 50 or more, or when the lower bound falls in the top bin.

    Notes
    -----
    NOTE(review): the constants 93/7, 2500 and 1.645 are kept exactly as they
    were; they presumably follow the Census Bureau's median standard-error
    guidance (1.645 being the 90% z-score) -- confirm against that reference.
    """
    md = row["e"]
    print("\n\n=======")
    print(f"Median: {md}\n")
    if md >= list(ranges.values())[-1][0]:
        print("Median is above top bin lower value")
        return np.nan

    ordered = list(ranges.keys())
    top_bin = len(ordered) - 1

    B = row[ordered].sum()
    if B == 0:
        print("Size of base is zero")
        return np.nan

    # Cumulative share (in percent) of the base at the end of each bin.
    cumm_dist = list(np.cumsum(row[ordered]) / B * 100)
    print(f"Cumulative dist:\n {cumm_dist}")

    se_50 = DF * (((93 / (7 * B)) * 2500)) ** 0.5
    print(f"SE of 50%: {se_50}\n\n")
    if se_50 >= 50:
        # The percentile bounds would leave the 0-100 range.
        return np.nan

    p_lower = 50 - se_50
    print(f"p_lower: {p_lower}")
    p_upper = 50 + se_50
    print(f"p_upper: {p_upper}")

    lower_bin = _first_bin_above(cumm_dist, p_lower)
    print(f"Bin containing p_lower: {lower_bin}")
    upper_bin = _first_bin_above(cumm_dist, p_upper)
    print(f"Bin containing p_upper: {upper_bin}")

    if lower_bin >= top_bin:
        return np.nan

    if lower_bin == upper_bin:
        print("\nBoth bounds are in the same bin\n")
        A1 = min(ranges[ordered[lower_bin]])
        print(f"Smallest value in the bin: {A1}")
        # The lower bound of the next bin serves as this bin's upper edge.
        A2 = min(ranges[ordered[lower_bin + 1]])
        print(f"Largest value in the bin: {A2}")
        # Fix: for the first bin the share below it is 0.0, not cumm_dist[-1].
        C1 = _cumulative_below(cumm_dist, lower_bin)
        print(f"Cumulative percent of units less than smallest value: {C1}")
        C2 = cumm_dist[lower_bin]
        print(f"Cumulative percent of units less than largest value: {C2}")
        lowerbound = (p_lower - C1) * (A2 - A1) / (C2 - C1) + A1
        upperbound = (p_upper - C1) * (A2 - A1) / (C2 - C1) + A1
        print(f"Confidence interval: {lowerbound} to {upperbound}")
    else:
        print("\nBounds are in different bins\n")
        A1_l = min(ranges[ordered[lower_bin]])
        print(f"Smallest value in the lower bin: {A1_l}")
        A2_l = min(ranges[ordered[lower_bin + 1]])
        print(f"Largest value in the lower bin: {A2_l}")
        C1_l = _cumulative_below(cumm_dist, lower_bin)
        print(f"Cumulative percent of units less than lower bin smallest value: {C1_l}")
        C2_l = cumm_dist[lower_bin]
        print(f"Cumulative percent of units less than lower bin largest value: {C2_l}")

        upper_in_top_bin = upper_bin + 1 > top_bin
        if upper_in_top_bin:
            print("\nUpper bound is in top bin")
        else:
            print("\nUpper bound is below top bin")
        A1_u = min(ranges[ordered[upper_bin]])
        print(f"Smallest value in the upper bin: {A1_u}")
        # The top bin has no following bin, so its width collapses to zero and
        # the upper bound becomes the top bin's lower edge.
        A2_u = A1_u if upper_in_top_bin else min(ranges[ordered[upper_bin + 1]])
        print(f"Largest value in the upper bin: {A2_u}")
        C1_u = _cumulative_below(cumm_dist, upper_bin)
        print(f"Cumulative percent of units less than upper bin smallest value: {C1_u}")
        C2_u = cumm_dist[upper_bin]
        print(f"Cumulative percent of units less than upper bin largest value: {C2_u}")

        lowerbound = (p_lower - C1_l) * (A2_l - A1_l) / (C2_l - C1_l) + A1_l
        upperbound = (p_upper - C1_u) * (A2_u - A1_u) / (C2_u - C1_u) + A1_u
        print(f"Confidence interval: {lowerbound} to {upperbound}")

    moe = (upperbound - lowerbound) * 1.645 / 2
    print(f"MOE: {moe}")
    return moe


In [78]:
# 4. calculate median estimation using get_median
# Each pivoted row holds one tract's bin counts, with columns ordered as in
# `ranges`; get_median is applied per row and prints its intermediate values.
# NOTE(review): the row mask compares df_pivoted.e.index with
# results.census_geoid element by element, so it relies on both being in the
# same order and of the same length -- confirm.
results["e"] = (
    df_pivoted.e.loc[
        df_pivoted.e.index == results.census_geoid, list(ranges.keys())
    ]
    .apply(lambda row: get_median(ranges, row), axis=1)
    .to_list()
)



N/2: 35.0

Range i: 0
Cumulative frequency C: 0.0

Range i: 1
Cumulative frequency C: 0.0

Range i: 2
Cumulative frequency C: 0.0

Range i: 3
Cumulative frequency C: 0.0

Range i: 4
Cumulative frequency C: 0.0

Range i: 5
Cumulative frequency C: 0.0

Range i: 6
Cumulative frequency C: 0.0

Range i: 7
Cumulative frequency C: 0.0

Range i: 8
Cumulative frequency C: 0.0

Range i: 9
Cumulative frequency C: 0.0

Range i: 10
Cumulative frequency C: 0.0

Range i: 11
Cumulative frequency C: 0.0

Range i: 12
Cumulative frequency C: 0.0

Range i: 13
Cumulative frequency C: 0.0

Range i: 14
Cumulative frequency C: 5.0

Range i: 15
Cumulative frequency C: 35.0

N/2 is in range 16
Range 16: [175000, 199999]
C_i-1: 35.0
L_i: 175000
F_i: 0.0
W_i: 24999
Median: nan


  median = L + (N / 2 - C) * W / F


In [79]:
# 5. Calculate median moe using get_median_moe
# Note that the median moe calculation needs the median estimate, so the
# pivoted estimates (df_pivoted.e) are copied into a separate dataframe and the
# median computed in step 4 is appended to it as an extra "e" column.
# NOTE(review): both masks compare an index with results.census_geoid element
# by element, so they rely on matching order and length -- confirm.
e = df_pivoted.e.copy()
e["e"] = results.loc[e.index == results.census_geoid, "e"].to_list()
results["m"] = (
    e.loc[e.index == results.census_geoid, list(ranges.keys()) + ["e"]]
    .apply(lambda row: get_median_moe(ranges, row, design_factor), axis=1)
    .to_list()
)



Median: nan

Cumulative dist:
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.142857142857142, 50.0, 50.0, 87.14285714285714, 87.14285714285714, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]
SE of 50%: 30.49590136395381


p_lower: 19.50409863604619
p_upper: 80.4959013639538
Bin containing p_lower: 15
Bin containing p_upper: 17

Bounds are in different bins

Smallest value in the lower bin: 150000
Largest value in the lower bin: 175000
Cumulative percent of units less than lower bin smallest value: 7.142857142857142
Cumulative percent of units less than lower bin largest value: 50.0

Upper bound is below top bin
Smallest value in the upper bin: 200000
Largest value in the upper bin: 250000
Cumulative percent of units less than upper bin smallest value: 50.0
Cumulative percent of units less than upper bin largest value: 87.14285714285714
Confidence interval: 157210.72420436027 to 241052.17491301475
MOE: 68959.59320786831


In [80]:
# Show the column dtypes and the first rows of the step-by-step results
# (median estimate "e" and margin of error "m") for comparison with the full calculation.
print(results.dtypes)
results.head()

census_geoid     object
pff_variable     object
geotype          object
e               float64
m               float64
dtype: object


Unnamed: 0,census_geoid,pff_variable,geotype,e,m
0,36005018302,mdvl,CT20,,68959.59320786831


In [81]:
# Perform full calculation (including cleaning/rounding) to show display output

# Calling the calculator directly runs the complete calculation for the
# variable; print its dtypes and the rows for the selected GEOIDs so they can
# be compared with the step-by-step `results` table.
full_calc = calculate(pff_variable, geotype)
print(full_calc.dtypes)
print(full_calc.loc[full_calc.census_geoid.isin(census_geoid_list),:])

census_geoid     object
labs_geoid       object
geotype          object
labs_geotype     object
pff_variable     object
c               float64
e               float64
m               float64
p               float64
z               float64
dtype: object
    census_geoid labs_geoid geotype labs_geotype pff_variable  \
120  36005018302    2018302    CT20       CT2020         mdvl   

                        c                          e  \
120 21.000000000000000000 200,000.000000000000000000   

                            m   p   z  
120 68,960.000000000000000000 nan nan  
