In [56]:
from factfinder.calculate import Calculate
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd

In [57]:
# Render floats with 18 decimal places so precision differences stay visible in printed frames.
pd.set_option("display.float_format", "{:,.18f}".format)

In [58]:
# Load environment variables (including API_KEY, read in a later cell) from the
# repository-level .env file.
# load_dotenv does not raise when the file is absent, so a try/except around it
# never reports a missing file; check for the file explicitly instead. Any
# genuine error raised while loading now surfaces instead of being swallowed.
env_path = "../.env"
if os.path.isfile(env_path):
    load_dotenv(dotenv_path=env_path)
else:
    print(".env file is missing ...")

In [59]:
# Calculator for 2019 ACS data using the 2010-to-2020 geography option.
# The API key must already be present in the environment.
calculate = Calculate(
    api_key=os.environ["API_KEY"],
    year=2019,
    source="acs",
    geography="2010_to_2020",
)

In [60]:
# Show every digit of the ratio column
pd.set_option("display.float_format", "{:,.18f}".format)

# Ratio table held by the calculator's geography helper; list its dtypes and
# display the rows for the two 2020 tracts under inspection.
ratio = calculate.geo.ratio
print(ratio.dtypes)
ratio[ratio["geoid_ct2020"].isin(["36005001901", "36005001902"])]


geoid_ct2010     object
geoid_ct2020     object
ratio           float64
dtype: object


Unnamed: 0,geoid_ct2010,geoid_ct2020,ratio
4,36005001900,36005001901,0.245696400625978
5,36005001900,36005001902,0.754303599374022


In [61]:
# Median variable and geography type examined in the rest of the notebook
pff_variable, geotype = "mdvl", "CT20"

# Pull the bin ranges and the design factor for this variable from the metadata
ranges = calculate.meta.median_ranges(pff_variable)
print(ranges)
design_factor = calculate.meta.median_design_factor(pff_variable)
print(f"\nDesign factor: {design_factor}")

{'ovlu10': [0, 9999], 'ovl10t14': [10000, 14999], 'ovl15t19': [15000, 19999], 'ovl20t24': [20000, 24999], 'ovl25t29': [25000, 29999], 'ovl30t34': [30000, 34999], 'ovl35t39': [35000, 39999], 'ovl40t49': [40000, 49999], 'ovl50t59': [50000, 59999], 'ovl60t69': [60000, 69999], 'ovl70t79': [70000, 79999], 'ovl80t89': [80000, 89999], 'ovl90t99': [90000, 99999], 'ov100t124': [100000, 124999], 'ov125t149': [125000, 149999], 'ov150t174': [150000, 174999], 'ov175t199': [175000, 199999], 'ov200t249': [200000, 249999], 'ov250t299': [250000, 299999], 'ov300t399': [300000, 399999], 'ov400t499': [400000, 499999], 'ov500t749': [500000, 749999], 'ov750t999': [750000, 999999], 'ov1t149m': [1000000, 1499999], 'ov150t199m': [1500000, 1999999], 'ov2milpl': [2000000, 5000000]}

Design factor: 1.4


In [62]:
# Compute the input (per-bin) variables in 2020 geographies, then show the
# dtypes and the rows for the two tracts under inspection.
df = calculate.calculate_e_m_multiprocessing(list(ranges), geotype)
print(df.dtypes)
print(df[df["census_geoid"].isin(["36005001901", "36005001902"])])

census_geoid     object
pff_variable     object
geotype          object
e               float64
m               float64
dtype: object
  census_geoid pff_variable geotype                     e  \
4  36005001901       ovlu10    CT20  0.000000000000000000   
5  36005001902       ovlu10    CT20  0.000000000000000000   
4  36005001901     ovl10t14    CT20  0.000000000000000000   
5  36005001902     ovl10t14    CT20  0.000000000000000000   
4  36005001901     ovl15t19    CT20  0.000000000000000000   
5  36005001902     ovl15t19    CT20  0.000000000000000000   
4  36005001901     ovl20t24    CT20  0.000000000000000000   
5  36005001902     ovl20t24    CT20  0.000000000000000000   
4  36005001901     ovl25t29    CT20  0.000000000000000000   
5  36005001902     ovl25t29    CT20  0.000000000000000000   
4  36005001901     ovl30t34    CT20  0.000000000000000000   
5  36005001902     ovl30t34    CT20  0.000000000000000000   
4  36005001901     ovl35t39    CT20  0.000000000000000000   
5  360050019

In [63]:
# 3. Pivot the two tracts of interest into wide form: one row per census_geoid,
# one column per pff_variable. df_pivoted.e is the estimate dataframe.
df_pivoted = (
    df.loc[
        df["census_geoid"].isin(["36005001901", "36005001902"]),
        ["census_geoid", "pff_variable", "e"],
    ]
    .pivot(index="census_geoid", columns="pff_variable", values=["e"])
)
print(df_pivoted.dtypes)
print(df_pivoted)

   pff_variable
e  ov100t124       float64
   ov125t149       float64
   ov150t174       float64
   ov150t199m      float64
   ov175t199       float64
   ov1t149m        float64
   ov200t249       float64
   ov250t299       float64
   ov2milpl        float64
   ov300t399       float64
   ov400t499       float64
   ov500t749       float64
   ov750t999       float64
   ovl10t14        float64
   ovl15t19        float64
   ovl20t24        float64
   ovl25t29        float64
   ovl30t34        float64
   ovl35t39        float64
   ovl40t49        float64
   ovl50t59        float64
   ovl60t69        float64
   ovl70t79        float64
   ovl80t89        float64
   ovl90t99        float64
   ovlu10          float64
dtype: object
                                e                                            \
pff_variable            ov100t124            ov125t149            ov150t174   
census_geoid                                                                  
36005001901  0.0000000000000000

In [64]:
# Frame that will hold the results: one row per pivoted geoid, labelled with
# the median variable and geography type being calculated.
results = pd.DataFrame({"census_geoid": df_pivoted.index})
results["pff_variable"] = pff_variable
results["geotype"] = geotype
results

Unnamed: 0,census_geoid,pff_variable,geotype
0,36005001901,mdvl,CT20
1,36005001902,mdvl,CT20


In [73]:
def get_median(ranges, row):
    """Estimate a median from binned counts, printing every intermediate value.

    Args:
        ranges: Mapping of bin name -> [lower, upper] bounds. Bins are visited
            in the mapping's iteration order.
        row: Per-bin counts, indexable both by a single bin name and by a list
            of bin names (e.g. a pandas Series).

    Returns:
        - the upper bound of the first bin if the scan stops on the first bin;
        - 0 if the accumulated (truncated) count is 0;
        - the lower bound of the last bin if the scan stops on the last bin;
        - otherwise ``L + (N/2 - C) * W / F`` for the bin where the scan
          stopped, with L the bin's lower bound, W its width, F its count and
          C the cumulative count of the bins before it.
    """
    bin_names = list(ranges.keys())
    last_bin = len(bin_names) - 1
    N = row[bin_names].sum()
    half_total = N / 2
    print(f"\n\nN/2: {half_total}")

    # Accumulate counts bin by bin until the running total exceeds N/2 or the
    # bins are exhausted; idx ends on the last bin that was added.
    # NOTE(review): each count is truncated with int() while N keeps its
    # fractional part -- confirm this is intended for fractional inputs.
    cumulative = 0
    idx = -1
    while cumulative <= half_total and idx < last_bin:
        idx += 1
        print(f"\nRange i: {idx}")
        cumulative += int(row[bin_names[idx]])
        print(f"Cumulative frequency C: {cumulative}")

    if idx == 0:
        print("N/2 is in first range")
        median = ranges[bin_names[0]][1]
    elif cumulative == 0:
        print("Cumulative frequency is 0")
        median = 0
    elif idx == last_bin:
        print("N/2 is in top range")
        median = ranges[bin_names[-1]][0]
    else:
        # Interpolate linearly inside the bin where the scan stopped.
        print(f"\nN/2 is in range {idx}")
        name = bin_names[idx]
        print(f"Range {idx}:", ranges[name])
        bin_count = int(row[name])
        below = cumulative - bin_count
        print(f"C_i-1: {below}")
        lower = ranges[name][0]
        print(f"L_i: {lower}")
        print(f"F_i: {bin_count}")
        width = ranges[name][1] - ranges[name][0]
        print(f"W_i: {width}")
        median = lower + (half_total - below) * width / bin_count

    print(f"Median: {median}")
    return median

In [74]:
# 4. Calculate the median estimate for each tract with get_median.
# The row mask compares the pivot index with results.census_geoid element by
# element; the bin columns are taken in the order given by `ranges`.
results["e"] = (
    df_pivoted["e"]
    .loc[df_pivoted["e"].index == results["census_geoid"], list(ranges)]
    .apply(lambda bin_counts: get_median(ranges, bin_counts), axis=1)
    .to_list()
)



N/2: 14.004694835680745

Range i: 0
Cumulative frequency C: 0

Range i: 1
Cumulative frequency C: 0

Range i: 2
Cumulative frequency C: 0

Range i: 3
Cumulative frequency C: 0

Range i: 4
Cumulative frequency C: 0

Range i: 5
Cumulative frequency C: 0

Range i: 6
Cumulative frequency C: 0

Range i: 7
Cumulative frequency C: 0

Range i: 8
Cumulative frequency C: 0

Range i: 9
Cumulative frequency C: 0

Range i: 10
Cumulative frequency C: 0

Range i: 11
Cumulative frequency C: 0

Range i: 12
Cumulative frequency C: 0

Range i: 13
Cumulative frequency C: 0

Range i: 14
Cumulative frequency C: 0

Range i: 15
Cumulative frequency C: 0

Range i: 16
Cumulative frequency C: 0

Range i: 17
Cumulative frequency C: 0

Range i: 18
Cumulative frequency C: 5

Range i: 19
Cumulative frequency C: 19

N/2 is in range 19
Range 19: [300000, 399999]
C_i-1: 5
L_i: 300000
F_i: 14
W_i: 99999
Median: 364318.60563380277


N/2: 42.995305164319255

Range i: 0
Cumulative frequency C: 0

Range i: 1
Cumulative fr

In [67]:
# Column dtypes, followed by the first rows of the computed medians
print(results.dtypes)
results.head(5)

census_geoid     object
pff_variable     object
geotype          object
e               float64
dtype: object


Unnamed: 0,census_geoid,pff_variable,geotype,e
0,36005001901,mdvl,CT20,364318.6056338028
1,36005001902,mdvl,CT20,361892.9647887324


In [68]:
# Perform the full calculation (including cleaning/rounding) to show the display output
full_calc = calculate(pff_variable, geotype)
print(full_calc.dtypes)
print(full_calc[full_calc["census_geoid"].isin(["36005001901", "36005001902"])])

census_geoid     object
labs_geoid       object
geotype          object
labs_geotype     object
pff_variable     object
c               float64
e               float64
m               float64
p               float64
z               float64
dtype: object
  census_geoid labs_geoid geotype labs_geotype pff_variable  \
4  36005001901    2001901    CT20       CT2020         mdvl   
5  36005001902    2001902    CT20       CT2020         mdvl   

                      c                          e                          m  \
4 62.200000000000002842 364,319.000000000000000000 372,638.000000000000000000   
5 18.399999999999998579 361,893.000000000000000000 109,731.000000000000000000   

    p   z  
4 nan nan  
5 nan nan  
