In [None]:
"""
# Grouped Median Income Tabulation
Last Updated: 11/09/22
@author: Michael Ma
"""

import pandas as pd
import sqlalchemy as sql
import pymssql
import numpy as np

# Demographic data from SQL Server.
# Load household counts per mgra and income group (datasource_id 45,
# yr_id 2021) from the demographic warehouse into the `income` DataFrame.
ddam = sql.create_engine('mssql+pymssql://xxxxxxx/')
# Run the query on a connection taken from the `ddam` engine; the `with`
# block closes the connection as soon as the result set has been read.
with ddam.connect() as connection:
    income = pd.read_sql(
"""   
SELECT d.mgra, g.income_group_id ,g.name AS income, SUM(i.households) AS households
FROM demographic_warehouse.fact.household_income i 
JOIN demographic_warehouse.dim.income_group g ON i.income_group_id = g.income_group_id
JOIN demographic_warehouse.dim.mgra_denormalize d ON i.mgra_id = d.mgra_id
WHERE datasource_id = 45 AND yr_id = 2021
GROUP BY d.mgra, g.name, g.income_group_id
ORDER BY d.mgra, g.income_group_id
""", con=connection)
# display(income)


# Lower dollar limit of every income group, keyed by the group's label.
_INCOME_GROUP_LOWER_BOUNDS = {
    'Less than $15,000': 0,
    '$15,000 to $29,999': 15000,
    '$30,000 to $44,999': 30000,
    '$45,000 to $59,999': 45000,
    '$60,000 to $74,999': 60000,
    '$75,000 to $99,999': 75000,
    '$100,000 to $124,999': 100000,
    '$125,000 to $149,999': 125000,
    '$150,000 to $199,999': 150000,
    '$200,000 or more': 200000,
}


def income_group_lower_bound(x):
    """Return the lower dollar limit of the income group labelled *x*.

    A label that is not one of the ten known income groups yields ``None``.
    """
    return _INCOME_GROUP_LOWER_BOUNDS.get(x)
    
# Attach each row's lower class limit, looked up from its income-group label.
income['lower_bound'] = income['income'].map(income_group_lower_bound)

# Upper dollar limit of every income group, keyed by the group's label.
# The first group maps to 15000 and the open-ended top group to 200000
# (the same value as its lower limit).
_INCOME_GROUP_UPPER_BOUNDS = {
    'Less than $15,000': 15000,
    '$15,000 to $29,999': 29999,
    '$30,000 to $44,999': 44999,
    '$45,000 to $59,999': 59999,
    '$60,000 to $74,999': 74999,
    '$75,000 to $99,999': 99999,
    '$100,000 to $124,999': 124999,
    '$125,000 to $149,999': 149999,
    '$150,000 to $199,999': 199999,
    '$200,000 or more': 200000,
}


def income_group_upper_bound(x):
    """Return the upper dollar limit of the income group labelled *x*.

    A label that is not one of the ten known income groups yields ``None``.
    """
    return _INCOME_GROUP_UPPER_BOUNDS.get(x)
    
# Attach each row's upper class limit, looked up from its income-group label.
income['upper_bound'] = income['income'].map(income_group_upper_bound)


# Median Income for Grouped Data (Source: https://www.statology.org/median-of-grouped-data/)
'''
L: Lower limit of median class
W: Width of median class
N: Total Frequency
C: Cumulative frequency of all classes before the median class
F: Frequency of median class

Median = L + W[(N/2 - C) / F]
'''

# Median income function (for different geographies)
def _grouped_median(rows):
    """Return the grouped-data median income for one geography's rows.

    ``rows`` must carry the columns 'lower_bound', 'upper_bound' and
    'households'. Returns NaN when the rows hold no households.
    """
    # Collapse to one row per income class. groupby sorts by its keys, so the
    # classes come out in ascending order no matter how the input was ordered
    # or how many input rows fall into the same class.
    classes = (
        rows.groupby(['lower_bound', 'upper_bound'], as_index=False, sort=True)['households']
        .sum()
    )

    total = classes['households'].sum()
    if total <= 0:
        # No households means no median class; report "no value" rather than fail.
        return float('nan')

    cumulative = classes['households'].cumsum()
    # Median class: the first class whose cumulative count exceeds N/2.
    position = int((cumulative > total / 2).to_numpy().argmax())
    median_class = classes.iloc[position]

    lower_limit = median_class['lower_bound']
    width = median_class['upper_bound'] - median_class['lower_bound']
    frequency = median_class['households']
    # Cumulative count before the median class. Deriving it from the median
    # class itself gives 0 when that class is the first one, and stays correct
    # when an earlier class ends exactly on N/2.
    preceding = cumulative.iloc[position] - frequency

    return lower_limit + width * ((total / 2 - preceding) / frequency)


def MedianIncome(Table, Geo):
    """Compute the grouped-data median income for every geography in a table.

    Applies ``Median = L + W * ((N/2 - C) / F)`` within each geography, where
    L and W are the lower limit and width of the median class, N the total
    number of households, C the cumulative households before the median class
    and F the households in the median class.

    Parameters
    ----------
    Table : pandas.DataFrame
        Must contain the column named by ``Geo`` plus 'households',
        'lower_bound' and 'upper_bound'. Rows sharing an income class within
        a geography are summed, so the table does not need to be
        pre-aggregated or pre-sorted. The table is not modified.
    Geo : str
        Name of the column that identifies the geography.

    Returns
    -------
    pandas.DataFrame
        Columns 'Geography' and 'Median_Income', one row per geography in
        order of first appearance. Median_Income is NaN for a geography
        with no households.
    """
    medians = [
        [geography, _grouped_median(rows)]
        for geography, rows in Table.groupby(Geo, sort=False)
    ]
    return pd.DataFrame(medians, columns=['Geography', 'Median_Income'])