In [1]:
from collections import defaultdict
import geopandas as gp
from scipy.stats import wilcoxon, pearsonr, spearmanr, kendalltau
from collections import defaultdict
import pandas as pd
from src.constants import BniaIndicators
from src.exp_helper import *
from sklearn.metrics import r2_score, explained_variance_score
from IPython.display import display

In [2]:
POP = pd.read_excel('data/open-baltimore/raw/VS16_Indicators_2010-2016.xlsx',sheet_name='2010')
POP = POP.set_index('CSA2010').drop('Baltimore City')
POP = POP['Total Population']
IDCTR = pd.read_excel('data/open-baltimore/raw/VS16_Indicators_2010-2016.xlsx',sheet_name='2015')
IDCTR = IDCTR.set_index('CSA2010').drop('Baltimore City')

In [3]:
income_cols = [col for col in BniaIndicators.household_income if col in set(IDCTR.columns.tolist())]
housing_cols = [col for col in BniaIndicators.housing if col in set(IDCTR.columns.tolist())]

In [4]:
def norm_by_population(df):
    for p in df.index:
#         print(p)
        for c in ['bower','kde200']:
            arr = df.loc[p,c]
            df.loc[p,c]=arr/(POP.values)


In [5]:
def iter_keys():
    for xday in [2, 7][:1]:
        for fpn in ['fp','fn']:
            for hname in ['top20', 'above_mean', 'above_mean_std']:
                for normed in [True, False]:
                    yield xday, fpn, hname, normed
def keys2name(xday, fpn, hname, normed):
    s = '%dd-%s-%s' % (xday, fpn, hname)
    if normed:
        return s+'-normed'
    return s + '-raw'

In [7]:
# pred_res[xday][fpn][hname][normed]
pred_res = defaultdict(lambda: defaultdict(lambda : defaultdict(dict)))
for xday, fpn, hname, normed in iter_keys():
    df = pd.read_csv('exp_res/bower_%dday_bnia_%s_hotspots_%s.csv' % (xday, hname, fpn), index_col=0).applymap(eval)
    if normed: 
        norm_by_population(df)
    pred_res[xday][fpn][hname][normed]=df

In [8]:
def statistical_test(pred_df, test, idctr_cols):
    stest = []
    for period, (bower, kde200) in pred_df.iterrows():
        for icol in idctr_cols:
            res = {'period': period, 'indicator': icol}
            idctr = IDCTR[icol].values
            # bower
            r_bower, p_bower = test(bower, idctr)
            res['r_bower'] = r_bower
            res['p_bower'] = p_bower
            res['rp_bower'] = 'r=%0.4f, p=%0.4f' % (r_bower, p_bower)
            # kde 
            r_kde200, p_kde200 = test(kde200, idctr)
            res['r_kde200'] = r_kde200
            res['p_kde200'] = p_kde200
            res['rp_kde200'] = 'r=%0.4f, p=%0.4f' % (r_kde200, p_kde200)
            stest.append(res)
    
    return pd.DataFrame(stest)

In [9]:
def keep_sig_per_period(df, sig_lvl, mname, periods):
    pivoted = df[df['p_'+mname]<sig_lvl].pivot(index='period', columns='indicator', values='rp_'+mname) 
    return pivoted.reindex(periods)
def sig_pcnt_periods(df, sig_lvl, mname, periods):
    ires = keep_sig_per_period(df, sig_lvl, mname, periods)
    return (~ires.isnull()).mean()

In [10]:
all_periods = {2: pred_res[2]['fn']['above_mean_std'][False].index}

In [11]:
test_choices = [(pearsonr,'pearsonr'), (spearmanr,'spearmanr'), (kendalltau,'kendalltau')]    

In [12]:
sum_table_each_test = {}
for test, tname in test_choices:
    period_idctr_stest_rp = defaultdict(lambda: defaultdict(lambda : defaultdict(dict)))
    for xday, fpn, hname, normed in iter_keys():
        df = pred_res[xday][fpn][hname][normed]
        period_idctr_stest_rp[xday][fpn][hname][normed]= statistical_test(df, test, income_cols)
    sig_pcnt_of_each_method = []
    for mname in ['bower', 'kde200']:
        for xday, fpn, hname, normed in iter_keys():
            stest = period_idctr_stest_rp[xday][fpn][hname][normed]
            sig_pcnt = sig_pcnt_periods(stest, 0.05, 'bower', all_periods[xday])
            sig_pcnt.name = '%s-%s' % (mname, keys2name(xday, fpn, hname, normed))
            sig_pcnt_of_each_method.append(sig_pcnt)
    sum_table =pd.DataFrame(sig_pcnt_of_each_method).T.reindex(income_cols).fillna(0)
    sum_table.index = [col.replace('$','\$') for col in sum_table.index]
    tuples = [col.split('-') for col in sum_table.columns]
    index = pd.MultiIndex.from_tuples(tuples)
    sum_table.columns = index
    sum_table_each_test[tname] = sum_table

In [18]:
tname = 'kendalltau'  # pearsonr spearmanr kendalltau
sum_table = sum_table_each_test[tname]
for mname in ('bower','kde200'):
    for xday in ('2d',):
        print()
        tmp_df = sum_table.loc[:, (mname, xday)].copy()
        tmp_df.index.name = '%s-%s-%s' % (tname, mname, xday)
        display(tmp_df)
        print('    ')
        print('---------------------')




Unnamed: 0_level_0,fp,fp,fp,fp,fp,fp,fn,fn,fn,fn,fn,fn
Unnamed: 0_level_1,top20,top20,above_mean,above_mean,above_mean_std,above_mean_std,top20,top20,above_mean,above_mean,above_mean_std,above_mean_std
Unnamed: 0_level_2,normed,raw,normed,raw,normed,raw,normed,raw,normed,raw,normed,raw
kendalltau-bower-2d,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Median Household Income,0.069536,0.049669,0.145695,0.069536,0.02649,0.019868,0.046358,0.046358,0.049669,0.039735,0.049669,0.039735
"Percent of Households Earning Less than \$25,000",0.195364,0.07947,0.264901,0.039735,0.07947,0.039735,0.056291,0.059603,0.056291,0.05298,0.059603,0.056291
"Percent of Households Earning \$25,000 to \$40,000",0.009934,0.05298,0.003311,0.072848,0.029801,0.019868,0.049669,0.043046,0.05298,0.05298,0.066225,0.059603
"Percent of Households Earning \$40,000 to \$60,000",0.0,0.0,0.0,0.006623,0.006623,0.0,0.046358,0.049669,0.049669,0.046358,0.043046,0.046358
"Percent of Households Earning \$60,000 to \$75,000",0.109272,0.006623,0.069536,0.0,0.036424,0.009934,0.029801,0.02649,0.046358,0.046358,0.033113,0.023179
"Percent of Households Earning More than \$75,000",0.145695,0.178808,0.331126,0.357616,0.023179,0.033113,0.046358,0.029801,0.059603,0.049669,0.056291,0.056291
Percent of Family Households Living Below the Poverty Line,0.516556,0.417219,0.55298,0.417219,0.31457,0.301325,0.076159,0.062914,0.066225,0.059603,0.092715,0.07947
Percent of Children Living Below the Poverty Line,0.582781,0.460265,0.65894,0.486755,0.354305,0.258278,0.082781,0.072848,0.072848,0.062914,0.089404,0.076159


    
---------------------



Unnamed: 0_level_0,fp,fp,fp,fp,fp,fp,fn,fn,fn,fn,fn,fn
Unnamed: 0_level_1,top20,top20,above_mean,above_mean,above_mean_std,above_mean_std,top20,top20,above_mean,above_mean,above_mean_std,above_mean_std
Unnamed: 0_level_2,normed,raw,normed,raw,normed,raw,normed,raw,normed,raw,normed,raw
kendalltau-kde200-2d,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Median Household Income,0.069536,0.049669,0.145695,0.069536,0.02649,0.019868,0.046358,0.046358,0.049669,0.039735,0.049669,0.039735
"Percent of Households Earning Less than \$25,000",0.195364,0.07947,0.264901,0.039735,0.07947,0.039735,0.056291,0.059603,0.056291,0.05298,0.059603,0.056291
"Percent of Households Earning \$25,000 to \$40,000",0.009934,0.05298,0.003311,0.072848,0.029801,0.019868,0.049669,0.043046,0.05298,0.05298,0.066225,0.059603
"Percent of Households Earning \$40,000 to \$60,000",0.0,0.0,0.0,0.006623,0.006623,0.0,0.046358,0.049669,0.049669,0.046358,0.043046,0.046358
"Percent of Households Earning \$60,000 to \$75,000",0.109272,0.006623,0.069536,0.0,0.036424,0.009934,0.029801,0.02649,0.046358,0.046358,0.033113,0.023179
"Percent of Households Earning More than \$75,000",0.145695,0.178808,0.331126,0.357616,0.023179,0.033113,0.046358,0.029801,0.059603,0.049669,0.056291,0.056291
Percent of Family Households Living Below the Poverty Line,0.516556,0.417219,0.55298,0.417219,0.31457,0.301325,0.076159,0.062914,0.066225,0.059603,0.092715,0.07947
Percent of Children Living Below the Poverty Line,0.582781,0.460265,0.65894,0.486755,0.354305,0.258278,0.082781,0.072848,0.072848,0.062914,0.089404,0.076159


    
---------------------
