In [101]:
from collections import defaultdict
import geopandas as gp
from scipy.stats import wilcoxon, pearsonr
from collections import defaultdict
import pandas as pd
from src.constants import BniaIndicators
from src.exp_helper import *
from sklearn.metrics import r2_score

In [79]:
# Total population per CSA, read from the '2010' sheet of the indicators workbook.
# The 'Baltimore City' row is dropped (presumably the city-wide aggregate — TODO confirm).
population = (
    pd.read_excel('data/open-baltimore/raw/VS16_Indicators_2010-2016.xlsx', sheet_name='2010')
    .set_index('CSA2010')
    .drop('Baltimore City')
    .loc[:, 'Total Population']
)

In [5]:
# Indicator table from the '2015' sheet of the same workbook, indexed by 'CSA2010'.
# The 'Baltimore City' row is dropped (presumably the city-wide aggregate — TODO confirm).
indicators = (
    pd.read_excel('data/open-baltimore/raw/VS16_Indicators_2010-2016.xlsx', sheet_name='2015')
    .set_index('CSA2010')
    .drop('Baltimore City')
)

In [55]:
# Experiment configuration shared by the cells below.
train_tw = 60  # handed to Rolling(...) as `tw_past`; unit not shown here (presumably days — TODO confirm)
verbose = 0  # read as a global inside get_true_y and forwarded to data_for_fit
# CompileData is provided by the star import from src.exp_helper.
# NOTE(review): 'bnia_nbh' presumably selects BNIA neighbourhoods (CSAs) as the spatial unit — confirm.
d_nbh = CompileData(spu_name='bnia_nbh')
# Inputs: the 'crime' source with a single 'burglary' category group (exact semantics of
# `category_groups` / `by_category` live in CompileData — TODO confirm).
d_nbh.set_x(['crime'], category_groups={'crime': [['burglary']]}, by_category=False)
# Target: 'crime/burglary'.
d_nbh.set_y('crime/burglary')



In [58]:
# Rolling evaluation schedule (Rolling comes from the src.exp_helper star import).
# NOTE(review): argument meanings are inferred from their names — rsd/red look like the rolling
# start/end dates, rstep the step, tw_past/tw_pred the past and prediction window lengths; confirm.
er_bower_2 = Rolling(rsd='2016-07-01', red='2017-06-30', rstep=1, tw_past=train_tw, tw_pred=2)
def get_true_y(compile_data, eval_roller, x_setting='time_indexed_points', y_setting='event_cnt'):
    """Collect the observed target for every window yielded by ``eval_roller.roll()``.

    Parameters
    ----------
    compile_data : object passed straight through to ``data_for_fit``.
    eval_roller : object whose ``roll()`` yields 4-tuples
        ``(past_sd, past_ed, pred_sd, pred_ed)``.
    x_setting, y_setting : forwarded unchanged to ``data_for_fit``.

    Returns
    -------
    pandas.DataFrame
        One row per window, indexed by the label
        ``'X: <past_sd>~<past_ed> -> Y: <pred_sd>~<pred_ed>'``, with a single
        ``'true_y'`` column holding the second value returned by ``data_for_fit``.

    Notes
    -----
    Reads the notebook-level global ``verbose`` and forwards it to ``data_for_fit``.
    """
    observed = {}
    for window in eval_roller.roll():
        past_sd, past_ed, pred_sd, pred_ed = window
        label = 'X: %s~%s -> Y: %s~%s' % (past_sd, past_ed, pred_sd, pred_ed)
        # Only the target half of the (x, y) pair is kept.
        _, y_true = data_for_fit(compile_data, x_setting=x_setting, y_setting=y_setting,
                                 dates=window, verbose=verbose)
        observed[label] = {'true_y': y_true}
    return pd.DataFrame.from_dict(observed, orient='index')
# Observed targets for d_nbh: one row per rolling window of er_bower_2, single 'true_y' column.
nbh_true_y = get_true_y(d_nbh, er_bower_2)

In [26]:
# Household-income indicator names that are actually present as columns of `indicators`.
# Sorted so the order is reproducible: the iteration order of a set of strings changes
# between kernel sessions (string hash randomization), which made re-runs print and
# process the columns in a different order each time.
cols = sorted(set(indicators.columns) & set(BniaIndicators.household_income))

In [74]:
# List the selected indicator columns next to the shape of the whole selected sub-frame.
# `indicators[cols]` does not depend on the loop variable, so it is evaluated once instead
# of once per column; the printed text is unchanged.
# NOTE(review): if a per-column shape was intended, this should read `indicators[col].shape`.
subset_shape = indicators[cols].shape
for col in cols:
    print(col, subset_shape)

Percent of Households Earning Less than $25,000 (55, 8)
Percent of Family Households Living Below the Poverty Line (55, 8)
Percent of Households Earning More than $75,000 (55, 8)
Median Household Income (55, 8)
Percent of Households Earning $25,000 to $40,000 (55, 8)
Percent of Households Earning $60,000 to $75,000 (55, 8)
Percent of Children Living Below the Poverty Line (55, 8)
Percent of Households Earning $40,000 to $60,000 (55, 8)


In [37]:
# Summary statistics of the first element stored in each cell of the `kde200` and `bower`
# columns, shown side by side.
# NOTE(review): eval() executes whatever text the CSV cells contain; only safe while
# 'exp_res/bower_2day.csv' is a trusted, locally generated file. Consider
# ast.literal_eval if the cells are plain Python literals.
eval_res_2d = pd.read_csv('exp_res/bower_2day.csv')
pd.concat(
    [eval_res_2d[model].apply(lambda cell: eval(cell)[0]).describe() for model in ('kde200', 'bower')],
    axis=1,
)

In [63]:
# Per-period results read from CSV (first column used as the index), with the observed
# 'true_y' column from `nbh_true_y` joined on that index.
res_cnt = (
    pd.read_csv('exp_res/bower_2day_bnia_top20_hotspots.csv', index_col=0)
    .join(nbh_true_y)
)

In [117]:
# For every evaluation period and every household-income indicator, correlate the indicator
# with (a) the observed values and (b) each model's predicted values, and record how well
# each model's predictions match the observations (R^2).
#
# assumes `res_cnt` has exactly three columns, in the order (bower, kde200, true_y), and that
# every vector is ordered like the rows of `indicators` — TODO confirm; pearsonr pairs
# values purely by position.
result_columns = ['period', 'indicator', 'rtrue', 'ptrue', 'rbower', 'pbower', 'rp_bower',
                  'rkde200', 'pkde200', 'rp_kde200', 'r2true_bower', 'r2true_kde200']

pearsonr_records = []
for period, (bower, kde200, true_y) in res_cnt.iterrows():
    # NOTE(review): eval() executes the text stored in the CSV cells; only safe while the
    # results file is trusted. Consider ast.literal_eval if the cells are plain literals.
    bower = eval(bower)
    kde200 = eval(kde200)

    # R^2 of prediction vs. observation does not depend on the indicator, so it is computed
    # once per period rather than once per (period, indicator) pair.
    r2true_bower = r2_score(true_y, bower)
    r2true_kde200 = r2_score(true_y, kde200)

    for idctr_col in cols:
        idctr = indicators[idctr_col].values
        rtrue, ptrue = pearsonr(true_y, idctr)
        rbower, pbower = pearsonr(bower, idctr)
        rkde200, pkde200 = pearsonr(kde200, idctr)
        pearsonr_records.append({
            'period': period, 'indicator': idctr_col,
            'rtrue': rtrue, 'ptrue': ptrue,
            'rbower': rbower, 'pbower': pbower,
            # Pre-formatted "r, p" strings used as cell values in the pivoted result tables.
            'rp_bower': 'pear=%0.4f, p=%0.4f' % (rbower, pbower),
            'rkde200': rkde200, 'pkde200': pkde200,
            'rp_kde200': 'pear=%0.4f, p=%0.4f' % (rkde200, pkde200),
            'r2true_bower': r2true_bower, 'r2true_kde200': r2true_kde200,
        })

# Passing `columns=` fixes the column order and also yields a well-formed empty frame
# (instead of a KeyError) when there are no records.
pearsonr_res = pd.DataFrame(pearsonr_records, columns=result_columns)


In [132]:
# Significance thresholds applied to the Pearson p-values in the cells below.
p_thres_true = 0.05  # compared against `ptrue` (indicator vs. observed values)
p_thres_pred = 0.05  # compared against `pbower` / `pkde200` (indicator vs. predicted values)

In [159]:
def get_res_table(pearsonr_res, cond, kind, periods):
    """Pivot the rows selected by ``cond`` into a period x indicator table.

    Parameters
    ----------
    pearsonr_res : DataFrame with 'period', 'indicator' and 'rp_<kind>' columns.
    cond : boolean mask used to select rows of ``pearsonr_res``.
    kind : suffix of the value column to show; the cells come from ``'rp_' + kind``.
    periods : labels used to reindex the rows of the result.

    Returns
    -------
    DataFrame indexed by ``periods`` with one column per indicator that has at least
    one selected row. Periods without a selected row are NaN.
    """
    value_col = 'rp_' + kind
    selected = pearsonr_res[cond]
    table = selected.pivot(index='period', columns='indicator', values=value_col)
    return table.reindex(periods)

In [160]:
# Rows where the indicator is NOT significantly correlated with the observed values
# (ptrue >= threshold) but IS significantly correlated with the `bower` predictions.
observed_not_significant = pearsonr_res['ptrue'] >= p_thres_true
bower_significant = pearsonr_res['pbower'] < p_thres_pred
bower_bias_sig = observed_not_significant & bower_significant
ires = get_res_table(pearsonr_res, bower_bias_sig, 'bower', res_cnt.index)
# Share of periods, per indicator, in which that condition holds.
ires.notnull().mean()

indicator
Median Household Income                                       0.122517
Percent of Children Living Below the Poverty Line             0.539735
Percent of Family Households Living Below the Poverty Line    0.456954
Percent of Households Earning $25,000 to $40,000              0.066225
Percent of Households Earning $40,000 to $60,000              0.039735
Percent of Households Earning Less than $25,000               0.115894
Percent of Households Earning More than $75,000               0.168874
dtype: float64

In [161]:
# Rows where the indicator IS significantly correlated with the observed values
# (ptrue < threshold) but NOT with the `bower` predictions.
# `p_thres` is never defined in this notebook (NameError on a fresh kernel), so the
# thresholds defined above are used: p_thres_true for observations, p_thres_pred for predictions.
bower_bias_not_sig = (pearsonr_res.ptrue < p_thres_true) & (pearsonr_res.pbower >= p_thres_pred)
ires = get_res_table(pearsonr_res, bower_bias_not_sig, 'bower', res_cnt.index)
# Share of periods, per indicator, in which that condition holds.
(~ires.isnull()).mean()

indicator
Median Household Income                                       0.003311
Percent of Family Households Living Below the Poverty Line    0.009934
Percent of Households Earning $25,000 to $40,000              0.006623
Percent of Households Earning $40,000 to $60,000              0.006623
Percent of Households Earning $60,000 to $75,000              0.003311
Percent of Households Earning Less than $25,000               0.006623
Percent of Households Earning More than $75,000               0.003311
dtype: float64

In [162]:
# Rows where the indicator is NOT significantly correlated with the observed values
# (ptrue >= threshold) but IS significantly correlated with the `kde200` predictions.
observed_not_significant = pearsonr_res['ptrue'] >= p_thres_true
kde200_significant = pearsonr_res['pkde200'] < p_thres_pred
kde200_bias_sig = observed_not_significant & kde200_significant
ires = get_res_table(pearsonr_res, kde200_bias_sig, 'kde200', res_cnt.index)
# Share of periods, per indicator, in which that condition holds.
ires.notnull().mean()


indicator
Median Household Income                                       0.135762
Percent of Children Living Below the Poverty Line             0.668874
Percent of Family Households Living Below the Poverty Line    0.523179
Percent of Households Earning $25,000 to $40,000              0.086093
Percent of Households Earning $40,000 to $60,000              0.009934
Percent of Households Earning Less than $25,000               0.215232
Percent of Households Earning More than $75,000               0.152318
dtype: float64

In [163]:

# Rows where the indicator IS significantly correlated with the observed values
# (ptrue < threshold) but NOT with the `kde200` predictions.
# `p_thres` is never defined in this notebook (NameError on a fresh kernel), so the
# thresholds defined above are used: p_thres_true for observations, p_thres_pred for predictions.
kde200_bias_not_sig = (pearsonr_res.ptrue < p_thres_true) & (pearsonr_res.pkde200 >= p_thres_pred)
ires = get_res_table(pearsonr_res, kde200_bias_not_sig, 'kde200', res_cnt.index)
# Share of periods, per indicator, in which that condition holds.
(~ires.isnull()).mean()

indicator
Median Household Income                                       0.003311
Percent of Family Households Living Below the Poverty Line    0.003311
Percent of Households Earning $25,000 to $40,000              0.006623
Percent of Households Earning $40,000 to $60,000              0.006623
Percent of Households Earning $60,000 to $75,000              0.003311
Percent of Households Earning Less than $25,000               0.006623
Percent of Households Earning More than $75,000               0.003311
dtype: float64