In [1]:
from collections import defaultdict
import geopandas as gp
from scipy.stats import wilcoxon, pearsonr
from collections import defaultdict
import pandas as pd
from src.constants import BniaIndicators
from src.exp_helper import *
from sklearn.metrics import r2_score, explained_variance_score
from IPython.display import display

In [2]:
population = pd.read_excel('data/open-baltimore/raw/VS16_Indicators_2010-2016.xlsx',sheet_name='2010')
population = population.set_index('CSA2010').drop('Baltimore City')
population = population['Total Population']

In [3]:
indicators = pd.read_excel('data/open-baltimore/raw/VS16_Indicators_2010-2016.xlsx',sheet_name='2015')
indicators = indicators.set_index('CSA2010').drop('Baltimore City')

In [4]:
income_cols = list(set(indicators.columns.tolist()) & set(BniaIndicators.household_income))

# true y event count for bnia nbh

In [5]:
train_tw = 60
verbose = 0
d_nbh = CompileData(spu_name='bnia_nbh')
d_nbh.set_x(['crime'], category_groups={'crime': [['burglary']]}, by_category=False)
d_nbh.set_y('crime/burglary')



In [6]:
def get_true_y(compile_data, eval_roller, x_setting='time_indexed_points', y_setting='event_cnt'):
    pred_res = defaultdict(dict)
    for i, dates in enumerate(eval_roller.roll()):
        past_sd, past_ed, pred_sd, pred_ed = dates
        period = 'X: %s~%s -> Y: %s~%s' % (past_sd, past_ed, pred_sd, pred_ed)
        eval_x, eval_y = data_for_fit(compile_data, x_setting=x_setting, y_setting=y_setting, dates=dates,
                                      verbose=verbose)
        pred_res[period]['true_y'] = eval_y
    return pd.DataFrame.from_dict(pred_res, 'index')


In [7]:
er_bower_2d = Rolling(rsd='2016-07-01', red='2017-06-30', rstep=1, tw_past=train_tw, tw_pred=2)
nbh_true_y_2d = get_true_y(d_nbh, er_bower_2d)

In [8]:
er_bower_7d = Rolling(rsd='2016-07-01', red='2017-06-30', rstep=1, tw_past=train_tw, tw_pred=7)
nbh_true_y_7d = get_true_y(d_nbh, er_bower_7d)

# evaluate the model

In [9]:
eval_res_2d = pd.read_csv('exp_res/bower_2day.csv')
pd.concat([eval_res_2d.kde200.apply(lambda x: eval(x)[0]).describe(), eval_res_2d.bower.apply(lambda x: eval(x)[0]).describe()], axis=1)

Unnamed: 0,kde200,bower
count,302.0,302.0
mean,0.566057,0.567833
std,0.082718,0.085983
min,0.266667,0.327586
25%,0.5,0.514502
50%,0.565385,0.568182
75%,0.625,0.626838
max,0.75,0.809524


In [10]:
eval_res_7d = pd.read_csv('exp_res/bower_7day.csv')
pd.concat([eval_res_7d.kde200.apply(lambda x: eval(x)[0]).describe(), eval_res_7d.bower.apply(lambda x: eval(x)[0]).describe()], axis=1)

Unnamed: 0,kde200,bower
count,297.0,297.0
mean,0.564283,0.559525
std,0.048638,0.044542
min,0.430894,0.438776
25%,0.5375,0.53125
50%,0.56391,0.556886
75%,0.594937,0.586667
max,0.713333,0.702857


# load predict result

In [11]:
pred_res_top20_hotspots_count_2d = pd.read_csv('exp_res/bower_2day_bnia_top20_hotspots.csv', index_col=0).join(nbh_true_y_2d)
pred_res_sum_risk_hotspots_2d = pd.read_csv('exp_res/bower_2day_bnia_sum_risk_hotspots.csv', index_col=0).join(nbh_true_y_2d)
pred_res_above_mean_hotspots_count_2d = pd.read_csv('exp_res/bower_2day_bnia_above_mean_hotspots.csv', index_col=0).join(nbh_true_y_2d)
pred_res_above_mean_std_hotspots_count_2d = pd.read_csv('exp_res/bower_2day_bnia_above_mean_std_hotspots.csv', index_col=0).join(nbh_true_y_2d)

In [12]:
pred_res_top20_hotspots_count_7d = pd.read_csv('exp_res/bower_7day_bnia_top20_hotspots.csv', index_col=0).join(nbh_true_y_7d)
pred_res_sum_risk_hotspots_7d = pd.read_csv('exp_res/bower_7day_bnia_sum_risk_hotspots.csv', index_col=0).join(nbh_true_y_7d)
pred_res_above_mean_hotspots_count_7d = pd.read_csv('exp_res/bower_7day_bnia_above_mean_hotspots.csv', index_col=0).join(nbh_true_y_7d)
pred_res_above_mean_std_hotspots_count_7d = pd.read_csv('exp_res/bower_7day_bnia_above_mean_std_hotspots.csv', index_col=0).join(nbh_true_y_7d)

# get pearson correlation

In [13]:
def pear_ana(res_cnt, cols):
    pearsonr_res = []
    for period, (bower, kde200, true_y) in res_cnt.iterrows():
        bower = eval(bower)
        kde200 = eval(kde200)
        for idctr_col in cols:
            idctr = indicators[idctr_col].values   
            rtrue,ptrue = pearsonr(true_y,idctr)
            rp_true = 'pear=%0.4f, p=%0.4f' % (rtrue,ptrue)
            rbower,pbower = pearsonr(bower,idctr)
            rp_bower = 'pear=%0.4f, p=%0.4f' % (rbower,pbower)
            rkde200,pkde200 = pearsonr(kde200,idctr)
            rp_kde200 = 'pear=%0.4f, p=%0.4f' % (rkde200,pkde200)
            r2true_bower = r2_score(true_y, bower)
            r2true_kde200 = r2_score(true_y, kde200)
            pear_true_bower,_ = pearsonr(true_y, bower)
            pear_true_kde200,_ = pearsonr(true_y, kde200)
            pearsonr_res.append({'period': period, 'indicator': idctr_col, 
                                 'rtrue': rtrue, 'ptrue':ptrue, 'rp_true':rp_true,
                                 'rbower': rbower,'pbower':pbower, 'rp_bower': rp_bower,
                                 'rkde200': rkde200, 'pkde200': pkde200, 'rp_kde200': rp_kde200,
                                 'r2true_bower': r2true_bower, 'r2true_kde200': r2true_kde200,
                                 'pear_true_bower': pear_true_bower, 'pear_true_kde200': pear_true_kde200
                                })
    #     break

    pearsonr_res= pd.DataFrame(pearsonr_res)[['period', 'indicator', 'rtrue', 'ptrue', 'rp_true','rbower', 'pbower', 
                                              'rp_bower', 'rkde200', 'pkde200', 'rp_kde200', 
                                              'r2true_bower', 'r2true_kde200', 'pear_true_bower', 'pear_true_kde200']]
    return pearsonr_res

In [14]:
pear_top20_hotspots_count_2d = pear_ana(top20_hotspots_count_2d,income_cols)
pear_sum_risk_hotspots_2d = pear_ana(sum_risk_hotspots_2d,income_cols)
pear_above_mean_hotspots_count_2d = pear_ana(above_mean_hotspots_count_2d,income_cols)
pear_above_mean_std_hotspots_count_2d = pear_ana(above_mean_std_hotspots_count_2d,income_cols)

NameError: name 'top20_hotspots_count_2d' is not defined

In [None]:
pear_top20_hotspots_count_7d = pear_ana(top20_hotspots_count_7d,income_cols)
pear_sum_risk_hotspots_7d = pear_ana(sum_risk_hotspots_7d,income_cols)
pear_above_mean_hotspots_count_7d = pear_ana(above_mean_hotspots_count_7d,income_cols)
pear_above_mean_std_hotspots_count_7d = pear_ana(above_mean_std_hotspots_count_7d,income_cols)

# analyze bias

In [None]:
p_thres_true = 0.05
p_thres_pred = 0.05

In [None]:
def get_res_table(pearsonr_res, cond, kind, periods):
    res = pearsonr_res[cond].pivot(index='period', columns='indicator', values='rp_'+kind) 
    return res.reindex(periods)
def mean_not_null(pearsonr_res, cond, kind, periods):
    ires = get_res_table(pearsonr_res, cond, kind, periods)
    return (~ires.isnull()).mean()

In [None]:
def get_res_table_for_4kinds(pearsonr_res, res_cnt):
    
    bower_bias_sig = (pearsonr_res.ptrue>=p_thres_true) &(pearsonr_res.pbower<p_thres_pred)
    bsig_not_null = mean_not_null(pearsonr_res, bower_bias_sig, 'bower', res_cnt.index)
    
    bower_bias_not_sig = (pearsonr_res.ptrue<p_thres_true) &(pearsonr_res.pbower>=p_thres_pred)
    bnotsig_not_null = mean_not_null(pearsonr_res, bower_bias_not_sig, 'bower', res_cnt.index)
    
    kde200_bias_sig = (pearsonr_res.ptrue>=p_thres_true) &(pearsonr_res.pkde200<p_thres_pred)
    ksig_not_null = mean_not_null(pearsonr_res, kde200_bias_sig, 'kde200', res_cnt.index)
    
    kde200_bias_not_sig = (pearsonr_res.ptrue<p_thres_true) &(pearsonr_res.pkde200>=p_thres_pred)
    knotsig_not_null = mean_not_null(pearsonr_res, kde200_bias_not_sig, 'kde200', res_cnt.index)
    
    true_sig = (pearsonr_res.ptrue<p_thres_true) 
    tsig = mean_not_null(pearsonr_res, true_sig, 'true', res_cnt.index)
    
    tmp_df = pd.concat([bsig_not_null,bnotsig_not_null, ksig_not_null, knotsig_not_null, tsig],axis=1,sort=True)
    tmp_df.columns = ['bower sig.', 'bower not sig.', 'kde200 sig.', 'kde200 not sig.', 'true sig.']
    tmp_df.index.name = 'pred sig. different than true sig. level'
    return tmp_df

In [None]:
sig_top20_hotspots_count_2d = get_res_table_for_4kinds(pear_top20_hotspots_count_2d, pred_res_top20_hotspots_count_2d)
sig_sum_risk_hotspots_2d = get_res_table_for_4kinds(pear_sum_risk_hotspots_2d, pred_res_sum_risk_hotspots_2d)
sig_above_mean_hotspots_count_2d = get_res_table_for_4kinds(pear_above_mean_hotspots_count_2d, pred_res_above_mean_hotspots_count_2d)
sig_above_mean_std_hotspots_count_2d = get_res_table_for_4kinds(pear_above_mean_std_hotspots_count_2d, pred_res_above_mean_std_hotspots_count_2d)

true_sig_pcnt = sig_top20_hotspots_count_2d['true sig.']

c = 'bower sig.'
bower_sig_pcnt_2d = pd.concat([true_sig_pcnt, 
                 sig_top20_hotspots_count_2d[c], 
                 sig_above_mean_hotspots_count_2d[c], 
                 sig_above_mean_std_hotspots_count_2d[c],
                 sig_sum_risk_hotspots_2d[c], 
                ],
          axis=1)
bower_sig_pcnt_2d.columns = ['true sig.','top20', '>mean', '>mean+std', 'sum_risk', ]
bower_sig_pcnt_2d = bower_sig_pcnt_2d.join(bower_sig_pcnt_2d.divide(true_sig_pcnt, axis=0),rsuffix='/true')
print('=====================',c)
display(bower_sig_pcnt_2d)

c = 'kde200 sig.'
kde200_sig_pcnt_2d = pd.concat([true_sig_pcnt, 
                 sig_top20_hotspots_count_2d[c], 
                 sig_above_mean_hotspots_count_2d[c], 
                 sig_above_mean_std_hotspots_count_2d[c],
                 sig_sum_risk_hotspots_2d[c], 
                ],
          axis=1)
kde200_sig_pcnt_2d.columns = ['true sig.','top20', '>mean', '>mean+std', 'sum_risk', ]
kde200_sig_pcnt_2d=kde200_sig_pcnt_2d.join(kde200_sig_pcnt_2d.divide(true_sig_pcnt, axis=0),rsuffix='/true')
print('=====================',c)
display(kde200_sig_pcnt_2d)

In [None]:
sig_top20_hotspots_count_7d = get_res_table_for_4kinds(pear_top20_hotspots_count_7d, pred_res_top20_hotspots_count_7d)
sig_sum_risk_hotspots_7d = get_res_table_for_4kinds(pear_sum_risk_hotspots_7d, pred_res_sum_risk_hotspots_7d)
sig_above_mean_hotspots_count_7d = get_res_table_for_4kinds(pear_above_mean_hotspots_count_7d, pred_res_above_mean_hotspots_count_7d)
sig_above_mean_std_hotspots_count_7d = get_res_table_for_4kinds(pear_above_mean_std_hotspots_count_7d, pred_res_above_mean_std_hotspots_count_7d)

true_sig_pcnt = sig_top20_hotspots_count_7d['true sig.']

c = 'bower sig.'
bower_sig_pcnt_7d = pd.concat([true_sig_pcnt, 
                 sig_top20_hotspots_count_7d[c], 
                 sig_above_mean_hotspots_count_7d[c], 
                 sig_above_mean_std_hotspots_count_7d[c],
                 sig_sum_risk_hotspots_7d[c], 
                ],
          axis=1)
bower_sig_pcnt_7d.columns = ['true sig.','top20', '>mean', '>mean+std', 'sum_risk', ]
bower_sig_pcnt_7d = bower_sig_pcnt_7d.join(bower_sig_pcnt_7d.divide(true_sig_pcnt, axis=0),rsuffix='/true')
print('=====================',c)
display(bower_sig_pcnt_7d)

c = 'kde200 sig.'
kde200_sig_pcnt_7d = pd.concat([true_sig_pcnt, 
                 sig_top20_hotspots_count_7d[c], 
                 sig_above_mean_hotspots_count_7d[c], 
                 sig_above_mean_std_hotspots_count_7d[c],
                 sig_sum_risk_hotspots_7d[c], 
                ],
          axis=1)
kde200_sig_pcnt_7d.columns = ['true sig.','top20', '>mean', '>mean+std', 'sum_risk', ]
kde200_sig_pcnt_7d=kde200_sig_pcnt_7d.join(kde200_sig_pcnt_7d.divide(true_sig_pcnt, axis=0),rsuffix='/true')
print('=====================',c)
display(kde200_sig_pcnt_7d)