In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.exp_helper import *

In [3]:
from src.model.bsln_kde import KDE
from src.model.bsln_bower import Bower
from collections import defaultdict
import numpy as np
from src.utils.metric_single_num import metrics, hit_rate, search_efficient_rate, prediction_accuracy_index

In [4]:
import src.utils.spatial_unit

from src.utils.spatial_unit import *

In [5]:
# Experiment-wide settings shared by the data loader, the rollers and the models.
grid_size = 50  # selects the 'grid_<size>' spatial unit and is passed to Bower (units not shown here -- TODO confirm)
train_tw = 60   # past-window length given to the rollers (tw_past) and to the models (tw); presumably days -- TODO confirm
verbose = 0     # verbosity level forwarded to the models

In [6]:
# Data source for the Bower/KDE baselines: burglary events on the chosen grid.
# Burglary is used both as the input feature (x) and as the prediction target (y).
d_bower = CompileData(spu_name='grid_%d' % grid_size)
d_bower.set_x(
    ['crime'],
    category_groups={'crime': [['burglary']]},
    by_category=False,
)
d_bower.set_y('crime/burglary')



In [7]:
# Rolling windows. `tr_*` are handed to get_pred as the train roller and
# `er_*` as the evaluation roller. Both end on 2017-06-30; they differ in
# start date and in the prediction-window length tw_pred (2 vs 7,
# presumably days -- TODO confirm against Rolling).
tr_bower_2 = Rolling(
    rsd='2015-07-01', red='2017-06-30',
    rstep=1, tw_past=train_tw, tw_pred=2,
)
er_bower_2 = Rolling(
    rsd='2016-07-01', red='2017-06-30',
    rstep=1, tw_past=train_tw, tw_pred=2,
)

tr_bower_7 = Rolling(
    rsd='2015-07-01', red='2017-06-30',
    rstep=1, tw_past=train_tw, tw_pred=7,
)
er_bower_7 = Rolling(
    rsd='2016-07-01', red='2017-06-30',
    rstep=1, tw_past=train_tw, tw_pred=7,
)

In [8]:
# Baseline models. Both use the same past window (train_tw); they differ in
# bandwidth (400 for Bower, 200 for the KDE).
bower = Bower(
    grid_size,
    bw=400,
    tw=train_tw,
    verbose=verbose,
)
kde200 = KDE(
    bw=200,
    tw=train_tw,
    verbose=verbose,
)

In [9]:
# Metrics applied by get_eval to every (period, model) pair, in this order.
evaluators = [
    hit_rate,
    search_efficient_rate,
    prediction_accuracy_index,
]

In [41]:
def get_pred(compile_data, train_roller, eval_roller, kde, bower, refit=False, 
             x_setting='time_indexed_points', y_setting='event_cnt', verbose=0, debug=False):
    """Fit the KDE and Bower baselines on each evaluation window and predict per grid cell.

    For every window produced by ``eval_roller.roll()`` the function loads the
    past-window data and the true target with ``data_for_fit``, fits both
    models on the past-window data, and predicts at every grid centre.

    Parameters
    ----------
    compile_data : object exposing ``spu.Cen_coords`` (a Series of coordinate
        strings) and accepted by ``data_for_fit``.
    train_roller : currently unused. It was only needed by a refit step that
        is no longer part of this function; kept so existing calls still work.
    eval_roller : object whose ``roll()`` yields
        ``(past_sd, past_ed, pred_sd, pred_ed)`` tuples.
    kde, bower : models exposing ``fit(x)`` and ``predict(grid_centers)``;
        ``predict`` must return something with a ``.values`` attribute.
    refit : currently unused (see ``train_roller``). Both models are always
        fitted on each window's past data.
    x_setting, y_setting : forwarded unchanged to ``data_for_fit``.
    verbose : 0 is silent, >0 announces each fit, >1 also announces prediction.
        The "N periods are done" progress line is printed regardless.
    debug : when true, stop after the first three periods.

    Returns
    -------
    pandas.DataFrame indexed by a period description string, with columns
    ``true_y``, ``kde200`` and ``bower``. The KDE column is always named
    ``kde200`` (whatever model is passed) because downstream cells look it up
    under that name.
    """
    # NOTE(review): eval() executes whatever text is stored in Cen_coords. That is
    # acceptable for a trusted, locally generated grid file; if the file can come
    # from elsewhere, switch to ast.literal_eval after confirming the stored format.
    grid_centers = compile_data.spu.Cen_coords.apply(lambda x: eval(x))

    pred_res = defaultdict(dict)

    for i, dates in enumerate(eval_roller.roll()):
        past_sd, past_ed, pred_sd, pred_ed = dates
        period = 'X: %s~%s -> Y: %s~%s' % (past_sd, past_ed, pred_sd, pred_ed)

        eval_x, eval_y = data_for_fit(compile_data, x_setting=x_setting, y_setting=y_setting,
                                      dates=dates, verbose=verbose)
        pred_res[period]['true_y'] = eval_y

        if verbose > 0:
            print('fitting kde, bower on X: %s~%s' % (past_sd, past_ed))
        # Fit the model that was passed in. The previous code fitted the global
        # `kde200` here but predicted with `kde`, which only worked when the
        # caller happened to pass that same global object.
        kde.fit(eval_x)
        bower.fit(eval_x)

        if verbose > 1:
            print('predicting for each grid_center')

        pred_res[period]['kde200'] = kde.predict(grid_centers).values
        pred_res[period]['bower'] = bower.predict(grid_centers).values

        periods_done = i + 1  # i is zero-based
        if periods_done % 5 == 0:
            print('%d periods are done' % periods_done)
        if i == 2 and debug:
            break
    return pd.DataFrame.from_dict(pred_res, 'index')

In [11]:
def get_eval(pred_res):
    """Score every model's predictions for every period with the global metrics.

    For each row of ``pred_res`` (one period) and each column other than
    ``true_y`` (one model), the cells whose predicted value lies strictly above
    that model's 80th percentile are marked as hotspots. Every function in the
    module-level ``evaluators`` list is then called as
    ``metric(true_y, hotspot_mask, d_bower.spu)``; each score is printed and stored.

    Returns a nested mapping ``{model_name: {period: [score, ...]}}`` where the
    scores appear in the same order as ``evaluators``.
    """
    scores = defaultdict(lambda: defaultdict(list))
    for period, record in pred_res.iterrows():
        observed = record.true_y
        model_outputs = record.drop('true_y')
        for model_name, predicted in model_outputs.items():
            cutoff = np.percentile(predicted, 80)
            in_hotspot = predicted > cutoff
            for metric in evaluators:
                value = metric(observed, in_hotspot, d_bower.spu)
                print(model_name, metric.__name__, value)
                scores[model_name][period].append(value)
    return scores

In [12]:
def bnia_stats(pred_res, data, xday):
    """Write per-period hotspot flags for both models under three threshold rules.

    Three rules turn a vector of predicted values into a boolean mask:
    ``above_mean`` (value > mean), ``top20`` (value > 80th percentile) and
    ``above_mean_std`` (value > mean + std). For each rule and each period the
    ``kde200`` and ``bower`` masks are wrapped in a Series indexed by
    ``data.spu.index``, passed through ``grid2nbh`` and stored as lists. One CSV
    per rule is written to ``exp_res/bower_<xday>day_bnia_<rule>_hotspots.csv``.

    Returns None; the CSV files are the only output.
    """
    rules = [
        ('above_mean', lambda v: v > v.mean()),
        ('top20', lambda v: v > np.percentile(v, 80)),
        ('above_mean_std', lambda v: v > (v.mean() + v.std())),
    ]
    for rule_name, is_hot in rules:
        records = []
        for period, record in pred_res.iterrows():
            kde_flags = pd.Series(is_hot(record.kde200), index=data.spu.index, name='stat')
            kde_by_nbh = grid2nbh(kde_flags).values.tolist()

            bower_flags = pd.Series(is_hot(record.bower), index=data.spu.index, name='stat')
            bower_by_nbh = grid2nbh(bower_flags).tolist()

            records.append({'period': period, 'kde200': kde_by_nbh, 'bower': bower_by_nbh})
        table = pd.DataFrame(records).set_index('period')
        table.to_csv('exp_res/bower_%dday_bnia_%s_hotspots.csv' % (xday, rule_name))

In [42]:
# 2-day horizon: predict, then score.
# NOTE(review): debug=True makes get_pred stop after the first three evaluation
# periods, so everything derived from pred_res_2d covers three periods only.
# refit=True is accepted by get_pred but not read by it.
pred_res_2d = get_pred(
    d_bower, tr_bower_2, er_bower_2, kde200, bower,
    refit=True,
    x_setting='time_indexed_points',
    y_setting='event_cnt',
    verbose=0,
    debug=True,
)
eval_res_2d = get_eval(pred_res_2d)

kde200 hit_rate 0.7
kde200 search_efficient_rate 0.5836068990672711
kde200 prediction_accuracy_index 3.5000364754311915
bower hit_rate 0.7
bower search_efficient_rate 0.5836068990672711
bower prediction_accuracy_index 3.5000364754311915
kde200 hit_rate 0.5581395348837209
kde200 search_efficient_rate 0.5002344849148038
kde200 prediction_accuracy_index 2.79072675781889
bower hit_rate 0.5116279069767442
bower search_efficient_rate 0.45854827783857016
bower prediction_accuracy_index 2.5581661946673164
kde200 hit_rate 0.6170212765957447
kde200 search_efficient_rate 0.604450002605388
kde200 prediction_accuracy_index 3.0851385345746065
bower hit_rate 0.5957446808510638
bower search_efficient_rate 0.5845511482254697
bower prediction_accuracy_index 2.983573935059743


In [55]:

# Persist the raw 2-day results, one pickle per column (periods become columns).
# The Bower predictions and the true counts are stored sparse with 0 as the
# fill value; the KDE predictions are stored dense.
# NOTE(review): DataFrame.to_sparse only exists in pandas < 1.0.
(
    pd.DataFrame.from_dict(pred_res_2d['bower'].to_dict())
    .to_sparse(fill_value=0)
    .to_pickle('exp_res/bower_2day_raw_bower.pickle')
)
(
    pd.DataFrame.from_dict(pred_res_2d['true_y'].to_dict())
    .to_sparse(fill_value=0)
    .to_pickle('exp_res/bower_2day_raw_y.pickle')
)
(
    pd.DataFrame.from_dict(pred_res_2d['kde200'].to_dict())
    .to_pickle('exp_res/bower_2day_raw_kde200.pickle')
)

In [53]:
# 7-day horizon: predict, then score.
# NOTE(review): debug=True makes get_pred stop after the first three evaluation
# periods, so everything derived from pred_res_7d covers three periods only.
# refit=True is accepted by get_pred but not read by it.
pred_res_7d = get_pred(
    d_bower, tr_bower_7, er_bower_7, kde200, bower,
    refit=True,
    x_setting='time_indexed_points',
    y_setting='event_cnt',
    verbose=0,
    debug=True,
)
eval_res_7d = get_eval(pred_res_7d)

kde200 hit_rate 0.6691176470588235
kde200 search_efficient_rate 1.8967224219686312
kde200 prediction_accuracy_index 3.3456231015151094
bower hit_rate 0.6617647058823529
bower search_efficient_rate 1.8758793184305143
bower prediction_accuracy_index 3.308858012487471
kde200 hit_rate 0.6774193548387096
kde200 search_efficient_rate 1.7508206972018134
kde200 prediction_accuracy_index 3.387132072997927
bower hit_rate 0.6370967741935484
bower search_efficient_rate 1.6466051795112293
bower prediction_accuracy_index 3.1855170686528127
kde200 hit_rate 0.65625
kde200 search_efficient_rate 1.7508206972018134
kde200 prediction_accuracy_index 3.2812841957167422
bower hit_rate 0.65625
bower search_efficient_rate 1.7536534446764092
bower prediction_accuracy_index 3.286593162839248


In [54]:

# Persist the raw 7-day results, one pickle per column (periods become columns).
# The Bower predictions and the true counts are stored sparse with 0 as the
# fill value; the KDE predictions are stored dense.
# NOTE(review): DataFrame.to_sparse only exists in pandas < 1.0.
(
    pd.DataFrame.from_dict(pred_res_7d['bower'].to_dict())
    .to_sparse(fill_value=0)
    .to_pickle('exp_res/bower_7day_raw_bower.pickle')
)
(
    pd.DataFrame.from_dict(pred_res_7d['true_y'].to_dict())
    .to_sparse(fill_value=0)
    .to_pickle('exp_res/bower_7day_raw_y.pickle')
)
(
    pd.DataFrame.from_dict(pred_res_7d['kde200'].to_dict())
    .to_pickle('exp_res/bower_7day_raw_kde200.pickle')
)

In [26]:
# Sanity check: fraction of grid cells whose Bower prediction for the first
# period is identical in the 2-day and 7-day runs. Both runs fit on the same
# past window for that period, so 1.0 is the expected value.
(
    pred_res_2d['bower'].values[0] == pred_res_7d['bower'].values[0]
).mean()

1.0

In [24]:
# Display the 7-day prediction table: one row per period, each cell holding a
# per-grid-cell array.
# NOTE(review): the stored output shows a single period, while the debug run
# above yields three. The execution counts are out of order, so this output may
# be stale -- re-run the notebook top to bottom to confirm.
pred_res_7d

Unnamed: 0,true_y,kde200,bower
X: 2016-07-01 00:00:00~2016-08-29 23:59:59 -> Y: 2016-08-30 00:00:00~2016-09-05 23:59:59,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[8.582278140477092e-99, 1.7240350494101372e-10...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [17]:
# pd.DataFrame(eval_res_2d).to_csv('tmp/exp_res/bower_2day_eval.csv')
# pd.DataFrame(eval_res_7d).to_csv('tmp/exp_res/bower_7day_eval.csv')