In [1]:
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm 
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
ProgressBar().register()
import multiprocessing
nCPU = multiprocessing.cpu_count()
from datetime import datetime, timedelta

  import pandas.util.testing as tm


In [2]:
# Build the neighbour lookup table: one row per source census tract
# (`src_boro_c`) whose `nbr_boro_c` cell holds the list of every tract
# paired with it in the CSV.
ct_neighbor = (
    pd.read_csv('CensusTractsNeighborhood.csv')
    .groupby('src_boro_c')
    .nbr_boro_c
    .apply(list)
    .to_frame()
    .reset_index(drop=False)
)
ct_neighbor

Unnamed: 0,src_boro_c,nbr_boro_c
0,1000201,"[1000202, 1000600, 1001401]"
1,1000202,"[1000201, 1001001, 1001200, 1001401, 3002100]"
2,1000500,"[3000700, 3004700, 3005300]"
3,1000600,"[1000201, 1000800, 1001401, 1001600]"
4,1000700,"[1000900, 1001300, 1001502, 3000100, 3000301]"
...,...,...
2157,5030301,"[5023900, 5024700, 5025100, 5029102, 5029104, ..."
2158,5030302,"[5023100, 5023900, 5030301, 5031901, 5032300]"
2159,5031901,"[5022300, 5023100, 5030302, 5031902, 5032300]"
2160,5031902,"[5022300, 5023100, 5031901, 5032300]"


In [3]:
def add_dt_1(data):
    """Merge a stop-and-frisk row's date and time fields into one datetime.

    Args:
        data: Row-like object exposing the attributes
            ``stop_frisk1_XYTableToPoint_S_2`` (date part) and
            ``stop_frisk1_XYTableToPoint_S_3`` (time part).

    Returns:
        datetime: the two fields joined by a single space and parsed with
        the format ``%Y-%m-%d %H:%M:%S``.

    Raises:
        ValueError: if the joined text does not match that format.
    """
    date_text = str(data.stop_frisk1_XYTableToPoint_S_2)
    time_text = str(data.stop_frisk1_XYTableToPoint_S_3)
    stamp = ' '.join((date_text, time_text))
    return datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
        
# Load the raw stop-and-frisk records, discard rows with any missing value
# and repeated (date, time, tract) combinations, then reduce the frame to a
# parsed timestamp (`dt`) and an integer census-tract id (`ct`).
frisk = (
    pd.read_csv('stop_frisk_census_near_raw.csv')
    .dropna(how='any')
    .drop_duplicates(subset=[
        'stop_frisk1_XYTableToPoint_S_2',
        'stop_frisk1_XYTableToPoint_S_3',
        'geo_export_267875c8_5ee9_4cb_13',
    ])
    .assign(
        dt=lambda rows: rows.apply(add_dt_1, axis=1),
        ct=lambda rows: rows['geo_export_267875c8_5ee9_4cb_13'].apply(int),
    )
    .drop(columns=[
        'stop_frisk1_XYTableToPoint_S_2',
        'stop_frisk1_XYTableToPoint_S_3',
        'OBJECTID',
        'geo_export_267875c8_5ee9_4cb_13',
    ])
)
frisk

Unnamed: 0,dt,ct
0,2017-01-16 14:26:00,1003300
2,2017-02-08 11:10:00,1001300
3,2017-02-20 11:35:00,1004900
4,2017-02-21 13:20:00,1004900
5,2017-02-17 21:25:00,4142900
...,...,...
11614,2017-10-10 17:40:00,1016400
11615,2017-10-12 16:05:00,1016400
11616,2017-01-03 19:41:00,2021700
11617,2017-01-08 23:10:00,3034800


In [4]:
def add_dt_2(data):
    """Merge a crime row's ``Date`` and ``Time`` fields into one datetime.

    Args:
        data: Row-like object exposing ``Date`` and ``Time`` attributes.

    Returns:
        datetime: the two fields joined by a single space and parsed with
        the format ``%m/%d/%Y %H:%M:%S``.

    Raises:
        ValueError: if the joined text does not match that format.
    """
    date_text = str(data.Date)
    time_text = str(data.Time)
    stamp = ' '.join((date_text, time_text))
    return datetime.strptime(stamp, '%m/%d/%Y %H:%M:%S')

# Stack the personal- and property-crime files into one frame with a fresh
# 0..n-1 index, then drop rows with any missing value. The index is assigned
# before the dropna, so surviving rows keep their pre-drop labels (the index
# may therefore contain gaps).
crimes = pd.concat(
    [
        pd.read_csv('personal_near_sorted.csv'),
        pd.read_csv('property_near_sorted.csv'),
    ],
    ignore_index=True,
).dropna(how='any')
# Parse the textual Date/Time columns into a single datetime column.
crimes = crimes.assign(dt=crimes.apply(add_dt_2, axis=1))
crimes

Unnamed: 0,boro_ct2010,Date,Time,Type,dt
0,4011200,1/1/2017,0:01:00,PersonalCrime,2017-01-01 00:01:00
1,2014100,1/1/2017,0:01:00,PersonalCrime,2017-01-01 00:01:00
2,2004600,1/1/2017,0:01:00,PersonalCrime,2017-01-01 00:01:00
3,4033402,1/1/2017,0:01:00,PersonalCrime,2017-01-01 00:01:00
4,3038700,1/1/2017,18:00:00,PersonalCrime,2017-01-01 18:00:00
...,...,...,...,...,...
141082,2042400,6/2/2017,15:45:00,PropertyCrime,2017-06-02 15:45:00
141083,3051700,8/11/2017,20:15:00,PropertyCrime,2017-08-11 20:15:00
141084,1031703,9/26/2017,12:17:00,PropertyCrime,2017-09-26 12:17:00
141085,3052700,2/15/2017,16:00:00,PropertyCrime,2017-02-15 16:00:00


In [5]:
def get_frisk_counts__id(_dt, __ct, window_seconds=21600, stops=None):
    """Count stop-and-frisk records in one census tract shortly before a time.

    A record is counted when its ``dt`` is strictly earlier than ``_dt`` and
    strictly less than ``window_seconds`` before it (both bounds exclusive).

    Args:
        _dt: Reference timestamp (``datetime`` or pandas ``Timestamp``).
        __ct: Census-tract id matched against the ``ct`` column.
        window_seconds: Length of the look-back window in seconds.
            Defaults to 21600 (6 hours).
        stops: DataFrame with a datetime ``dt`` column and a ``ct`` column.
            Defaults to the notebook-level ``frisk`` frame.

    Returns:
        int: number of records of tract ``__ct`` inside the window.
    """
    if stops is None:
        stops = frisk
    # Timestamps of the records that belong to the requested tract.
    tract_times = stops.loc[stops['ct'] == __ct, 'dt']
    # Time elapsed between each record and the reference moment; the
    # comparison is done on the whole column at once instead of row by row.
    elapsed = _dt - tract_times
    in_window = (elapsed > pd.Timedelta(0)) & (elapsed < pd.Timedelta(seconds=window_seconds))
    # Convert the numpy integer to a plain Python int.
    return int(in_window.sum())

def get_frisk_counts__neighbors(_dt, _ct):
    """Sum the recent stop-and-frisk counts over the neighbours of a tract.

    Looks up ``_ct`` in the notebook-level ``ct_neighbor`` table and adds up
    ``get_frisk_counts__id(_dt, neighbor)`` for every tract in its list.

    Args:
        _dt: Reference timestamp forwarded to ``get_frisk_counts__id``.
        _ct: Census-tract id matched against ``src_boro_c``.

    Returns:
        The summed count, or 0 when ``_ct`` has no row in ``ct_neighbor``.
    """
    matches = ct_neighbor.query('src_boro_c == @_ct')
    if matches.empty:
        return 0
    # First matching row, column position 1: the list of neighbouring tracts.
    adjacent = matches.iloc[0, 1]
    return sum(get_frisk_counts__id(_dt, tract) for tract in adjacent)

# For every crime, count the stop-and-frisk records that precede it in the
# crime's own census tract and in that tract's neighbours, plus their sum.
counts_ct_list = []
counts_neighbors_list = []
counts_total_list = []
# itertuples() yields from an iterator with no length, so the row count is
# passed explicitly; otherwise tqdm cannot show progress or an ETA.
for crime in tqdm(crimes.itertuples(), total=len(crimes)):
    counts_ct = get_frisk_counts__id(crime.dt, crime.boro_ct2010)
    counts_neighbors = get_frisk_counts__neighbors(crime.dt, crime.boro_ct2010)
    counts_ct_list.append(counts_ct)
    counts_neighbors_list.append(counts_neighbors)
    counts_total_list.append(counts_ct + counts_neighbors)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
# Attach the per-crime counts as new columns. The lists were filled while
# iterating over `crimes` row by row, so they line up with its row order.
crimes = crimes.assign(
    count_ct=counts_ct_list,
    count_neighbours=counts_neighbors_list,
    count_total=counts_total_list,
)

In [8]:
# Persist the crimes table with its count columns; the DataFrame index is
# written as the first (unnamed) CSV column.
crimes.to_csv('crimes_pastfrisk_counts.csv', index=True)