In [1]:
import numpy as np
import pandas as pd
import os
import datetime
from geopy import distance as geo
import dask.dataframe as dd
from dask.multiprocessing import get
# from scipy.spatial import distance

In [2]:
# Load the crime records; the first line of the file is the header row.
crimes = pd.read_csv('./crimes_data_for_matching.csv')

In [3]:
# Preview the first rows of the crime records as loaded from the CSV.
crimes.head()

Unnamed: 0,uid,case_number,date,primary_type,latitude,longitude
0,10364596,HY556489,2015-12-31 18:45:00-06,BATTERY,41.894898,-87.759702
1,10364616,HY556411,2015-12-31 16:28:00-06,NARCOTICS,41.964437,-87.662375
2,10364656,HY556590,2015-12-31 22:05:00-06,BATTERY,41.905562,-87.707614
3,10364664,HY556241,2015-12-31 15:15:00-06,ASSAULT,41.885654,-87.754346
4,10364742,HY556615,2015-12-31 23:15:00-06,BATTERY,41.679862,-87.621982


In [4]:
# Load the call records; the first line of the file is the header row.
calldata = pd.read_csv('./call_data_for_matching.csv')

In [5]:
# Preview the first rows of the call records as loaded from the CSV.
calldata.head()

Unnamed: 0,event_number,x,y,entrydate,inittype,fintype
0,1600100965,-87.595175,41.765082,2016-01-01 00:54:11-06,SUSP,SUSPER
1,1600100966,-87.643028,41.765225,2016-01-01 00:54:15-06,CELLHU,CELLHU
2,1600100967,-87.753893,41.929623,2016-01-01 00:54:15-06,SHOTSF,SHOTSF
3,1600100968,-87.618541,41.889492,2016-01-01 00:54:18-06,CELLHU,CELLHU
4,1600100969,-87.638053,41.923793,2016-01-01 00:54:22-06,CHECWB,CHECWB


In [None]:
# Parse the timestamp strings into real datetimes. The raw values carry a UTC
# offset (e.g. "-06"), and a column may mix offsets across a daylight-saving
# change, so utc=True normalises everything to one tz-aware UTC column that can
# be compared safely between the two tables. Unparseable values become NaT.
calldata['entrydate'] = pd.to_datetime(calldata['entrydate'], errors='coerce', utc=True)
crimes['date'] = pd.to_datetime(crimes['date'], errors='coerce', utc=True)

In [None]:
# Check the call records again after the timestamp column was parsed.
calldata.head()

In [None]:
# Check the crime records again after the timestamp column was parsed.
crimes.head()

In [None]:
# NOTE(review): the original note states that all timestamps were translated to
# UTC by pd.to_datetime; whether that happens depends on the pandas version -- confirm.
# That is fine as long as both tables share the same time reference.
##################################
# Inspect the current call-data column names before renaming them for further work.
calldata.columns 

In [None]:
# Give the call coordinates the same names the crime table uses. Renaming by
# name (x -> longitude, y -> latitude) does not depend on the column order or
# count of the CSV, and re-running the cell is a harmless no-op.
calldata = calldata.rename(columns={'x': 'longitude', 'y': 'latitude'})

In [None]:
# Confirm the call records now expose 'longitude' and 'latitude' columns.
calldata.head()

In [None]:
# A record can only be matched spatially when both coordinates are present,
# so discard every row where either of them is missing.
crimes = crimes.dropna(axis=0, how='any', subset=['latitude', 'longitude'])
calldata = calldata.dropna(axis=0, how='any', subset=['latitude', 'longitude'])

In [None]:
# The call-type columns are not used by the matching, so drop them.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
calldata = calldata.drop(columns=['inittype', 'fintype'], errors='ignore')

In [None]:
# Total number of non-null cells across all remaining columns
# (count() is per column, sum() adds the columns up) -- this is not the row count.
calldata.count().sum()

In [None]:
# Work on a 5% sample of the calls (without replacement) to keep the run time
# manageable. The fixed random_state makes the sample, and therefore every
# downstream result, reproducible across kernel restarts.
usedata = calldata.sample(frac=0.05, replace=False, random_state=42)


In [None]:
# Wrap the sampled calls in a dask DataFrame split into 12 partitions so the
# row-wise matching can be applied partition by partition.
ddata = dd.from_pandas(usedata, npartitions=12)

In [None]:
# --- Matching parameters -------------------------------------------------
# Weight applied to the time gap (in hours) when it is added to the distance.
time_penalty = 0
# Half-width of the time window around a call, in seconds.
time_window = 3600
# Largest accepted distance between a call and its matched crime.
distance_cutoff = 300
# Largest accepted value of distance + time_penalty * hours apart.
combine_cutoff = 300

# The same time window expressed as a timedelta for datetime arithmetic.
time_diff = datetime.timedelta(seconds=time_window)

In [None]:
def _no_match(event_number):
    """Build the result row used when a call has no acceptable crime match."""
    return pd.Series([event_number, np.nan, np.nan, np.nan])


def find_the_matching(calldata, crimesdata, time_penalty, time_diff, distance_cutoff, combine_cutoff,
                      bbox_margin=0.05):
    """Find the crime record that best matches a single call record.

    Candidates are the crimes whose ``date`` lies within ``time_diff`` of the
    call's ``entrydate`` and whose coordinates lie inside a square of
    ``bbox_margin`` degrees around the call. Among those, the crime with the
    smallest score ``distance_in_metres + |time_penalty * hours_apart|`` wins.

    Parameters
    ----------
    calldata : pandas.Series
        One call row with ``event_number``, ``latitude``, ``longitude`` and
        ``entrydate`` entries.
    crimesdata : pandas.DataFrame
        Crime table with ``uid``, ``date``, ``latitude`` and ``longitude`` columns.
    time_penalty : float
        Weight per hour of time difference added to the distance.
    time_diff : datetime.timedelta
        Half-width of the accepted time window around the call.
    distance_cutoff : float
        Matches farther away than this (metres) are rejected.
    combine_cutoff : float
        Matches whose distance plus time penalty exceeds this are rejected.
    bbox_margin : float, optional
        Half-width in degrees of the coarse latitude/longitude pre-filter.

    Returns
    -------
    pandas.Series
        ``[event_number, crime uid, distance in metres, time difference in minutes]``;
        the last three entries are NaN when no acceptable match exists.
    """
    event_number = calldata['event_number']
    lat = calldata['latitude']
    lon = calldata['longitude']
    entry = calldata['entrydate']

    # A call without usable coordinates can never be matched.
    if pd.isna(lat) or pd.isna(lon):
        return _no_match(event_number)

    # Discard coordinates far outside the plausible study area.
    if lat > 50 or lat < 30 or lon > -80 or lon < -100:
        return _no_match(event_number)

    # Cheap pre-filter (inclusive bounds) before the expensive geodesic distance.
    in_window = crimesdata['date'].between(entry - time_diff, entry + time_diff)
    in_box = (crimesdata['latitude'].between(lat - bbox_margin, lat + bbox_margin)
              & crimesdata['longitude'].between(lon - bbox_margin, lon + bbox_margin))
    candidates = crimesdata[in_window & in_box]

    # DataFrame.count() returns a Series, which cannot be used in a boolean
    # test; `.empty` is the correct check for "no candidate rows".
    if candidates.empty:
        return _no_match(event_number)

    def _distance_m(row):
        return geo.distance((row['latitude'], row['longitude']), (lat, lon)).m

    def _hours_apart(row):
        # total_seconds() is signed and complete; Timedelta.seconds alone is
        # wrong for negative differences (e.g. -10 min reports 85800 s).
        return abs((row['date'] - entry).total_seconds()) / 3600

    scores = candidates.apply(
        lambda row: _distance_m(row) + np.abs(time_penalty * _hours_apart(row)),
        axis=1,
    )
    best = candidates.loc[scores.idxmin()]

    geo_dis = _distance_m(best)
    # time diff in mins
    diff = _hours_apart(best) * 60

    if geo_dis > distance_cutoff:
        return _no_match(event_number)

    if geo_dis + diff / 60 * time_penalty > combine_cutoff:
        return _no_match(event_number)

    return pd.Series([event_number, best['uid'], geo_dis, diff])
                                                                                                                
 

In [None]:
# Match every sampled call against the crime table, partition by partition.
# The `get=` keyword of compute() was removed from dask; the multiprocessing
# scheduler is now selected by name through `scheduler=`.
res = ddata.map_partitions(
    lambda df: df.apply(
        lambda x: find_the_matching(x, crimes, time_penalty, time_diff, distance_cutoff, combine_cutoff),
        axis=1,
    )
).compute(scheduler='processes')

In [None]:
# calldata[['crimes_uid','distance','time_diff']]=calldata.apply(find_the_matching,args=[crimes,time_penalty,time_diff,distance_cutoff,combine_cutoff],axis =1)

In [None]:
# Display only the calls that received a match. The result is not assigned,
# so `res` itself still contains the unmatched (NaN) rows afterwards.
res.dropna()

In [None]:
# Preview the first rows of the matching result.
res.head()

In [None]:
# Persist the matching result table to ./out.csv.
res.to_csv('./out.csv')