In [None]:
import pandas as pd
import geopandas as gp
from shapely.geometry import Point, Polygon, box

In [None]:
import numpy as np

In [None]:
%matplotlib inline

# get grids

In [None]:
def get_grids(shape, grid_size=200, crs=None):
    """Tile the extent of ``shape`` with square cells and return them as a GeoDataFrame.

    Parameters
    ----------
    shape : tuple, shapely LineString or shapely Polygon
        Either a ``(lon_min, lat_min, lon_max, lat_max)`` bounding-box tuple,
        a closed ``LineString`` (it is converted to a ``Polygon``), or a
        ``Polygon``.
    grid_size : number, default 200
        Edge length of each cell, expressed in the coordinate units of ``shape``.
    crs : optional
        Coordinate reference system attached to the result when not ``None``.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per cell with a ``geometry`` column (the cell box) and a
        ``cxcy`` column (the ``(x, y)`` tuple of the cell centroid). For a
        LineString/Polygon input only cells intersecting the polygon are
        kept; for a bounding-box tuple every cell is kept.

    Raises
    ------
    ValueError
        If ``grid_size`` is not positive, the tuple does not have 4 items, or
        the LineString is not closed.
    TypeError
        If ``shape`` is none of the supported types.
    """
    from shapely.geometry import LineString, Polygon, box

    if grid_size <= 0:
        raise ValueError('grid_size must be positive, got %r' % (grid_size,))

    # Fail loudly on unsupported input: previously a message was printed and
    # the function then crashed on the unbound lon_min a few lines later.
    do_intersect = False
    if isinstance(shape, tuple):
        if len(shape) != 4:
            raise ValueError('shape is a tuple, but its len != 4')
        lon_min, lat_min, lon_max, lat_max = shape
    elif isinstance(shape, LineString):
        if not shape.is_closed:
            raise ValueError('shape is LineString but not closed, which is not supported here')
        lon_min, lat_min, lon_max, lat_max = shape.bounds
        shape = Polygon(shape)
        do_intersect = True
    elif isinstance(shape, Polygon):
        lon_min, lat_min, lon_max, lat_max = shape.bounds
        do_intersect = True
    else:
        raise TypeError('shape is not bbox tuple, closed LineString or Polygon')

    # The stop value of a stepped range is exclusive, so stopping at the
    # maximum would leave a strip of up to one grid_size along the max edges
    # without any cell. Stopping one step later makes the last edge >= max.
    lon_edges = np.arange(lon_min, lon_max + grid_size, grid_size)
    lat_edges = np.arange(lat_min, lat_max + grid_size, grid_size)

    # Row-major over latitude, then longitude (same cell order as before).
    grids_poly = []
    for j in range(len(lat_edges) - 1):
        for i in range(len(lon_edges) - 1):
            cell = box(lon_edges[i], lat_edges[j], lon_edges[i + 1], lat_edges[j + 1])
            if do_intersect and not cell.intersects(shape):
                continue
            grids_poly.append(cell)

    # Name the geometry column at construction time instead of renaming
    # column 0 afterwards, so the active geometry column is set explicitly.
    grids = gp.GeoDataFrame({'geometry': grids_poly}, crs=crs)
    grids['cxcy'] = grids.geometry.apply(lambda cell: cell.centroid.coords[0])
    return grids

In [None]:
# City boundary line, reprojected to EPSG:3559 right after loading.
# NOTE(review): presumably a projected CRS so that grid_size is a linear
# distance rather than degrees — confirm the unit.
cityline = (
    gp.read_file('data/open-baltimore/raw/Baltcity_20Line/baltcity_line.shp')
    .to_crs(epsg=3559)
)

In [None]:
# Tile the first geometry of the boundary layer with square cells of
# grid_size (in the units of cityline's CRS), then report and draw them.
grid_size = 200
grids = get_grids(shape=cityline.geometry[0], grid_size=grid_size)
print(grids.shape)
grids.plot()

In [None]:
# get_grids was called without `crs`, so tag the cells with the boundary's CRS
# here; the spatial join with the crime points needs both layers to agree.
grids.crs= cityline.crs

# crime data

In [None]:
# Load the raw incident table and attach one shapely Point per row, built from
# the Longitude/Latitude columns. Rows with missing coordinates still get a
# Point here; they are filtered out when the GeoDataFrame is created.
crimes_df = pd.read_csv('data/open-baltimore/raw/BPD_Part_1_Victim_Based_Crime_Data.csv')
crimes_df['geometry'] = [
    Point(lon, lat)
    for lon, lat in zip(crimes_df.Longitude, crimes_df.Latitude)
]

In [None]:
# Keep only rows that have a longitude and only the columns used later, then
# declare the coordinates as EPSG:4326.
crimes_4326 = gp.GeoDataFrame(
    crimes_df.loc[
        crimes_df.Longitude.notnull(),
        ['geometry', 'CrimeDate', 'CrimeTime', 'CrimeCode', 'Weapon'],
    ]
)
crimes_4326.crs = {'init': 'epsg:4326'}

In [None]:
# Reproject the incidents to EPSG:3559, the same CRS the boundary line (and
# therefore the grid) was converted to.
crimes = crimes_4326.to_crs(epsg=3559)

In [None]:
# Store each incident's projected coordinates as a single (x, y) tuple; the
# KDE helpers read this column through their `pt_col` argument.
crimes['lonlat'] = crimes.geometry.map(lambda pt: pt.coords[0])

In [None]:
# Parse the date strings (month/day/4-digit-year) into datetimes so they can
# be used as a sortable, sliceable index.
crimes['CrimeDate'] = pd.to_datetime(crimes['CrimeDate'], format='%m/%d/%Y')

In [None]:
# Index the incidents by date and sort chronologically. reset_index() first
# moves the previous row labels into a regular 'index' column, which is kept.
crimes = crimes.reset_index().set_index('CrimeDate').sort_index()

In [None]:
# Distinct incident dates, in index order (the index was sorted above).
dates = crimes.index.unique()

In [None]:
# Split the distinct dates at 2014-01-01: dates_bw (before) feeds bandwidth
# selection below. NOTE(review): dates_eval (on/after) is not referenced in
# the visible cells — presumably intended for the evaluation run.
dates_bw = dates[dates<'2014-01-01']
dates_eval = dates[dates>='2014-01-01']

# KDE

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

In [None]:
def bandwidth_selection(data, pt_col='lonlat', bw_choice=None, cv=20):
    """Pick a KernelDensity bandwidth by cross-validated grid search.

    data: frame whose ``pt_col`` column holds one coordinate tuple per row.
    pt_col: name of the column with the points.
    bw_choice: candidate bandwidths; when None, 30 evenly spaced values
        between 10 and 1000 are tried.
    cv: passed through to GridSearchCV as its ``cv`` argument.

    Prints the best parameter dict and returns the best bandwidth.
    """
    candidates = np.linspace(10, 1000, 30) if bw_choice is None else bw_choice
    searcher = GridSearchCV(KernelDensity(), {'bandwidth': candidates}, cv=cv)
    searcher.fit(data[pt_col].tolist())
    best = searcher.best_params_
    print(best)
    return best['bandwidth']

In [None]:
# Incidents from the first distinct date through the 366th (label slice, both
# ends inclusive). NOTE(review): crimes_slice is not used by any later visible
# cell.
crimes_slice = crimes.loc[dates[0]: dates[365]]

In [None]:
# Choose the KDE bandwidth from the incidents on the first 10 distinct dates
# before 2014-01-01 only (default candidate grid and cv of bandwidth_selection).
bw = bandwidth_selection(crimes.loc[dates_bw[:10]])

In [None]:
def rolling_window(data, pt_col='lonlat', time_window=60, verbose=True):
    """Yield ``(train, test)`` frames for a rolling-window experiment.

    Parameters
    ----------
    data : pd.DataFrame
        Index is a DatetimeIndex and the frame is sorted by index.
    pt_col : str
        Unused here; kept so the signature matches the other KDE helpers.
    time_window : int, default 60
        Number of leading distinct dates skipped before the first test date,
        and length in calendar days of the training window.
    verbose : bool
        When true, print the total number of experiments once.

    Yields
    ------
    (train, test) : tuple of pd.DataFrame
        ``test`` holds every row on the test date; ``train`` holds every row
        from ``time_window`` days before the test date up to the day before.

    Raises
    ------
    ValueError
        On first iteration, if ``data`` has no more than ``time_window``
        distinct dates.
    """
    dates = data.index.unique()
    if len(dates)<=time_window:
        raise ValueError('len of dates (%d) is less than time_window (%d)' % (len(dates), time_window))
    num_experiment = len(dates)-time_window
    if verbose:
        print('total number of experiment:', num_experiment)
    one_day = pd.Timedelta(days=1)
    for i in range(num_experiment):
        test_date = dates[i+time_window]
        train_start_date = test_date - pd.Timedelta(days=time_window)
        train_end_date = test_date - one_day

        train = data.loc[train_start_date:train_end_date]
        # Slice rather than scalar lookup: data.loc[test_date] collapses to a
        # Series when exactly one row falls on that date, which breaks callers
        # that expect a frame (shape, index.unique(), spatial join).
        test = data.loc[test_date:test_date]
        yield train, test

In [None]:
def kde_rolling(data, grids, bw, pt_col='lonlat', return_test_dates=True, verbose=True):
    """Fit a KDE on each rolling training window and score it on the grid.

    Parameters
    ----------
    data : GeoDataFrame
        Incidents indexed by a sorted DatetimeIndex, with a ``pt_col`` column
        of coordinate tuples; passed to ``rolling_window`` and ``gp.sjoin``.
    grids : GeoDataFrame
        Grid cells with a ``cxcy`` column of centroid tuples. It is not
        modified; a copy is extended and returned.
    bw : float
        Bandwidth handed to ``KernelDensity``.
    pt_col : str
        Name of the coordinate column in ``data``.
    return_test_dates : bool
        When true, also return the list of test-date strings.
    verbose : bool
        When true, print window shapes and the test date per iteration.

    Returns
    -------
    GeoDataFrame, or (GeoDataFrame, list of str)
        The grid with two extra columns per test date ``YYYY-MM-DD``:
        ``density_<date>`` (exp of ``KernelDensity.score_samples`` at the cell
        centroids) and ``num_crimes_<date>`` (number of test-date incidents
        spatially joined to the cell, 0 where none).
    """
    # Work on a private copy: the first column assignment below would
    # otherwise land in the caller's frame.
    grids = grids.copy()
    centroids = grids['cxcy'].tolist()
    # Only the cell geometry is needed for the spatial join; joining against
    # the full frame drags along every column added in earlier iterations.
    cells = grids[[grids.geometry.name]]

    test_dates = []
    for train, test in rolling_window(data, pt_col=pt_col, verbose=verbose):
        test_date_str = test.index.unique()[0].strftime('%Y-%m-%d')
        test_dates.append(test_date_str)
        if verbose:
            print(train.shape, test.shape, test_date_str)

        # Density estimate from the training window, evaluated per cell.
        kde = KernelDensity(bandwidth=bw)
        kde.fit(train[pt_col].tolist())
        grids['density_'+test_date_str] = np.exp(kde.score_samples(centroids))

        # Count test-date incidents per cell. size() does not rely on any
        # particular column of `data`, and reindexing fills only this column
        # with 0 instead of calling fillna(0) on the whole frame.
        test_in_grids = gp.sjoin(test, cells)
        counts = test_in_grids.groupby('index_right').size()
        grids['num_crimes_'+test_date_str] = counts.reindex(grids.index, fill_value=0)

    if return_test_dates:
        return grids, test_dates
    return grids

In [None]:
# Run the rolling KDE on the incidents of the first 70 distinct dates. With
# rolling_window's default time_window of 60 that leaves 70 - 60 = 10 test
# dates. A copy of `grids` is passed so the original grid frame stays clean.
grids_with_result, test_dates = kde_rolling(crimes.loc[dates[:70]], grids.copy(), bw, return_test_dates=True, verbose=False)

# evaluation

In [None]:
# Hit-rate curve per test date: rank the cells by predicted density (highest
# first) and record the cumulative share of that day's crimes captured by the
# top 10%, 20%, ..., 100% of cells.
#
# NOTE(review): `idx_for_auc` was not defined anywhere in the notebook, so this
# cell raised NameError on a fresh kernel. It is reconstructed here as the last
# row position of each decile of the ranked cells, inferred from the
# '10%'..'100%' labels below — confirm this matches the original intent.
n_grids = len(grids_with_result)
idx_for_auc = [int(np.ceil(n_grids * k / 10.0)) - 1 for k in range(1, 11)]
decile_labels = ['%d0%%' % (i+1) for i in range(10)]

aucs = []
for test_date_str in test_dates:
    # Sort into a local frame instead of reordering grids_with_result in place.
    ranked = grids_with_result.sort_values('density_'+test_date_str, ascending=False)
    crimes_per_cell = ranked['num_crimes_'+test_date_str]
    # All-NaN for a date with no crimes inside the grid (0 / 0).
    hit_rate = crimes_per_cell.cumsum()/crimes_per_cell.sum()
    auc = hit_rate.iloc[idx_for_auc]
    auc.index = decile_labels
    aucs.append(auc)

In [None]:
# NOTE(review): same expression as `merge_aucs` in the next cell, and
# merge_aucs2 is not referenced by any later visible cell.
merge_aucs2 = pd.concat(aucs, axis=1)

In [None]:
# Put the per-date hit-rate Series side by side: one column per test date,
# rows labelled '10%' .. '100%'.
merge_aucs = pd.concat(aucs,axis=1)

In [None]:
# Hit rate averaged across the test dates at each '10%'..'100%' label.
merge_aucs.mean(axis=1).plot()

In [None]:
# One hit-rate curve per test date.
merge_aucs.plot()