# Develop the crime data

This notebook will create the base grid used to train and test the crime prediction model. 
The grid will be created on a 150m x 150m grid, within the bounds of the police precincts polygon.

For each grid cell, the number of crimes will be calculated, disaggregated by the characteristics of the victim. The crimes affecting the following groups will be counted:
- All Male and Female victims
- Male victims
- Female victimes
- Children
- Elderly
- Female Children

## Import the required libraries

In [1]:
import geopandas as gpd
import altair as alt
import pandas as pd
import numpy as np

In [2]:
## Import the crimes data and the police precincts shapefile

In [3]:
precint_footprint = gpd.read_file('..//data//Police Precincts.geojson')

import pickle
crime_data = None

with open('..//data.pickle', 'rb') as f:
    crime_data = pd.read_pickle(f)

# For Testing only use a subset of the data
# crime_data = crime_data.sample(frac=0.01, random_state=1)



In [4]:
# create an altair chart of the precinct footprints
alt.Chart(precint_footprint).mark_geoshape().encode(
    color='precinct:N'
).properties(
    width=500,
    height=500
)

## Create the base grid for analysis

In [5]:
# Create a grid of points across the precincts to use as the center of the crime clusters
# The size of the grid is n x n, where n is the number of points in each direction
from shapely.geometry import Point, Polygon
from rtree import index
# get the bounds of the precincts
min_x, min_y, max_x, max_y = precint_footprint.total_bounds

idx = index.Index()
for i, row in precint_footprint.iterrows():
    idx.insert(i, row.geometry.bounds)

grid_size = 100  # You can adjust this value
x_points = np.linspace(min_x, max_x, grid_size)
y_points = np.linspace(min_y, max_y, grid_size)

# Round the points to 4 decimal places
x_points = np.around(x_points, 4)
y_points = np.around(y_points, 4)

grid = [Point(x, y) for x in x_points for y in y_points]

# drop the points that are not within the precincts
def is_point_inside_precincts(point, precincts_gdf, idx):
    for i in idx.intersection(point.bounds):
        if point.within(precincts_gdf.iloc[i].geometry):
            return True
    return False

filtered_grid = [point for point in grid if is_point_inside_precincts(point, precint_footprint, idx)]
filtered_grid_df = pd.DataFrame([(point.x, point.y) for point in filtered_grid], columns=['Longitude', 'Latitude'])
#
del grid 
del filtered_grid

In [6]:
import itertools

def point_hour_day_combinations(points, hours, days_of_week):
    for point, hour, day in itertools.product(points, hours, days_of_week):
        yield point.x, point.y, hour, day

# Convert filtered grid points to a list of Point objects
points = [Point(lon, lat) for lon, lat in filtered_grid_df[['Longitude', 'Latitude']].values]

# Define hours and days_of_week
hours = range(24)
days_of_week = range(7)



In [7]:
# Only keep the M and F sex values
sexes = ['M', 'F']

crime_data = crime_data.loc[crime_data['VIC_SEX'].isin(sexes)]
# Re calculate the categories for the Sex column
crime_data['VIC_SEX'] = crime_data['VIC_SEX'].cat.remove_unused_categories()

del sexes
crime_data.head(5)

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,...,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,OCC,OCC_END
0,506547392,03/29/2018,20:30:00,,,32.0,2018-03-30,351,CRIMINAL MISCHIEF & RELATED OF,254.0,...,40.810877,-73.941064,"(40.810877241, -73.941064151)",PATROL BORO MAN NORTH,,,WHITE,F,2018-03-29,
3,280364018,06/09/2018,21:42:00,06/09/2018,21:43:00,10.0,2018-06-10,361,OFF. AGNST PUB ORD SENSBLTY &,639.0,...,40.75931,-73.994706,"(40.759310399, -73.994706072)",PATROL BORO MAN SOUTH,,25-44,WHITE HISPANIC,F,2018-06-09,06/09/2018
8,570490441,01/16/2018,14:30:00,01/16/2018,15:00:00,69.0,2018-01-16,344,ASSAULT 3 & RELATED OFFENSES,101.0,...,40.635516,-73.913278,"(40.635516265, -73.913277993)",PATROL BORO BKLYN SOUTH,,,BLACK,M,2018-01-16,01/16/2018
9,377132404,08/04/2018,22:15:00,,,44.0,2018-08-04,344,ASSAULT 3 & RELATED OFFENSES,101.0,...,40.82617,-73.916831,"(40.826169612, -73.916830709)",PATROL BORO BRONX,,45-64,WHITE HISPANIC,F,2018-08-04,
10,504303130,09/26/2018,18:20:00,09/26/2018,18:24:00,28.0,2018-09-26,106,FELONY ASSAULT,109.0,...,40.803905,-73.952037,"(40.803905237, -73.952036608)",PATROL BORO MAN NORTH,,25-44,BLACK,M,2018-09-26,09/26/2018


In [8]:
# filter crime_data columns to only include the columns we need
crime_data = crime_data[['CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'Longitude','Latitude', 'VIC_SEX','VIC_AGE_GROUP']]
crime_data.head(5)

# get the day of the week and the hour of the day from the date and time columns
crime_data['CMPLNT_FR_DT'] = pd.to_datetime(crime_data['CMPLNT_FR_DT'], format='%m/%d/%Y')
crime_data['CMPLNT_FR_TM'] = pd.to_datetime(crime_data['CMPLNT_FR_TM'], format='%H:%M:%S')
crime_data['day_of_week'] = crime_data['CMPLNT_FR_DT'].dt.dayofweek
crime_data['hour_of_day'] = crime_data['CMPLNT_FR_TM'].dt.hour

In [9]:
def nearest_grid_point(coord, grid_points):
    nearest_index = np.argmin(np.abs(coord - grid_points))
    return grid_points[nearest_index]

# Find the nearest grid point for the longitude and latitude coordinates using the defined function
crime_data['Longitude'] = crime_data['Longitude'].apply(nearest_grid_point, grid_points=x_points)
crime_data['Latitude'] = crime_data['Latitude'].apply(nearest_grid_point, grid_points=y_points)

# 'hour_of_day',

crime_counts = (
    crime_data.groupby(['Longitude', 'Latitude', 'day_of_week', 'hour_of_day', 'VIC_SEX'])
    .size()
    .reset_index(name='count')
)
crime_counts.head(5)

Unnamed: 0,Longitude,Latitude,day_of_week,hour_of_day,VIC_SEX,count
0,-74.2556,40.4961,0,0.0,F,30
1,-74.2556,40.4961,0,0.0,M,34
2,-74.2556,40.4961,0,1.0,F,22
3,-74.2556,40.4961,0,1.0,M,24
4,-74.2556,40.4961,0,2.0,F,17


In [10]:
crime_counts_pivoted = pd.pivot_table(crime_counts, values='count', index=['Longitude', 'Latitude','hour_of_day', 'day_of_week'], columns=['VIC_SEX'], fill_value=0)

crime_counts_pivoted = crime_counts_pivoted.reset_index()
# Remove the index 


del crime_counts
crime_counts_pivoted.head(5)

VIC_SEX,Longitude,Latitude,hour_of_day,day_of_week,F,M
0,-74.2556,40.4961,0.0,0,30,34
1,-74.2556,40.4961,0.0,1,35,47
2,-74.2556,40.4961,0.0,2,43,42
3,-74.2556,40.4961,0.0,3,39,40
4,-74.2556,40.4961,0.0,4,40,45


## Trees

In [11]:
# Count the trees close to each grid point
rows = None

trees = gpd.read_file('../data/2015 Street Tree Census - Tree Data.geojson', rows=rows)
trees = trees.to_crs(epsg=4326)

In [12]:
trees['longitude'] = trees['longitude'].astype(float)
trees['latitude'] = trees['latitude'].astype(float)

trees['Longitude'] = trees['longitude'].apply(nearest_grid_point, grid_points=x_points)
trees['Latitude'] = trees['latitude'].apply(nearest_grid_point, grid_points=y_points)

In [13]:
trees.head(5)

Unnamed: 0,nta,health,zipcode,latitude,nta_name,state,trnk_wire,y_sp,brnch_othe,root_grate,...,root_other,created_at,borocode,block_id,trnk_light,tree_dbh,root_stone,geometry,Longitude,Latitude
0,QN17,Fair,11375,40.723092,Forest Hills,New York,No,202756.768749,No,No,...,No,08/27/2015,4,348711,No,3,No,POINT (-73.84422 40.72309),-73.8459,40.7249
1,QN49,Fair,11357,40.794111,Whitestone,New York,No,228644.837379,No,No,...,No,09/03/2015,4,315986,No,21,Yes,POINT (-73.81868 40.79411),-73.8179,40.7927
2,BK90,Good,11211,40.717581,East Williamsburg,New York,No,200716.891267,No,No,...,No,09/05/2015,3,218365,No,3,No,POINT (-73.93661 40.71758),-73.9357,40.7164
3,BK90,Good,11211,40.713537,East Williamsburg,New York,No,199244.253136,No,No,...,No,09/05/2015,3,217969,No,10,Yes,POINT (-73.93446 40.71354),-73.9357,40.7122
4,BK37,Good,11215,40.666778,Park Slope-Gowanus,New York,No,182202.425999,No,No,...,No,08/30/2015,3,223043,No,21,Yes,POINT (-73.97598 40.66678),-73.975,40.6656


In [14]:
tree_counts = (
    trees.groupby(['Longitude', 'Latitude','health'])
    .size()
    .reset_index(name='tree_count')
)

In [15]:
tree_counts.head()

tree_counts_pivoted = pd.pivot_table(tree_counts, values='tree_count', index=['Longitude', 'Latitude'], columns=['health'], fill_value=0)
tree_counts_pivoted = tree_counts_pivoted.reset_index().rename(columns={'Good': 'good_tree_count', 'Fair': 'fair_tree_count', 'Poor': 'poor_tree_count'})

# Remove index name
tree_counts_pivoted.index.name = None


In [16]:
all_combinations = np.array(list(point_hour_day_combinations(points, hours, days_of_week)))

In [17]:
# Use the reindex method to index the matches directly
crime_counts_pivoted.set_index(['Longitude', 'Latitude', 'hour_of_day', 'day_of_week'], inplace=True)

matched_data = crime_counts_pivoted.reindex(
    pd.MultiIndex.from_arrays(all_combinations.T, names=crime_counts_pivoted.index.names),
    fill_value=0
)

# Reset the index of the matched_data DataFrame
matched_data.reset_index(inplace=True)

# Create the merged_df DataFrame from the matched_data DataFrame

In [18]:
matched_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589176 entries, 0 to 589175
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Longitude    589176 non-null  float64
 1   Latitude     589176 non-null  float64
 2   hour_of_day  589176 non-null  float64
 3   day_of_week  589176 non-null  float64
 4   F            589176 non-null  int64  
 5   M            589176 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 27.0 MB


In [19]:
# Add the tree counts to the matched_df DataFrame
matched_data = matched_data.merge(tree_counts_pivoted, on=['Longitude', 'Latitude'], how='left').fillna(0)

In [20]:
# Divide the F and M columns by 52*18 to get the average number of crimes per hour per day
# matched_data[['F', 'M']] = matched_data[['F', 'M']].div(52*18)
# convert the hour_of_day and day_of_week columns to int type
matched_data[['hour_of_day', 'day_of_week']] = matched_data[['hour_of_day', 'day_of_week']].astype(int)

#Convert the tree count columns to int type
matched_data[['good_tree_count', 'fair_tree_count', 'poor_tree_count']] = matched_data[['good_tree_count', 'fair_tree_count', 'poor_tree_count']].astype(int)

matched_data.head(5)

Unnamed: 0,Longitude,Latitude,hour_of_day,day_of_week,F,M,fair_tree_count,good_tree_count,poor_tree_count
0,-74.25,40.5004,0,0,0.0,0.001068,20,154,0
1,-74.25,40.5004,0,1,0.0,0.0,20,154,0
2,-74.25,40.5004,0,2,0.0,0.001068,20,154,0
3,-74.25,40.5004,0,3,0.001068,0.0,20,154,0
4,-74.25,40.5004,0,4,0.0,0.0,20,154,0


## Final Export

In [21]:
# Save the matched_data DataFrame as a parquet file
matched_data.to_parquet('../data/matched_data.parquet', index=False)