In [1]:
import pickle

import altair as alt
import geopandas as gpd
import pandas as pd
import scipy.stats as stats
import seaborn as sns

In [2]:

# Load the pre-processed complaint records from the local pickle cache.
# SECURITY: unpickling can execute arbitrary code, so only load
# 'data.pickle' when it comes from a trusted source.
# The former encoding='binary' argument was dropped: 'binary' is not a
# registered codec, so it would raise LookupError as soon as the unpickler
# had to decode a Python 2 byte string, and it is never consulted for
# pickles written by Python 3.
with open('data.pickle', 'rb') as f:
    data = pickle.load(f)

data.head(5)

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,...,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,OCC,OCC_END
0,506547392,03/29/2018,20:30:00,,,32.0,2018-03-30,351,CRIMINAL MISCHIEF & RELATED OF,254.0,...,40.810877,-73.941064,"(40.810877241, -73.941064151)",PATROL BORO MAN NORTH,,,WHITE,F,2018-03-29,
1,629632833,02/06/2018,23:15:00,,,52.0,2018-02-07,341,PETIT LARCENY,333.0,...,40.873671,-73.908014,"(40.873671035, -73.908013649)",PATROL BORO BRONX,,45-64,UNKNOWN,D,2018-02-06,
2,787203902,11/21/2018,00:15:00,11/21/2018,00:20:00,75.0,2018-11-21,341,PETIT LARCENY,321.0,...,40.651782,-73.885457,"(40.651782232, -73.885456761)",PATROL BORO BKLYN NORTH,,25-44,UNKNOWN,D,2018-11-21,11/21/2018
3,280364018,06/09/2018,21:42:00,06/09/2018,21:43:00,10.0,2018-06-10,361,OFF. AGNST PUB ORD SENSBLTY &,639.0,...,40.75931,-73.994706,"(40.759310399, -73.994706072)",PATROL BORO MAN SOUTH,,25-44,WHITE HISPANIC,F,2018-06-09,06/09/2018
4,985800320,11/10/2018,19:40:00,11/10/2018,19:45:00,19.0,2018-11-10,341,PETIT LARCENY,333.0,...,40.764536,-73.970728,"(40.764535539, -73.970728388)",PATROL BORO MAN NORTH,,<18,UNKNOWN,D,2018-11-10,11/10/2018


In [3]:
# Disabled single-year / victim-sex filter, kept for reference.
# NOTE(review): the original heading said 2015 while the code below selects
# 2016 - confirm which year was intended before re-enabling.
# data = data.loc[data['OCC'].dt.year == 2016]
# data = data.loc[data['VIC_SEX'] != 'D']
# data = data.loc[data['VIC_SEX'] != 'E']
# data.info()


# Derive the calendar year of each complaint from its occurrence date.
occurrence_dates = data['OCC']
data['Year'] = occurrence_dates.dt.year

In [4]:
# Total number of complaints recorded in each precinct (all years pooled).
crime_per_precinct = (
    data.groupby('ADDR_PCT_CD')
    .size()
    .rename('Count')
    .reset_index()
)

# Box plot (Altair) of the distribution of those per-precinct totals.
alt.Chart(crime_per_precinct).mark_boxplot().encode(alt.Y('Count:Q'))


In [5]:

# Number of complaints per precinct and year.
precint_counts = (
    data.groupby(['ADDR_PCT_CD', 'Year'])
    .size()
    .reset_index(name='crime_count')
)

# Precinct codes arrive as floats (e.g. 32.0); cast through int to drop the
# decimal part and then to str so the values read '32' rather than '32.0'.
precint_counts = precint_counts.assign(
    ADDR_PCT_CD=precint_counts['ADDR_PCT_CD'].astype(int).astype(str)
)
precint_counts.head(5)

Unnamed: 0,ADDR_PCT_CD,Year,crime_count
0,1,2006,6288
1,1,2007,6780
2,1,2008,6617
3,1,2009,6225
4,1,2010,5876


In [6]:
# Streamgraph of yearly crime counts, one layer per precinct.
# Two fixes relative to the earlier version:
#   * the DataFrame itself is handed to alt.Chart - a string argument is
#     interpreted as a data URL, so 'precint_counts' pointed at a resource
#     that does not exist and the chart rendered empty;
#   * the color channel uses the actual precinct column 'ADDR_PCT_CD'
#     (there is no 'precint' field in precint_counts).
alt.Chart(precint_counts).mark_area().encode(
    alt.X('Year:O', axis = alt.Axis(title = 'Year')),
    # stack='center' centers the stacked layers around the middle of the
    # axis, which is what turns the area chart into a streamgraph.
    alt.Y('crime_count:Q', stack = 'center'),
    alt.Color('ADDR_PCT_CD:O', legend = alt.Legend(title = 'Precinct'))
).properties(
    width = 2000,
    height = 1000
)

In [7]:
# Read the police precinct boundaries (GeoJSON) into a GeoDataFrame.
precinct_geojson_path = '..//data//Police Precincts.geojson'
precint_shape = gpd.read_file(precinct_geojson_path)

In [8]:
# Inspect the loaded precinct boundaries (the notebook truncates long frames).
precint_shape

Unnamed: 0,precinct,shape_area,shape_leng,geometry
0,1,47286422.9826,80283.5387782,"MULTIPOLYGON (((-74.04388 40.69019, -74.04351 ..."
1,5,18094527.4385,18807.1249114,"MULTIPOLYGON (((-73.98864 40.72293, -73.98869 ..."
2,6,22017946.5474,24875.9642171,"MULTIPOLYGON (((-73.99968 40.73855, -73.99684 ..."
3,7,18366669.928,17287.5444926,"MULTIPOLYGON (((-73.97346 40.71896, -73.97357 ..."
4,9,21395386.2669,19772.5107407,"MULTIPOLYGON (((-73.97161 40.72672, -73.97163 ..."
...,...,...,...,...
72,115,114119713.7,60059.4107576,"MULTIPOLYGON (((-73.85892 40.76241, -73.85931 ..."
73,120,232327994.252,92945.6971545,"MULTIPOLYGON (((-74.05357 40.60370, -74.05407 ..."
74,121,475577637.857,136811.464647,"MULTIPOLYGON (((-74.15946 40.64145, -74.15975 ..."
75,122,454852053.697,154842.385071,"MULTIPOLYGON (((-74.05051 40.56642, -74.05047 ..."


In [9]:
# Attach the per-precinct, per-year crime counts to the precinct geometries.
# Both key columns hold precinct codes as strings; only precincts present in
# both frames are kept (inner join).
# NOTE: this rebinds precint_shape, so re-running the cell without reloading
# the boundaries merges the counts a second time.
precint_shape = precint_shape.merge(
    precint_counts,
    how='inner',
    left_on='precinct',
    right_on='ADDR_PCT_CD',
)

precint_shape.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1232 entries, 0 to 1231
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   precinct     1232 non-null   object  
 1   shape_area   1232 non-null   object  
 2   shape_leng   1232 non-null   object  
 3   geometry     1232 non-null   geometry
 4   ADDR_PCT_CD  1232 non-null   object  
 5   Year         1232 non-null   int64   
 6   crime_count  1232 non-null   int64   
dtypes: geometry(1), int64(2), object(4)
memory usage: 77.0+ KB


In [None]:
# Choropleth of 'crime_count', faceted into one map row per year.
# 'Year' is declared ordinal: facet channels (row/column) need a discrete
# field, and a quantitative type makes Vega-Lite warn that the facet
# encoding should be discrete.
alt.Chart(precint_shape).mark_geoshape().encode(
    color='crime_count:Q',
    row='Year:O'
).properties(
    width=500,
    height=300,
)

In [None]:
# Precinct polygons that already carry a 'tree_count' attribute.
precint_w_trees = gpd.read_file('..//data//precincts_with_trees.geojson')

# Choropleth colored by the number of trees in each precinct.
tree_map = alt.Chart(precint_w_trees).mark_geoshape()
tree_map.encode(alt.Color('tree_count:Q')).properties(width=500, height=300)


In [None]:
# Join the yearly crime counts onto the tree data by precinct code.
# precint_counts has one row per precinct and year, so after this inner join
# each precinct (and its tree_count) is repeated once per year.
# NOTE: this rebinds precint_w_trees, so re-running the cell without
# reloading the GeoJSON merges the counts a second time.
precint_w_trees = precint_w_trees.merge(
    precint_counts,
    how='inner',
    left_on='precinct',
    right_on='ADDR_PCT_CD',
)

In [None]:
# Box plot of the crime counts in the merged trees/crime frame.
alt.Chart(precint_w_trees).mark_boxplot().encode(alt.Y('crime_count:Q'))

In [None]:
# Scatter plot: number of trees (x) against number of crimes (y), one red
# dot per row of the merged frame, with the precinct shown on hover.
trees_axis = alt.X('tree_count', axis=alt.Axis(title='Trees'))
crimes_axis = alt.Y('crime_count', axis=alt.Axis(title='Crimes'))

alt.Chart(precint_w_trees).mark_circle(size=50, color='red').encode(
    trees_axis,
    crimes_axis,
    tooltip=['precinct', 'tree_count', 'crime_count'],
).properties(
    title='Number of Crimes vs Number of Trees',
    width=1000,
    height=500,
)

In [None]:
# Pearson correlation (statistic and p-value) between tree and crime counts.
tree_counts = precint_w_trees['tree_count']
crime_counts = precint_w_trees['crime_count']
stats.pearsonr(tree_counts, crime_counts)

In [None]:
# Get the area of each precinct from its geometry after reprojecting to the
# projected CRS EPSG:2263.
# NOTE(review): EPSG:2263 is defined in US survey feet, so 'area' is
# presumably in square feet rather than square metres - confirm before
# using it in metric calculations.
precint_w_trees = precint_w_trees.to_crs(epsg=2263)
precint_w_trees['area'] = precint_w_trees['geometry'].area

precint_w_trees['area'].head(5)

In [None]:
# Trees and crimes per square kilometre for every row of the merged frame.
# The stored 'area' column is measured in EPSG:2263, whose linear unit is the
# US survey foot, so area / 1,000,000 is millions of square feet, not km².
# The area is therefore re-measured here in EPSG:32118 (the same New York
# Long Island projection, but in metres) and converted from m² to km².
area_sq_km = precint_w_trees['geometry'].to_crs(epsg=32118).area / 1_000_000
precint_w_trees['tree_density'] = precint_w_trees['tree_count'] / area_sq_km
precint_w_trees['crime_density'] = precint_w_trees['crime_count'] / area_sq_km

In [None]:
# Scatter plot of tree density (x) against crime density (y) with a linear
# regression line layered on top.
# The x-axis title and the chart title previously described raw counts
# ('Trees', 'Number of Crimes vs Number of Trees') although both axes show
# densities; the labels now match the plotted fields.
fig = alt.Chart(precint_w_trees).mark_circle(
    size=50,
    color='red',
).encode(
    x = alt.X('tree_density:Q', axis= alt.Axis(title = 'Trees per sq.km')),
    y = alt.Y('crime_density:Q',axis= alt.Axis(title = 'Crimes per sq.km')),
    tooltip = ['precinct', 'tree_count', 'crime_count'],
).properties(
    width=1000,
    height=500,
    title='Crime Density vs Tree Density'
)

# transform_regression fits crime_density on tree_density; mark_line()
# replaces the circle mark so the fit is drawn as a line over the points.
final_plot = fig + fig.transform_regression('tree_density','crime_density').mark_line()
final_plot

In [None]:
# Pearson correlation (statistic and p-value) between the two densities.
tree_densities = precint_w_trees['tree_density']
crime_densities = precint_w_trees['crime_density']
stats.pearsonr(tree_densities, crime_densities)

In [None]:

# Scatter-plot matrix: every pairing of tree density, crime density and area.
pair_fields = ['tree_density', 'crime_density', 'area']

alt.Chart(precint_w_trees).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
).properties(
    width=300,
    height=300,
).repeat(
    row=pair_fields,
    column=pair_fields,
)

In [None]:
# Side-by-side box plots of crime density and tree density.
density_boxplot = alt.Chart(precint_w_trees).mark_boxplot()

bplot_crimedensity = density_boxplot.encode(y='crime_density')
b_plot_treedensity = density_boxplot.encode(y='tree_density')

# `|` concatenates the two charts horizontally.
bplot_crimedensity | b_plot_treedensity

