In [2]:
import json
import requests
import pandas as pd
from sodapy import Socrata
import numpy as np
import urllib.request
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'browser'
from urllib.request import urlopen
#import plotly.graph_objs as go
#from urllib.request import urlopen

# Introduction

This notebook aims to explore trends in recent data on motor vehicle collisions. Project members can have a fuller understanding of the content, timing, frequency, and location of collisions. This notebook seeks to contribute to that understanding through the exploratory analysis and visualization of vehicle collision data. 

# Data Source

The data used in this notebook was obtained from: 

- [NYC Open Data's Motor Vehicle Collision-Crashes](https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95)
  - This dataset contains information from all police reported motor vehicle collisions in NYC. Each row represents a crash event.The police report (MV104-AN) is required to be filled out for collisions where someone is injured or killed, or where there is at least 1000 dollars worth of damage. This notebook uses a subset of the data and was accessed with the [Socrata Open Data (SODA) API](https://dev.socrata.com/consumers/getting-started.html). 
 

# Accessing Data
Items needed to obtain data:
- Create app token
- Get domain name
- Get data identifier

In [3]:
socrata_domain = 'data.cityofnewyork.us'
socrata_dataset_identifier = 'h9gi-nx95'
client = Socrata(socrata_domain, app_token="zrfGJViqm8hhU7PFyRqvPZcyF")
client.timeout = 1000

In [4]:
# Pull data from Socrata API
results = client.get(socrata_dataset_identifier, limit = 400000)


In [5]:
# Convert data into a pandas dataframe
df = pd.DataFrame(results)

In [6]:
dff = df.copy()

In [7]:
dff.shape

(400000, 29)

In [8]:
dff.head()

Unnamed: 0,crash_date,crash_time,on_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,longitude,location,cross_street_name,off_street_name,contributing_factor_vehicle_3,contributing_factor_vehicle_4,vehicle_type_code_3,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-04-14T00:00:00.000,5:32,BRONX WHITESTONE BRIDGE,0,0,0,0,0,0,0,...,,,,,,,,,,
1,2021-04-13T00:00:00.000,21:35,,1,0,1,0,0,0,0,...,-73.97617,"{'latitude': '40.68358', 'longitude': '-73.976...",620 ATLANTIC AVENUE,,,,,,,
2,2021-04-15T00:00:00.000,16:15,HUTCHINSON RIVER PARKWAY,0,0,0,0,0,0,0,...,,,,,,,,,,
3,2021-04-13T00:00:00.000,16:00,VANDERVORT AVENUE,0,0,0,0,0,0,0,...,,,,ANTHONY STREET,,,,,,
4,2021-04-12T00:00:00.000,8:25,EDSON AVENUE,0,0,0,0,0,0,0,...,0.0,"{'latitude': '0.0', 'longitude': '0.0'}",,,,,,,,


In [9]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 29 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   crash_date                     400000 non-null  object
 1   crash_time                     400000 non-null  object
 2   on_street_name                 297762 non-null  object
 3   number_of_persons_injured      399999 non-null  object
 4   number_of_persons_killed       400000 non-null  object
 5   number_of_pedestrians_injured  400000 non-null  object
 6   number_of_pedestrians_killed   400000 non-null  object
 7   number_of_cyclist_injured      400000 non-null  object
 8   number_of_cyclist_killed       400000 non-null  object
 9   number_of_motorist_injured     400000 non-null  object
 10  number_of_motorist_killed      400000 non-null  object
 11  contributing_factor_vehicle_1  398315 non-null  object
 12  contributing_factor_vehicle_2  321549 non-nu

In [10]:
# Eliminate rows witout lat/lon 
dff = dff[dff.longitude.notnull()]
dff = dff[dff.latitude.notnull()]


In [11]:
# Exclude unnecessary columns
dff = dff[['crash_date', 'crash_time', 'number_of_persons_killed', 'number_of_pedestrians_injured', 'number_of_pedestrians_killed', 'number_of_cyclist_injured','number_of_cyclist_killed', 'number_of_motorist_injured', 'number_of_motorist_killed', 'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2', 'collision_id', 'vehicle_type_code1', 'vehicle_type_code2', 'borough', 'zip_code', 'latitude', 'longitude', 'cross_street_name', 'off_street_name']]

In [12]:
dff.columns

Index(['crash_date', 'crash_time', 'number_of_persons_killed',
       'number_of_pedestrians_injured', 'number_of_pedestrians_killed',
       'number_of_cyclist_injured', 'number_of_cyclist_killed',
       'number_of_motorist_injured', 'number_of_motorist_killed',
       'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
       'collision_id', 'vehicle_type_code1', 'vehicle_type_code2', 'borough',
       'zip_code', 'latitude', 'longitude', 'cross_street_name',
       'off_street_name'],
      dtype='object')

#  Overview of Vehicles Involved in Collitions



Now lets compute the percentage of vehicle types involve in crashes
- Vehicle 1

In [13]:
v1 = dff.vehicle_type_code1
counts = v1.value_counts()
percent = v1.value_counts(normalize=True)
percent100 = percent.mul(100).round(1).astype(str) + '%'
vehicle1_df = pd.DataFrame({'counts': counts, 'percent': percent, 'percent100': percent100})

In [14]:
vehicle1_df.head(10)

Unnamed: 0,counts,percent,percent100
Sedan,168930,0.464167,46.4%
Station Wagon/Sport Utility Vehicle,134196,0.368729,36.9%
Taxi,13258,0.036429,3.6%
Pick-up Truck,9209,0.025303,2.5%
Box Truck,6928,0.019036,1.9%
Bus,5808,0.015959,1.6%
Bike,4025,0.011059,1.1%
Tractor Truck Diesel,2663,0.007317,0.7%
Motorcycle,2451,0.006735,0.7%
Van,2247,0.006174,0.6%


- Vehicle 2 

In [15]:
v2 = dff.vehicle_type_code2
# s = df.keywords
counts2 = v2.value_counts()
percent2 = v2.value_counts(normalize=True)
percent2_100 = v2.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
vehicle2_df = pd.DataFrame({'counts': counts2, 'percent': percent2, 'percent100': percent2_100})

In [16]:
vehicle2_df.head(10)

Unnamed: 0,counts,percent,percent100
Sedan,112080,0.422755,42.3%
Station Wagon/Sport Utility Vehicle,90725,0.342206,34.2%
Bike,10761,0.040589,4.1%
Taxi,8377,0.031597,3.2%
Box Truck,8095,0.030534,3.1%
Pick-up Truck,7986,0.030122,3.0%
Bus,4985,0.018803,1.9%
Tractor Truck Diesel,2637,0.009947,1.0%
E-Bike,2422,0.009136,0.9%
Motorcycle,2132,0.008042,0.8%


The crash_date columns currently contains string values



In [17]:
type(df.crash_date[1])

str

The pandas.to_datetime() method converts each string value to a datetime object below

In [18]:
dff['crash_date'] = pd.to_datetime(dff['crash_date'], format='%Y-%m-%dT%H:%M:%S.%f')


In [19]:
type(dff.crash_date[1])

pandas._libs.tslibs.timestamps.Timestamp

In [20]:
# Create year, month, day_of_week columns
dff['year'] = dff['crash_date'].dt.year
dff['month'] = [str(i.month) for i in dff.crash_date]
dff['day_of_week'] = [int(i.weekday()) for i in dff.crash_date]


In [21]:
# Create hour column
dff['hour'] = pd.to_datetime(dff['crash_time'], format='%H:%M').dt.hour

In [22]:
dff.head()

Unnamed: 0,crash_date,crash_time,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,...,borough,zip_code,latitude,longitude,cross_street_name,off_street_name,year,month,day_of_week,hour
1,2021-04-13,21:35,0,1,0,0,0,0,0,Unspecified,...,BROOKLYN,11217.0,40.68358,-73.97617,620 ATLANTIC AVENUE,,2021,4,1,21
4,2021-04-12,8:25,0,0,0,0,0,0,0,Unspecified,...,,,0.0,0.0,,,2021,4,0,8
13,2019-05-21,22:50,0,0,0,0,0,0,0,Passing or Lane Usage Improper,...,BROOKLYN,11201.0,40.69754,-73.98312,,CONCORD STREET,2019,5,1,22
15,2021-02-26,14:50,0,0,0,0,0,0,0,Unspecified,...,BRONX,10461.0,40.843464,-73.836,2819 MIDDLETOWN ROAD,,2021,2,4,14
16,2021-03-09,11:00,0,0,0,0,0,1,0,Following Too Closely,...,,,40.692547,-73.990974,,JORALEMON STREET,2021,3,1,11


In [21]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 740584 entries, 1 to 799999
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   crash_date                     740584 non-null  datetime64[ns]
 1   crash_time                     740584 non-null  object        
 2   number_of_persons_killed       740567 non-null  object        
 3   number_of_pedestrians_injured  740584 non-null  object        
 4   number_of_pedestrians_killed   740584 non-null  object        
 5   number_of_cyclist_injured      740584 non-null  object        
 6   number_of_cyclist_killed       740584 non-null  object        
 7   number_of_motorist_injured     740584 non-null  object        
 8   number_of_motorist_killed      740584 non-null  object        
 9   contributing_factor_vehicle_1  737770 non-null  object        
 10  contributing_factor_vehicle_2  611377 non-null  object        
 11  

# Visualization of collisions with fatalities in 2021

In [23]:
# Filter out null values
dff = dff[dff.number_of_persons_killed.notnull()]
# Convert person killed column value to int
dff["number_of_persons_killed"] = pd.to_numeric(dff["number_of_persons_killed"])

In [24]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 367380 entries, 1 to 399999
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   crash_date                     367380 non-null  datetime64[ns]
 1   crash_time                     367380 non-null  object        
 2   number_of_persons_killed       367380 non-null  int64         
 3   number_of_pedestrians_injured  367380 non-null  object        
 4   number_of_pedestrians_killed   367380 non-null  object        
 5   number_of_cyclist_injured      367380 non-null  object        
 6   number_of_cyclist_killed       367380 non-null  object        
 7   number_of_motorist_injured     367380 non-null  object        
 8   number_of_motorist_killed      367380 non-null  object        
 9   contributing_factor_vehicle_1  365791 non-null  object        
 10  contributing_factor_vehicle_2  294726 non-null  object        
 11  

In [25]:
dff = dff.astype({"latitude": np.float64, "longitude": np.float64})

In [26]:
# Condition to filter year and records with person killed
fatal_2021_cond = (dff['year'] == 2021 ) & (dff['number_of_persons_killed']>=1)

In [28]:
# Create new DataFrame that meets conditions
df_2021_p_killed = dff[fatal_2021_cond]

In [29]:
df_2021_p_killed.head()

Unnamed: 0,crash_date,crash_time,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,...,borough,zip_code,latitude,longitude,cross_street_name,off_street_name,year,month,day_of_week,hour
150,2021-04-15,15:18,1,0,1,0,0,0,0,Driver Inattention/Distraction,...,BROOKLYN,11209.0,40.620487,-74.029305,,FOREST PLACE,2021,4,3,15
1143,2021-04-18,23:56,1,0,1,0,0,0,0,Unspecified,...,,,40.770527,-73.98012,,,2021,4,6,23
1337,2021-04-08,19:55,1,0,0,0,0,0,0,Driver Inexperience,...,BRONX,10459.0,40.830307,-73.89873,1281 UNION AVENUE,,2021,4,3,19
1638,2021-04-19,19:13,1,0,1,0,0,0,0,Driver Inattention/Distraction,...,MANHATTAN,10039.0,40.82289,-73.94208,300 WEST 145 STREET,,2021,4,0,19
2075,2021-04-22,8:01,1,0,1,0,0,0,0,Failure to Yield Right-of-Way,...,MANHATTAN,10035.0,40.803698,-73.937912,,LEXINGTON AVENUE,2021,4,3,8


In [30]:
px.set_mapbox_access_token(open(".mapbox_token").read())

In [31]:
# Scatter plot of crash with fatalities (color by hour and size by number of persons killed)
p_killed_2021_fig = px.scatter_mapbox(df_2021_p_killed,
                                      lat="latitude",
                                      lon="longitude",
                                      color="hour",
                                      size="number_of_persons_killed",
                                      #hover_data = "off_street_name",
                                      mapbox_style="streets",
                                      color_continuous_scale=px.colors.cyclical.IceFire,
                                      size_max=15,
                                      zoom=10,
                                      title = "Crash Reports with Fatalities in 2021",)

In [32]:
p_killed_2021_fig.show()

# Visualization of Collition Resulting in Persons Injured (2021)

In [33]:
# Convert person injured columns to int

In [40]:
injured_list = ["number_of_motorist_injured", "number_of_cyclist_injured","number_of_pedestrians_injured"]

In [41]:
injured_list

['number_of_motorist_injured',
 'number_of_cyclist_injured',
 'number_of_pedestrians_injured']

In [42]:
dff[injured_list] = dff[injured_list].apply(pd.to_numeric)

In [44]:
# Create persons_injured column
dff['number_persons_injured'] = dff['number_of_pedestrians_injured'] + dff['number_of_motorist_injured'] + dff['number_of_cyclist_injured']

In [45]:
# Condition includes year 2021 and events with persons injured
df_injured_cond = (dff['year'] == 2021 ) & (dff['number_persons_injured']>=1)
#injured_cond = (dff['year'] == 2021 ) | (dff['number_of_pedestrians_injured']>=1) | (dff['number_of_cyclist_injured']>=1) | (dff['number_of_motorist_injured']>=1)

In [46]:
# New dataFrame includes person injured from collision (2021)
df_2021_p_injured = dff[df_injured_cond]

In [60]:
# Scatter plot of crash with injuries (color by hour and size by number of persons injured)
p_injured_2021_fig = px.scatter_mapbox(df_2021_p_injured,
                                       lat="latitude",
                                       lon="longitude",
                                       color="hour",
                                       size="number_persons_injured",
                                       color_continuous_scale=px.colors.cyclical.IceFire,
                                       size_max=15,
                                       zoom=10,
                                       center={"lat": 40.730610, "lon": -73.9749},
                                       title = "Crash Reports with Persons Injured in 2021",)
p_injured_2021_fig.show()

# Visualization of Collitions by Borough

In [41]:
boro_url = ('https://raw.githubusercontent.com/codeforgermany/click_that_hood/main/public/data/new-york-city-boroughs.geojson')

In [42]:
nyc_boro = requests.get(boro_url)
nyc_boro = nyc_boro.json()

In [43]:
nyc_boro['features'][1]['properties']


{'name': 'Queens',
 'cartodb_id': 2,
 'created_at': '2013-03-09T02:42:03.692Z',
 'updated_at': '2013-03-09T02:42:03.989Z'}

In [44]:
print(dff.borough.unique())

['BROOKLYN' nan 'BRONX' 'STATEN ISLAND' 'QUEENS' 'MANHATTAN']


In [45]:
dff = dff[dff['borough'].notna()]
dff['borough'] = dff['borough'].str.title()

In [46]:
# Create dictionary with key and value to link json file to dataframe
# Dictionary key must match a column value in dataframe 
nyc_boro_map = {}
for feature in nyc_boro['features']:
    feature['id'] = feature['properties']['cartodb_id']
    nyc_boro_map[feature['properties']['name']] = feature['id']


In [47]:
dff['boro_id'] = dff['borough'].apply(lambda x: nyc_boro_map[x])
#grouped_multiple = df.groupby(['Team', 'Pos']).agg({'Age': ['mean', 'min', 'max']})

In [48]:
df_boro_gb = dff.groupby(['borough','boro_id']).agg({'number_of_persons_killed': 'sum'}).reset_index()

In [49]:
df_boro_gb

Unnamed: 0,borough,boro_id,number_of_persons_killed
0,Bronx,5,81
1,Brooklyn,3,220
2,Manhattan,4,109
3,Queens,2,189
4,Staten Island,1,28


In [51]:
# Choropleth map of crashses involving fatalities (borough)
boro_fig = px.choropleth_mapbox(
    df_boro_gb,
    locations = "boro_id",
    geojson = nyc_boro,
    color = "number_of_persons_killed",
    #color_continuous_scale=px.colors.continuous.Viridis[::-1],
    #px.colors.sequential.Viridis
    hover_name="borough",
    #hover_data= ["Count"],
    mapbox_style="carto-positron",
    center={"lat": 40.730610, "lon": -73.9749},
    zoom=8.5,
    opacity=0.5,
    title = "NYC Boroughs",)
# fig.update_layout(
#     title={
#         'text': "location of fatalities",
#         'y':0.9,
#         'x':0.5,
#         'xanchor': 'center',
#         'yanchor': 'top'})

In [52]:
boro_fig.show()

# Visualization of Persons Killed in Collisions per Zip Code (2019 to 2022)

In [49]:
url = ('https://data.beta.nyc/dataset/3bf5fb73-edb5-4b05-bb29-7c95f4a727fc/resource/6df127b1-6d04-4bb7-b983-07402a2c3f90/download/f4129d9aa6dd4281bc98d0f701629b76nyczipcodetabulationareas.geojson')

In [50]:
nyc_zip = requests.get(url)

In [51]:
nyc_zip = nyc_zip.json()

In [52]:
# Geojson file to complete geographical features
nyc_zip['features'][1]['properties']

{'OBJECTID': 2,
 'postalCode': '11004',
 'PO_NAME': 'Glen Oaks',
 'STATE': 'NY',
 'borough': 'Queens',
 'ST_FIPS': '36',
 'CTY_FIPS': '081',
 'BLDGpostal': 0,
 '@id': 'http://nyc.pediacities.com/Resource/PostalCode/11004',
 'longitude': -73.711608312,
 'latitude': 40.745365835}

In [53]:
# Create dictionary with key and value to link json file to dataframe
zip_id_map = {}
for feature in nyc_zip['features']:
    feature['id'] = feature['properties']['OBJECTID']
    zip_id_map[feature['properties']['postalCode']] = feature['id']

In [61]:
# Make sure dictionary key and corresponding dataframe value are of the same type
#zip_id_map


In [54]:
# Filter out nan values of dataframe zip_code column
dff = dff[dff['zip_code'].notna()]

In [55]:
# Filter out list of zipcodes (not in geojson file) and 'nan' values
zipCodes = ['11249','10000','11695','nan','10179']
dff = dff[~dff['zip_code'].isin(zipCodes)]

In [56]:
dff['id'] = dff['zip_code'].apply(lambda x: zip_id_map[x])

In [57]:
df_p_killed_gb = dff.groupby(['zip_code'])['number_of_persons_killed'].sum().reset_index()

In [58]:
# Link dataFrame to json file
df_p_killed_gb['id'] = df_p_killed_gb['zip_code'].apply(lambda x: zip_id_map[x])

In [61]:
df_p_killed_gb.columns

Index(['zip_code', 'number_of_persons_killed', 'id'], dtype='object')

### Choropleth map collision fatalities

In [62]:
# Choropleth map using zip code boundaries (number of persons killed per zip code)
zip_fig = px.choropleth_mapbox(
    df_p_killed_gb,
    locations = "id",
    geojson = nyc_zip,
    color = "number_of_persons_killed",
    color_continuous_scale=px.colors.sequential.Inferno[::-1],
    hover_name="zip_code",
    #hover_data= ["count"],
    mapbox_style="carto-positron",
    center={"lat": 40.730610, "lon": -73.9749},
    zoom=8.5,
    opacity=0.5,
    title = "Zip Code Boundaries (Number of Persons Killed from 2019 to 2022)",)

In [63]:
zip_fig.show()