In [1]:
import json
import os
import urllib.request
from urllib.request import urlopen

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import requests
from sodapy import Socrata

pio.renderers.default = 'browser'
#import plotly.graph_objs as go
#from urllib.request import urlopen

# Introduction

This notebook explores trends in recent data on motor vehicle collisions so that project members can gain a fuller understanding of the content, timing, frequency, and location of collisions. It contributes to that understanding through exploratory analysis and visualization of vehicle collision data.

# Data Source

The data used in this notebook was obtained from: 

- [NYC Open Data's Motor Vehicle Collision-Crashes](https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95)
  - This dataset contains information from all police-reported motor vehicle collisions in NYC. Each row represents a crash event. The police report (MV104-AN) is required to be filled out for collisions where someone is injured or killed, or where there is at least $1,000 worth of damage. This notebook uses a subset of the data, which was accessed with the [Socrata Open Data (SODA) API](https://dev.socrata.com/consumers/getting-started.html).
 

# Accessing Data
Items needed to obtain data:
- Create app token
- Get domain name
- Get data identifier

In [2]:
# Connection settings for the NYC Open Data collisions dataset
socrata_domain = 'data.cityofnewyork.us'
socrata_dataset_identifier = 'h9gi-nx95'
# Read the app token from the SOCRATA_APP_TOKEN environment variable instead of
# committing a credential to the notebook. If the variable is unset the token
# is None and the client is created without one.
socrata_app_token = os.environ.get('SOCRATA_APP_TOKEN')
client = Socrata(socrata_domain, app_token=socrata_app_token)
# Generous timeout because the pull below requests a large number of rows
# (presumably seconds — confirm against the sodapy documentation)
client.timeout = 1000

In [3]:
# Pull up to 400,000 rows of the collisions dataset from the Socrata API.
# NOTE(review): no ordering is requested, so which rows are returned is decided
# by the API — confirm this is the intended subset.
results = client.get(socrata_dataset_identifier, limit = 400000)


In [4]:
# Convert the API results into a pandas DataFrame (this is the raw, unmodified pull)
df = pd.DataFrame(results)

In [6]:
# Work on a copy so the raw pull in `df` stays untouched by the cleaning steps
dff = df.copy()

In [7]:
dff.shape

(400000, 29)

In [10]:
dff.head()

Unnamed: 0,crash_date,crash_time,on_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,longitude,location,cross_street_name,off_street_name,contributing_factor_vehicle_3,contributing_factor_vehicle_4,vehicle_type_code_3,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-04-14T00:00:00.000,5:32,BRONX WHITESTONE BRIDGE,0,0,0,0,0,0,0,...,,,,,,,,,,
1,2021-04-13T00:00:00.000,21:35,,1,0,1,0,0,0,0,...,-73.97617,"{'latitude': '40.68358', 'longitude': '-73.976...",620 ATLANTIC AVENUE,,,,,,,
2,2021-04-15T00:00:00.000,16:15,HUTCHINSON RIVER PARKWAY,0,0,0,0,0,0,0,...,,,,,,,,,,
3,2021-04-13T00:00:00.000,16:00,VANDERVORT AVENUE,0,0,0,0,0,0,0,...,,,,ANTHONY STREET,,,,,,
4,2021-04-12T00:00:00.000,8:25,EDSON AVENUE,0,0,0,0,0,0,0,...,0.0,"{'latitude': '0.0', 'longitude': '0.0'}",,,,,,,,


# Data Cleaning

In [26]:
type(dff)

pandas.core.frame.DataFrame

In [8]:
dff.isna().sum()

crash_date                            0
crash_time                            0
on_street_name                   102346
number_of_persons_injured             1
number_of_persons_killed              0
number_of_pedestrians_injured         0
number_of_pedestrians_killed          0
number_of_cyclist_injured             0
number_of_cyclist_killed              0
number_of_motorist_injured            0
number_of_motorist_killed             0
contributing_factor_vehicle_1      1682
contributing_factor_vehicle_2     78634
collision_id                          0
vehicle_type_code1                 3631
vehicle_type_code2               110155
borough                          139555
zip_code                         139607
latitude                          32661
longitude                         32661
location                          32661
cross_street_name                297675
off_street_name                  209335
contributing_factor_vehicle_3    365486
contributing_factor_vehicle_4    391268


In [24]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 29 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   crash_date                     400000 non-null  object
 1   crash_time                     400000 non-null  object
 2   on_street_name                 297654 non-null  object
 3   number_of_persons_injured      399999 non-null  object
 4   number_of_persons_killed       400000 non-null  object
 5   number_of_pedestrians_injured  400000 non-null  object
 6   number_of_pedestrians_killed   400000 non-null  object
 7   number_of_cyclist_injured      400000 non-null  object
 8   number_of_cyclist_killed       400000 non-null  object
 9   number_of_motorist_injured     400000 non-null  object
 10  number_of_motorist_killed      400000 non-null  object
 11  contributing_factor_vehicle_1  398318 non-null  object
 12  contributing_factor_vehicle_2  321366 non-nu

In [35]:
# Keep only the columns used in the rest of the notebook
analysis_columns = [
    'crash_date',
    'crash_time',
    'number_of_persons_injured',
    'number_of_persons_killed',
    'number_of_pedestrians_injured',
    'number_of_pedestrians_killed',
    'number_of_cyclist_injured',
    'number_of_cyclist_killed',
    'number_of_motorist_injured',
    'number_of_motorist_killed',
    'contributing_factor_vehicle_1',
    'contributing_factor_vehicle_2',
    'collision_id',
    'vehicle_type_code1',
    'vehicle_type_code2',
    'borough',
    'zip_code',
    'latitude',
    'longitude',
    'on_street_name',
]
dff = dff[analysis_columns]


In [36]:
# Eliminate rows without lat/lon.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
# NOTE(review): placeholder coordinates of 0.0 are not null and pass this
# filter — confirm whether those rows should be dropped as well.
has_coordinates = dff['longitude'].notnull() & dff['latitude'].notnull()
dff = dff[has_coordinates].copy()

In [37]:
# Replace borough NaN values with a placeholder string.
# Read from dff (not the raw df) so the cell depends only on the frame it modifies.
dff['borough'] = dff['borough'].fillna('Not recorded')


In [42]:
# Cast the count and coordinate columns to numeric dtypes
numeric_convert_list = [
    'number_of_persons_injured',
    'number_of_persons_killed',
    'number_of_pedestrians_injured',
    'number_of_pedestrians_killed',
    'number_of_cyclist_injured',
    'number_of_cyclist_killed',
    'number_of_motorist_injured',
    'number_of_motorist_killed',
    'latitude',
    'longitude',
]
dff = dff.assign(**{name: pd.to_numeric(dff[name]) for name in numeric_convert_list})
# 'number_of_persons_injured' comes out as float64 rather than int64






The crash_date column currently contains string values.

In [38]:
# Inspect the type of the first value. Positional access is used because rows
# were filtered out above, so a given index label is not guaranteed to exist.
type(dff['crash_date'].iloc[0])

str

The pandas.to_datetime() method converts each string value to a datetime object below

In [39]:
# Parse the timestamp strings (e.g. '2021-04-14T00:00:00.000') into datetime values
dff['crash_date'] = pd.to_datetime(dff['crash_date'], format='%Y-%m-%dT%H:%M:%S.%f')

In [40]:
# Create year, month, day_of_week columns with the vectorized .dt accessor
# (same values as a per-row Python loop, computed in one pass)
dff['year'] = dff['crash_date'].dt.year
dff['month'] = dff['crash_date'].dt.month
dff['day_of_week'] = dff['crash_date'].dt.weekday  # Monday=0 ... Sunday=6
# Create hour column from the 'H:MM' crash_time strings
dff['hour'] = pd.to_datetime(dff['crash_time'], format='%H:%M').dt.hour

In [43]:
dff.dtypes

crash_date                       datetime64[ns]
crash_time                               object
number_of_persons_injured               float64
number_of_persons_killed                  int64
number_of_pedestrians_injured             int64
number_of_pedestrians_killed              int64
number_of_cyclist_injured                 int64
number_of_cyclist_killed                  int64
number_of_motorist_injured                int64
number_of_motorist_killed                 int64
contributing_factor_vehicle_1            object
contributing_factor_vehicle_2            object
collision_id                             object
vehicle_type_code1                       object
vehicle_type_code2                       object
borough                                  object
zip_code                                 object
latitude                                float64
longitude                               float64
on_street_name                           object
year                                    

In [45]:
dff.describe()

Unnamed: 0,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,latitude,longitude,year,month,day_of_week,hour
count,367338.0,367339.0,367339.0,367339.0,367339.0,367339.0,367339.0,367339.0,367339.0,367339.0,367339.0,367339.0,367339.0,367339.0
mean,0.374748,0.001936,0.060995,0.000882,0.039103,0.000182,0.268659,0.000836,40.567104,-73.624072,2019.949676,6.841852,2.939533,13.070101
std,0.734205,0.046069,0.251112,0.029869,0.197044,0.013504,0.698337,0.031769,2.542645,4.613007,0.910148,3.345443,1.959798,5.985283
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-74.253006,2012.0,1.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.66743,-73.962135,2019.0,4.0,1.0,9.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.717884,-73.91892,2020.0,7.0,3.0,14.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.78168,-73.86325,2021.0,10.0,5.0,18.0
max,20.0,4.0,7.0,2.0,3.0,1.0,20.0,4.0,40.912884,0.0,2022.0,12.0,6.0,23.0


In [None]:
# Distinct contributing factors recorded for the first vehicle
points = dff.contributing_factor_vehicle_1.unique()

In [137]:
points

array(['Following Too Closely', 'Unspecified', 'Pavement Slippery',
       'Driver Inattention/Distraction', 'Other Vehicular',
       'Passing Too Closely', 'Passing or Lane Usage Improper',
       'Driver Inexperience', 'Failure to Yield Right-of-Way',
       'Brakes Defective', 'Turning Improperly', 'Unsafe Speed',
       'Backing Unsafely', 'Reaction to Uninvolved Vehicle',
       'View Obstructed/Limited', 'Steering Failure',
       'Traffic Control Disregarded', 'Drugs (illegal)',
       'Aggressive Driving/Road Rage', 'Fell Asleep',
       'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion',
       'Alcohol Involvement', 'Unsafe Lane Changing',
       'Pavement Defective', 'Other Lighting Defects',
       'Oversized Vehicle', 'Animals Action', 'Outside Car Distraction',
       'Illnes', 'Driverless/Runaway Vehicle', 'Passenger Distraction',
       'Tire Failure/Inadequate', nan, 'Lost Consciousness',
       'Accelerator Defective', 'Obstruction/Debris',
       'Failure to Ke

In [33]:
# List the distinct borough values
dff.borough.unique()

array(['BROOKLYN', 'Not recorded', 'BRONX', 'STATEN ISLAND', 'QUEENS',
       'MANHATTAN'], dtype=object)

In [34]:
# Convert borough names to title case, then show the resulting distinct values
dff['borough'] = dff['borough'].str.title()
dff.borough.unique()

array(['Brooklyn', 'Not Recorded', 'Bronx', 'Staten Island', 'Queens',
       'Manhattan'], dtype=object)

In [47]:
dff.isnull().sum()

crash_date                            0
crash_time                            0
number_of_persons_injured             1
number_of_persons_killed              0
number_of_pedestrians_injured         0
number_of_pedestrians_killed          0
number_of_cyclist_injured             0
number_of_cyclist_killed              0
number_of_motorist_injured            0
number_of_motorist_killed             0
contributing_factor_vehicle_1      1585
contributing_factor_vehicle_2     72808
collision_id                          0
vehicle_type_code1                 3452
vehicle_type_code2               102516
borough                               0
zip_code                         114569
latitude                              0
longitude                             0
on_street_name                    99127
year                                  0
month                                 0
day_of_week                           0
hour                                  0
dtype: int64

#### Review vehicle type values 

In [51]:
dff.vehicle_type_code1.unique()

array(['Sedan', 'Station Wagon/Sport Utility Vehicle', '�MBU',
       'Pick-up Truck', 'Box Truck', nan, 'Ambulance',
       'Tow Truck / Wrecker', 'Taxi', 'E-Bike', 'Van', 'Flat Bed',
       'Moped', 'Tractor Truck Diesel', 'AMBULANCE', 'Trailer', 'Bus',
       'Motorcycle', 'Garbage or Refuse', 'SCHOOL BUS', 'Lift Boom',
       'Bike', 'scooter', 'CATER', 'PKUP', 'FDNY Ambul', '3-Door',
       'Beverage Truck', 'SEMI TRAIL', 'Dump', 'Convertible',
       '4 dr sedan', 'dump', 'Flat Rack', 'Carry All', 'GARBAGE TR',
       'Motorbike', 'Tanker', 'Armored Truck', 'MOPED',
       'Bulk Agriculture', 'Motorscooter', 'UTILITY', 'Concrete Mixer',
       'Open Body', 'E-Scooter', 'unknown', 'fire truck', 'Glass Rack',
       'COMMERCIAL', 'PSD', 'Scooter', 'MTA', 'Multi-Wheeled Vehicle',
       'PK', 'Minicycle', 'Truck', 'Chassis Cab', 'LIMO',
       'Tractor Truck Gasoline', 'MACK', 'Refrigerated Van', 'CARRIER',
       'Fire truck', 'TRUCK', 'ambulance', 'NYPD VAN', '2 dr sedan',
       

In [None]:
# vehicle_type_code1 (check values and categories): spelling variants that appear to describe the same vehicle type
# fdny truck, FDNY TRUCK, fire truck,fdny truck, Fire truck, FDNY, FIRE TRUCK, FIRETRUCK,FDNY FIRE, firetruck, Fire Truck, FDNY TRUCK, FIRET TRUC, DNY FIRET   
# Motorcycle, Motorbike, Bike, Dirt Bike
# AMBULANCE, FDNY Ambul, AMBULANE, ambulance, FDNY AMBUL, NYS Ambula, EMS, Ambu, AMBU, AMBUKANCE, EMS Ambula, NYC AMBULA, FDNY EMS, AMBULACE, almbulance, ambul
# 

### Check columns with missing values (Replace missing values)

In [57]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 367339 entries, 1 to 399999
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   crash_date                     367339 non-null  datetime64[ns]
 1   crash_time                     367339 non-null  object        
 2   number_of_persons_injured      367338 non-null  float64       
 3   number_of_persons_killed       367339 non-null  int64         
 4   number_of_pedestrians_injured  367339 non-null  int64         
 5   number_of_pedestrians_killed   367339 non-null  int64         
 6   number_of_cyclist_injured      367339 non-null  int64         
 7   number_of_cyclist_killed       367339 non-null  int64         
 8   number_of_motorist_injured     367339 non-null  int64         
 9   number_of_motorist_killed      367339 non-null  int64         
 10  contributing_factor_vehicle_1  365754 non-null  object        
 11  

#### The on_street_name column's missing cells will be replaced by the string 'Not described'


In [54]:
# Assign the filled column back explicitly: an in-place fillna on a column
# selected from the frame is a chained operation that is not guaranteed to
# update dff itself.
dff['on_street_name'] = dff['on_street_name'].fillna('Not described')

#### The missing values of the column contributing_factor_vehicle_1 will be replaced with the most common value

In [58]:
dff.contributing_factor_vehicle_1.value_counts().head()

Driver Inattention/Distraction    93948
Unspecified                       91937
Following Too Closely             25388
Failure to Yield Right-of-Way     25208
Passing or Lane Usage Improper    15277
Name: contributing_factor_vehicle_1, dtype: int64

In [59]:
# Fill missing factors with the most frequent value from the counts above.
# Assign the result back explicitly: an in-place fillna on a column selected
# from the frame is a chained operation that is not guaranteed to update dff.
dff['contributing_factor_vehicle_1'] = dff['contributing_factor_vehicle_1'].fillna('Driver Inattention/Distraction')

In [60]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 367339 entries, 1 to 399999
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   crash_date                     367339 non-null  datetime64[ns]
 1   crash_time                     367339 non-null  object        
 2   number_of_persons_injured      367338 non-null  float64       
 3   number_of_persons_killed       367339 non-null  int64         
 4   number_of_pedestrians_injured  367339 non-null  int64         
 5   number_of_pedestrians_killed   367339 non-null  int64         
 6   number_of_cyclist_injured      367339 non-null  int64         
 7   number_of_cyclist_killed       367339 non-null  int64         
 8   number_of_motorist_injured     367339 non-null  int64         
 9   number_of_motorist_killed      367339 non-null  int64         
 10  contributing_factor_vehicle_1  367339 non-null  object        
 11  

#### Replace vehicle 1 type with most common vehicle type

In [65]:
dff.vehicle_type_code1.value_counts().head()

Sedan                                  172399
Station Wagon/Sport Utility Vehicle    134161
Taxi                                    13200
Pick-up Truck                            9193
Box Truck                                6926
Name: vehicle_type_code1, dtype: int64

In [63]:
# Fill missing vehicle types with the most frequent value ('Sedan').
# Assign the result back explicitly: an in-place fillna on a column selected
# from the frame is a chained operation that is not guaranteed to update dff.
dff['vehicle_type_code1'] = dff['vehicle_type_code1'].fillna('Sedan')

In [64]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 367339 entries, 1 to 399999
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   crash_date                     367339 non-null  datetime64[ns]
 1   crash_time                     367339 non-null  object        
 2   number_of_persons_injured      367338 non-null  float64       
 3   number_of_persons_killed       367339 non-null  int64         
 4   number_of_pedestrians_injured  367339 non-null  int64         
 5   number_of_pedestrians_killed   367339 non-null  int64         
 6   number_of_cyclist_injured      367339 non-null  int64         
 7   number_of_cyclist_killed       367339 non-null  int64         
 8   number_of_motorist_injured     367339 non-null  int64         
 9   number_of_motorist_killed      367339 non-null  int64         
 10  contributing_factor_vehicle_1  367339 non-null  object        
 11  

#  Overview of Vehicles Involved in Collisions



Now let's compute the percentage of each vehicle type involved in crashes
- Vehicle 1

In [29]:
# Frequency table for the first vehicle's type: raw count, share, and share formatted as a percentage string
v1 = dff['vehicle_type_code1']
counts = v1.value_counts()
percent = v1.value_counts(normalize=True)
percent100 = (percent * 100).round(1).astype(str) + '%'
vehicle1_df = pd.DataFrame(
    {
        'counts': counts,
        'percent': percent,
        'percent100': percent100,
    }
)

In [30]:
vehicle1_df.head(10)

Unnamed: 0,counts,percent,percent100
Sedan,168934,0.464237,46.4%
Station Wagon/Sport Utility Vehicle,134169,0.368701,36.9%
Taxi,13216,0.036318,3.6%
Pick-up Truck,9198,0.025276,2.5%
Box Truck,6932,0.019049,1.9%
Bus,5798,0.015933,1.6%
Bike,4031,0.011077,1.1%
Tractor Truck Diesel,2662,0.007315,0.7%
Motorcycle,2452,0.006738,0.7%
Van,2249,0.00618,0.6%


- Vehicle 2 

In [31]:
# Frequency table for the second vehicle's type: raw count, share, and share
# formatted as a percentage string
v2 = dff.vehicle_type_code2
counts2 = v2.value_counts()
percent2 = v2.value_counts(normalize=True)
# Reuse percent2 rather than recomputing the normalized counts
percent2_100 = percent2.mul(100).round(1).astype(str) + '%'
vehicle2_df = pd.DataFrame({'counts': counts2, 'percent': percent2, 'percent100': percent2_100})

In [32]:
vehicle2_df.head(10)

Unnamed: 0,counts,percent,percent100
Sedan,112003,0.422771,42.3%
Station Wagon/Sport Utility Vehicle,90620,0.342058,34.2%
Bike,10778,0.040683,4.1%
Taxi,8349,0.031514,3.2%
Box Truck,8088,0.030529,3.1%
Pick-up Truck,7974,0.030099,3.0%
Bus,4984,0.018813,1.9%
Tractor Truck Diesel,2637,0.009954,1.0%
E-Bike,2435,0.009191,0.9%
Motorcycle,2131,0.008044,0.8%


# Visualization of collisions with fatalities in 2021

In [42]:
# Boolean mask: crash happened in 2021 and at least one person was killed
fatal_2021_cond = dff['year'].eq(2021) & dff['number_of_persons_killed'].ge(1)

In [43]:
# Rows of dff selected by the fatal-2021 mask
df_2021_p_killed = dff.loc[fatal_2021_cond]

In [44]:
df_2021_p_killed.head()

Unnamed: 0,crash_date,crash_time,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,...,zip_code,latitude,longitude,cross_street_name,on_street_name,number_persons_injured,year,month,day_of_week,hour
151,2021-04-15,15:18,1,0,1,0,0,0,0,Driver Inattention/Distraction,...,11209.0,40.620487,-74.029305,,4 AVENUE,0,2021,4,3,15
1156,2021-04-18,23:56,1,0,1,0,0,0,0,Unspecified,...,,40.770527,-73.98012,,CENTRAL PARK WEST,0,2021,4,6,23
1350,2021-04-08,19:55,1,0,0,0,0,0,0,Driver Inexperience,...,10459.0,40.830307,-73.89873,1281 UNION AVENUE,,0,2021,4,3,19
1650,2021-04-19,19:13,1,0,1,0,0,0,0,Driver Inattention/Distraction,...,10039.0,40.82289,-73.94208,300 WEST 145 STREET,,0,2021,4,0,19
2061,2021-04-22,8:01,1,0,1,0,0,0,0,Failure to Yield Right-of-Way,...,10035.0,40.803698,-73.937912,,EAST 124 STREET,0,2021,4,3,8


In [45]:
# Read the Mapbox token with a context manager so the file handle is closed,
# and strip surrounding whitespace (such as a trailing newline) from the token.
with open(".mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read().strip())

In [46]:
# Map of 2021 crashes with fatalities: marker color encodes the hour,
# marker size the number of persons killed
p_killed_2021_fig = px.scatter_mapbox(
    df_2021_p_killed,
    lat="latitude",
    lon="longitude",
    size="number_of_persons_killed",
    size_max=15,
    color="hour",
    color_continuous_scale=px.colors.cyclical.IceFire,
    mapbox_style="streets",
    zoom=10,
    title="Crash Reports with Fatalities in 2021",
)

In [47]:
p_killed_2021_fig.show()

# Visualization of Collisions Resulting in Persons Injured (2021)

In [48]:
# Boolean mask: crash happened in 2021 and at least one person was injured.
# The column is named 'number_of_persons_injured' in dff; the previous
# 'number_persons_injured' key does not exist and raised KeyError.
df_injured_cond = (dff['year'] == 2021) & (dff['number_of_persons_injured'] >= 1)


In [49]:
# Rows of dff selected by the injured-2021 mask
df_2021_p_injured = dff.loc[df_injured_cond]

In [50]:
# Scatter plot of crashes with injuries (color by hour, size by number of persons injured).
# size must name an existing column: 'number_of_persons_injured'
# (the previous 'number_persons_injured' is not a column of dff).
p_injured_2021_fig = px.scatter_mapbox(df_2021_p_injured,
                                       lat="latitude",
                                       lon="longitude",
                                       color="hour",
                                       size="number_of_persons_injured",
                                       color_continuous_scale=px.colors.cyclical.IceFire,
                                       size_max=15,
                                       zoom=10,
                                       center={"lat": 40.730610, "lon": -73.9749},
                                       title = "Crash Reports with Persons Injured in 2021",)
p_injured_2021_fig.show()

# Visualization of Collisions by Borough

In [41]:
# Location of the NYC borough boundaries GeoJSON file
boro_url = 'https://raw.githubusercontent.com/codeforgermany/click_that_hood/main/public/data/new-york-city-boroughs.geojson'

In [42]:
# Download the borough boundaries. A timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors here instead
# of as a confusing JSON decoding failure.
boro_response = requests.get(boro_url, timeout=60)
boro_response.raise_for_status()
# Parsed GeoJSON; kept under its own name so the cell can be re-run safely
nyc_boro = boro_response.json()

In [43]:
nyc_boro['features'][1]['properties']


{'name': 'Queens',
 'cartodb_id': 2,
 'created_at': '2013-03-09T02:42:03.692Z',
 'updated_at': '2013-03-09T02:42:03.989Z'}

In [44]:
print(dff.borough.unique())

['BROOKLYN' nan 'BRONX' 'STATEN ISLAND' 'QUEENS' 'MANHATTAN']


In [45]:
# Keep only rows with a real borough. Missing boroughs were replaced earlier by
# the 'Not recorded' placeholder, so notna() alone removes nothing; the
# placeholder has no polygon in the borough GeoJSON and would make the
# name-to-id lookup below raise KeyError.
has_borough = dff['borough'].notna() & dff['borough'].str.lower().ne('not recorded')
dff = dff[has_borough].copy()
dff['borough'] = dff['borough'].str.title()

In [46]:
# Give every GeoJSON feature a top-level 'id' and build a borough-name -> id lookup.
# The lookup keys have to match the values in the dataframe's borough column.
nyc_boro_map = {}
for boro_feature in nyc_boro['features']:
    boro_props = boro_feature['properties']
    boro_feature['id'] = boro_props['cartodb_id']
    nyc_boro_map[boro_props['name']] = boro_feature['id']


In [47]:
# Translate each borough name into its GeoJSON feature id (KeyError on an unknown name)
dff['boro_id'] = [nyc_boro_map[boro_name] for boro_name in dff['borough']]

In [48]:
# Total number of persons killed per borough (one row per borough / boro_id pair)
df_boro_gb = dff.groupby(['borough', 'boro_id'], as_index=False)['number_of_persons_killed'].sum()

In [49]:
df_boro_gb

Unnamed: 0,borough,boro_id,number_of_persons_killed
0,Bronx,5,81
1,Brooklyn,3,220
2,Manhattan,4,109
3,Queens,2,189
4,Staten Island,1,28


In [51]:
# Choropleth map of persons killed in crashes, aggregated by borough
boro_fig = px.choropleth_mapbox(
    df_boro_gb,
    geojson=nyc_boro,
    locations="boro_id",
    color="number_of_persons_killed",
    hover_name="borough",
    mapbox_style="carto-positron",
    center={"lat": 40.730610, "lon": -73.9749},
    zoom=8.5,
    opacity=0.5,
    title="NYC Boroughs",
)
# fig.update_layout(
#     title={
#         'text': "location of fatalities",
#         'y':0.9,
#         'x':0.5,
#         'xanchor': 'center',
#         'yanchor': 'top'})

In [52]:
boro_fig.show()

# Visualization of Persons Killed in Collisions per Zip Code (2019 to 2022)

In [49]:
# Location of the NYC zip code tabulation areas GeoJSON file
url = 'https://data.beta.nyc/dataset/3bf5fb73-edb5-4b05-bb29-7c95f4a727fc/resource/6df127b1-6d04-4bb7-b983-07402a2c3f90/download/f4129d9aa6dd4281bc98d0f701629b76nyczipcodetabulationareas.geojson'

In [50]:
# Download the zip code boundaries. A timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors immediately.
nyc_zip = requests.get(url, timeout=60)
nyc_zip.raise_for_status()

In [51]:
# Parse the response body as JSON.
# NOTE: this rebinds nyc_zip from the response object to the parsed data, so
# the cell cannot be run a second time without re-running the download cell.
nyc_zip = nyc_zip.json()

In [52]:
# Geojson file to complete geographical features
nyc_zip['features'][1]['properties']

{'OBJECTID': 2,
 'postalCode': '11004',
 'PO_NAME': 'Glen Oaks',
 'STATE': 'NY',
 'borough': 'Queens',
 'ST_FIPS': '36',
 'CTY_FIPS': '081',
 'BLDGpostal': 0,
 '@id': 'http://nyc.pediacities.com/Resource/PostalCode/11004',
 'longitude': -73.711608312,
 'latitude': 40.745365835}

In [53]:
# Give every GeoJSON feature a top-level 'id' and build a postal-code -> id lookup
# used to link the dataframe rows to the GeoJSON features
zip_id_map = {}
for zip_feature in nyc_zip['features']:
    zip_props = zip_feature['properties']
    zip_feature['id'] = zip_props['OBJECTID']
    zip_id_map[zip_props['postalCode']] = zip_feature['id']

In [61]:
# Make sure dictionary key and corresponding dataframe value are of the same type
#zip_id_map


In [54]:
# Drop rows whose zip_code is missing
dff = dff.dropna(subset=['zip_code'])

In [55]:
# Filter out zip codes that cannot be linked to the GeoJSON file.
# The hard-coded list (known unmatched codes and the string 'nan') is still
# excluded, and in addition only codes present in zip_id_map are kept, so the
# zip_id_map[...] lookups in the following cells cannot raise KeyError when the
# data contains an unmatched code that is not in the list.
zipCodes = ['11249','10000','11695','nan','10179']
has_boundary = dff['zip_code'].isin(set(zip_id_map))
dff = dff[has_boundary & ~dff['zip_code'].isin(zipCodes)]

In [56]:
# Translate each zip code into its GeoJSON feature id (KeyError on an unknown code)
dff['id'] = [zip_id_map[zip_code] for zip_code in dff['zip_code']]

In [57]:
# Total number of persons killed per zip code
df_p_killed_gb = dff.groupby('zip_code', as_index=False)['number_of_persons_killed'].sum()

In [58]:
# Attach the GeoJSON feature id to each aggregated zip code row
df_p_killed_gb['id'] = [zip_id_map[zip_code] for zip_code in df_p_killed_gb['zip_code']]

In [61]:
df_p_killed_gb.columns

Index(['zip_code', 'number_of_persons_killed', 'id'], dtype='object')

### Choropleth map collision fatalities

In [62]:
# Choropleth map over zip code boundaries, colored by number of persons killed
zip_fig = px.choropleth_mapbox(
    df_p_killed_gb,
    geojson=nyc_zip,
    locations="id",
    color="number_of_persons_killed",
    color_continuous_scale=list(reversed(px.colors.sequential.Inferno)),
    hover_name="zip_code",
    mapbox_style="carto-positron",
    center={"lat": 40.730610, "lon": -73.9749},
    zoom=8.5,
    opacity=0.5,
    title="Zip Code Boundaries (Number of Persons Killed from 2019 to 2022)",
)

In [63]:
zip_fig.show()

# Distribution of Collisions per Time Periods

In [45]:
# Make sure month is numeric. pd.to_numeric is applied to the whole Series in
# one call instead of being invoked once per element through .apply.
dff['month'] = pd.to_numeric(dff['month'])

In [19]:
dff.columns

Index(['crash_date', 'crash_time', 'number_of_persons_killed',
       'number_of_pedestrians_injured', 'number_of_pedestrians_killed',
       'number_of_cyclist_injured', 'number_of_cyclist_killed',
       'number_of_motorist_injured', 'number_of_motorist_killed',
       'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
       'collision_id', 'vehicle_type_code1', 'contributing_factor_vehicle_1',
       'vehicle_type_code2', 'contributing_factor_vehicle_2', 'borough',
       'zip_code', 'latitude', 'longitude', 'cross_street_name',
       'on_street_name', 'number_persons_injured', 'year', 'month',
       'day_of_week', 'hour'],
      dtype='object')

In [20]:
# Crashes recorded in 2021
df_2021 = dff.loc[dff["year"].eq(2021)]

In [None]:
#df_gb= df.groupby(['Month','susp_sex','boro_nm'],as_index=False)['Complaint Count'].count()

In [46]:
# Number of collisions per (month, year); the count ends up in the 'collision_id' column
accidents_months = dff.groupby(['month', 'year'])['collision_id'].count().reset_index()

In [41]:
#df = df.sort_values(by='Value', ascending=False)
#accidents_months = accidents_months.sort_values(by = "month")

In [47]:
accidents_months.dtypes

month           int64
year            int64
collision_id    int64
dtype: object

In [None]:
# fig = px.line(df, x="year", y="lifeExp", color='country')
# fig.show()

In [48]:
# One line per year showing the number of collisions in each month.
# 'collision_id' holds a count after the groupby above, so the axis is
# relabelled, and the figure gets a title so it can be read on its own.
line_fig = px.line(
    accidents_months,
    x='month',
    y='collision_id',
    color='year',
    labels={'month': 'Month', 'collision_id': 'Number of collisions', 'year': 'Year'},
    title='Monthly Collisions by Year',
)

In [49]:
line_fig.show()

In [31]:
# barchart = px.bar(
#         accidents_months,
#         x='month',
#         y='collision_id',
#         #color="susp_sex",
#         opacity=0.9,                  # set opacity of markers (from 0 to 1)
#         orientation="v",              # 'v','h': orientation of the marks
#         #barmode='group',
#         template='gridon',
#         #labels={"susp_sex":"Suspect Gender"},
#         #title=("Monthly Reports in %s" % boro_selected)
# )
#     barchart.update_layout(title={'xanchor': 'center', 'yanchor': 'top', 'y': 0.9, 'x': 0.5,},
#                            legend = dict(orientation= "h",
#                            yanchor="bottom",
#                            y=1.02,
#                            xanchor="right",
#                            x=1),
#                           )

In [32]:
#barchart.show()