In [109]:
import pandas as pd
import numpy as np
from datetime import datetime

In [110]:
df = pd.read_csv("../data/raw/Motor_Vehicle_Collisions_-_Crashes.csv")

# drop useless columns
df = df.drop(columns=["ZIP CODE", "LOCATION", "ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME", "COLLISION_ID"])

# convert "CRASH DATE" from object to datetime
df[["CRASH DATE"]] = pd.to_datetime(df[["CRASH DATE"]].stack()).unstack()

  df = pd.read_csv("../data/raw/Motor_Vehicle_Collisions_-_Crashes.csv")


In [111]:
# so that filtering out non-2019 crashes is a lot easier
df = df[df["CRASH DATE"] > datetime(2018, 12, 31)]
df = df[df["CRASH DATE"] < datetime(2020, 1, 1)]

# the time of the crash needs to be converted to a consistent format (e.g. 2:10 should be 02:10)
# the "CRASH TIME" column is located at index 1
for i in range(df.shape[0]):
    time = df.iat[i, 1]
    if len(time) == 4:
        time = f"0{time}"
        df.iat[i, 1] = time

df.head()


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,...,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
20,2019-05-21,22:50,BROOKLYN,40.69754,-73.98312,0.0,0.0,0,0,0,...,Passing or Lane Usage Improper,Unspecified,,,,�MBU,Taxi,,,
1059,2019-04-17,00:49,,40.651974,-73.86542,3.0,0.0,0,0,0,...,Following Too Closely,Unspecified,,,,Station Wagon/Sport Utility Vehicle,Sedan,,,
20389,2019-07-22,08:20,BROOKLYN,40.615433,-73.91388,0.0,0.0,0,0,0,...,Failure to Yield Right-of-Way,Failure to Yield Right-of-Way,,,,Sedan,Sedan,,,
33658,2019-10-19,17:20,,,,0.0,0.0,0,0,0,...,Unsafe Lane Changing,Unspecified,,,,Sedan,Sedan,,,
177966,2019-12-18,19:45,,,,1.0,0.0,1,0,0,...,Failure to Yield Right-of-Way,,,,,Sedan,,,,


In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211486 entries, 20 to 513118
Data columns (total 23 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   CRASH DATE                     211486 non-null  datetime64[ns]
 1   CRASH TIME                     211486 non-null  object        
 2   BOROUGH                        137024 non-null  object        
 3   LATITUDE                       194114 non-null  float64       
 4   LONGITUDE                      194114 non-null  float64       
 5   NUMBER OF PERSONS INJURED      211486 non-null  float64       
 6   NUMBER OF PERSONS KILLED       211486 non-null  float64       
 7   NUMBER OF PEDESTRIANS INJURED  211486 non-null  int64         
 8   NUMBER OF PEDESTRIANS KILLED   211486 non-null  int64         
 9   NUMBER OF CYCLIST INJURED      211486 non-null  int64         
 10  NUMBER OF CYCLIST KILLED       211486 non-null  int64         
 11 

In [113]:
# I've taken a look at all the distinct values for vehicle type codes, and vehicles involving taxis are labelled "Taxi"
#df["VEHICLE TYPE CODE 1"].unique()

# find out how many crashes involve taxis
total_crashes = df.shape[0]
df = df.loc[(df["VEHICLE TYPE CODE 1"] == "Taxi") | (df["VEHICLE TYPE CODE 2"] == "Taxi")| (df["VEHICLE TYPE CODE 3"] == "Taxi") \
    | (df["VEHICLE TYPE CODE 4"] == "Taxi") | (df["VEHICLE TYPE CODE 5"] == "Taxi")]
taxi_crashes = df.shape[0]
taxi_crashes_perc = f"{round(taxi_crashes / total_crashes * 100, 2)}%"

# find out how likely a taxi trip could end up in a crash
print(taxi_crashes / 84598444 * 100)

df.sample(5)

0.018769848769322518


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,...,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
405041,2019-06-16,05:20,MANHATTAN,40.761234,-73.96389,0.0,0.0,0,0,0,...,Unspecified,,,,,Taxi,,,,
384151,2019-06-24,14:25,,40.737553,-73.85089,0.0,0.0,0,0,0,...,Unsafe Lane Changing,Unspecified,,,,Station Wagon/Sport Utility Vehicle,Taxi,,,
345321,2019-08-23,09:50,,40.729176,-73.87898,0.0,0.0,0,0,0,...,Reaction to Uninvolved Vehicle,Following Too Closely,,,,Taxi,Tractor Truck Diesel,,,
475168,2019-02-23,23:50,MANHATTAN,40.742283,-74.004425,0.0,0.0,0,0,0,...,Driver Inattention/Distraction,Unspecified,,,,Taxi,Taxi,,,
451270,2019-03-16,18:00,,40.74708,-73.98135,0.0,0.0,0,0,0,...,Driver Inattention/Distraction,Unspecified,,,,Sedan,Taxi,,,


In [114]:
# look at how the number of instances changes after filtering out non-taxi crashes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15879 entries, 20 to 512654
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   CRASH DATE                     15879 non-null  datetime64[ns]
 1   CRASH TIME                     15879 non-null  object        
 2   BOROUGH                        10214 non-null  object        
 3   LATITUDE                       14734 non-null  float64       
 4   LONGITUDE                      14734 non-null  float64       
 5   NUMBER OF PERSONS INJURED      15879 non-null  float64       
 6   NUMBER OF PERSONS KILLED       15879 non-null  float64       
 7   NUMBER OF PEDESTRIANS INJURED  15879 non-null  int64         
 8   NUMBER OF PEDESTRIANS KILLED   15879 non-null  int64         
 9   NUMBER OF CYCLIST INJURED      15879 non-null  int64         
 10  NUMBER OF CYCLIST KILLED       15879 non-null  int64         
 11  NUMBER OF MOT

In [115]:
tdf = pd.read_csv("../data/raw/twilight.csv")

# only the civil twilight data will be used. Why? Because although it occurs after sunset/before sunrise, 
# the sky is still bright enough that it doesn't require artificial lighting. When civil ends and nautical twilight begins
# (or the opposite during sunrise), the sky is then dark enough to require artificial lighting.
tdf = tdf.drop(columns=["begin_nau", "end_nau", "begin_astro", "end_astro"])

# convert time to 24hr format
for i in range(tdf.shape[0]):
    # this always ends in AM
    begin = f"0{tdf.iat[i, -2][0:4]}"
    # this always ends in PM
    end = f"{str(int(tdf.iat[i, -1][0]) + 12)}{tdf.iat[i, -2][1:4]}"

    tdf.iat[i, -2] = begin
    tdf.iat[i, -1] = end

tdf

Unnamed: 0,date,begin_civ,end_civ
0,"Tue, January 1",06:49,17:49
1,"Wed, January 2",06:49,17:49
2,"Thu, January 3",06:49,17:49
3,"Fri, January 4",06:49,17:49
4,"Sat, January 5",06:49,17:49
...,...,...,...
360,"Fri, December 27",06:48,17:48
361,"Sat, December 28",06:48,17:48
362,"Sun, December 29",06:48,17:48
363,"Mon, December 30",06:48,17:48


# DATA PREPROCESSING
## 1: Month
## 2: Daylight

In [116]:
# final data frame
fdf = df.iloc[:, [0, 1, 2, 3, 4]].copy().sort_values(by=["CRASH DATE", "CRASH TIME"])
fdf.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE
498225,2019-01-01,00:00,,40.74147,-73.985435
508302,2019-01-01,00:04,BRONX,40.857708,-73.9043
510764,2019-01-01,01:00,,40.699158,-73.927055
500366,2019-01-01,01:10,MANHATTAN,40.73736,-73.99685
512654,2019-01-01,01:10,,,


In [117]:
# from the time data, extract info about whether it is night or day
# AND from the date data, extract info about its month
crash_time = fdf.iloc[:, 1].to_numpy()
daylight_start = tdf.iloc[:, -2].to_numpy()
daylight_end = tdf.iloc[:, -1].to_numpy()

month = []
daylight = []
for i in range(fdf.shape[0]):
    # DAYLIGHT
    # fdf.iloc[i, 0] is the crash date
    # yday = 0 for 1 Jan, or = 1 for 2 Jan, or = 364 for 31 Dec, etc, 
    # which will be used as the index for the twilight data
    yday = fdf.iloc[i, 0].timetuple().tm_yday - 1

    if (crash_time[i] < daylight_start[yday]) or (crash_time[i] > daylight_end[yday]):
        daylight.append("night")
    else:
        daylight.append("day")

    # MONTH
    month.append(fdf.iloc[i, 0].timetuple().tm_mon)

fdf["month"] = month
fdf["daylight"] = daylight

fdf.head(3)

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,month,daylight
498225,2019-01-01,00:00,,40.74147,-73.985435,1,night
508302,2019-01-01,00:04,BRONX,40.857708,-73.9043,1,night
510764,2019-01-01,01:00,,40.699158,-73.927055,1,night


In [118]:
fdf["daylight"].value_counts()

day      9754
night    6125
Name: daylight, dtype: int64

It seems like crashes occur more during the day than it is during the night. However, we have to take into account that the number of vehicles outside is much higher during the day than night.

In [119]:
df["CRASH TIME"].value_counts()

00:00    228
14:00    165
17:00    153
13:00    148
16:00    148
        ... 
04:29      1
03:31      1
03:56      1
04:56      1
02:42      1
Name: CRASH TIME, Length: 1390, dtype: int64

This one is so funny, I guess the police or whoever's responsible filling in the crash time loves a nice even number. I mean, I can't blame them, I love even numbers too.

The outlier seems to be 00:00, which is probably because the crashes whose time are not recorded are automatically allocated to 00:00.

## 3: Sleep

In [120]:
# from the time data, extract info about whether it is during sleeping or waking hour
sleeping_hour = []
for i in range(fdf.shape[0]):
    if (crash_time[i] < "06:00") or (crash_time[i] > "23:00"):
        sleeping_hour.append("yes")
    else:
        sleeping_hour.append("no")

fdf["sleeping_hour"] = sleeping_hour
fdf.sample(3)

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,month,daylight,sleeping_hour
357748,2019-07-25,21:04,MANHATTAN,40.778904,-73.96024,7,night,no
382442,2019-07-06,16:30,BROOKLYN,40.66819,-73.91038,7,day,no
446656,2019-04-03,08:30,QUEENS,40.72136,-73.837036,4,day,no


## 4: Number of victims
## 5: Number of vehicles involved

In [121]:
victims = []
vehicles = []
for i in range(df.shape[0]):
    # the number of death is not a subset of the number of injuries, so these two need to be summed up
    victims.append(int(df.iloc[i, 5] + df.iloc[i, 6]))

    # the number of vehicles involved is how many non-NaN values in the columns "VEHICLE TYPE CODE N" from N = 1 to 5.
    count = 0
    for j in range(-5, 0):
        if not pd.isnull(df.iloc[i, j]):
            count += 1
    vehicles.append(count)
    

fdf["victims"] = victims
fdf["vehicles"] = vehicles

fdf.sample(3)    

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,month,daylight,sleeping_hour,victims,vehicles
489020,2019-02-02,21:30,,40.7374,-73.931915,2,night,no,0,2
306835,2019-09-18,14:35,BRONX,40.85183,-73.91258,9,day,no,1,2
418030,2019-05-18,17:45,MANHATTAN,40.721622,-74.00221,5,day,no,0,2


In [122]:
fdf["victims"].value_counts()

0    12501
1     2557
2      532
3      177
4       66
5       25
6       10
7        5
8        5
9        1
Name: victims, dtype: int64

In [123]:
fdf["vehicles"].value_counts()
fdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15879 entries, 498225 to 221780
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   CRASH DATE     15879 non-null  datetime64[ns]
 1   CRASH TIME     15879 non-null  object        
 2   BOROUGH        10214 non-null  object        
 3   LATITUDE       14734 non-null  float64       
 4   LONGITUDE      14734 non-null  float64       
 5   month          15879 non-null  int64         
 6   daylight       15879 non-null  object        
 7   sleeping_hour  15879 non-null  object        
 8   victims        15879 non-null  int64         
 9   vehicles       15879 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 1.3+ MB


## 6: Location

In [124]:
import geopandas as gpd

# from tute2
sf = gpd.read_file("../data/raw/taxi_zones/taxi_zones.shp")
zones = pd.read_csv("../data/raw/taxi_zones/taxi+_zone_lookup.csv")
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(
    pd.merge(zones, sf, on='LocationID', how='inner')
)

gdf.head(3)

Unnamed: 0,LocationID,Borough,Zone,service_zone,OBJECTID,Shape_Leng,Shape_Area,zone,borough,geometry
0,1,EWR,Newark Airport,EWR,1,0.116357,0.000782,Newark Airport,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.695..."
1,2,Queens,Jamaica Bay,Boro Zone,2,0.43347,0.004866,Jamaica Bay,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone,3,0.084341,0.000314,Allerton/Pelham Gardens,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870..."


In [125]:
# create POINT() geometry for the longitude and latitude of each crash
fdf = gpd.GeoDataFrame(
    fdf, geometry=gpd.points_from_xy(fdf.LONGITUDE, fdf.LATITUDE))
fdf.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,month,daylight,sleeping_hour,victims,vehicles,geometry
498225,2019-01-01,00:00,,40.74147,-73.985435,1,night,yes,0,2,POINT (-73.98543 40.74147)
508302,2019-01-01,00:04,BRONX,40.857708,-73.9043,1,night,yes,0,2,POINT (-73.90430 40.85771)
510764,2019-01-01,01:00,,40.699158,-73.927055,1,night,yes,0,2,POINT (-73.92705 40.69916)
500366,2019-01-01,01:10,MANHATTAN,40.73736,-73.99685,1,night,yes,0,2,POINT (-73.99685 40.73736)
512654,2019-01-01,01:10,,,,1,night,yes,0,2,POINT EMPTY


In [126]:
# match every point to the zone and borough it belongs
gdf = gdf.iloc[:, [-3, -2, -1]]
fdf_new = gpd.sjoin(fdf, gdf, how="left", predicate='within')
fdf_new.head()

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: +proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs + ...

  fdf_new = gpd.sjoin(fdf, gdf, how="left", predicate='within')


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,month,daylight,sleeping_hour,victims,vehicles,geometry,index_right,zone,borough
498225,2019-01-01,00:00,,40.74147,-73.985435,1,night,yes,0,2,POINT (-73.98543 40.74147),233.0,Union Sq,Manhattan
508302,2019-01-01,00:04,BRONX,40.857708,-73.9043,1,night,yes,0,2,POINT (-73.90430 40.85771),93.0,Fordham South,Bronx
510764,2019-01-01,01:00,,40.699158,-73.927055,1,night,yes,0,2,POINT (-73.92705 40.69916),36.0,Bushwick South,Brooklyn
500366,2019-01-01,01:10,MANHATTAN,40.73736,-73.99685,1,night,yes,0,2,POINT (-73.99685 40.73736),248.0,West Village,Manhattan
512654,2019-01-01,01:10,,,,1,night,yes,0,2,POINT EMPTY,,,


In [151]:
# all that's left to do is remove unnecessary variables after the join and a little bit more cleanup.
fdf = fdf_new.drop(columns=["BOROUGH", "LATITUDE", "LONGITUDE", "geometry"])
fdf = fdf.rename(columns={"CRASH DATE": "date", "CRASH TIME": "time", "index_right": "LocationID"})
fdf["LocationID"] = fdf["LocationID"].astype("Int64")
fdf.head()


Unnamed: 0,date,time,month,daylight,sleeping_hour,victims,vehicles,LocationID,zone,borough
498225,2019-01-01,00:00,1,night,yes,0,2,233.0,Union Sq,Manhattan
508302,2019-01-01,00:04,1,night,yes,0,2,93.0,Fordham South,Bronx
510764,2019-01-01,01:00,1,night,yes,0,2,36.0,Bushwick South,Brooklyn
500366,2019-01-01,01:10,1,night,yes,0,2,248.0,West Village,Manhattan
512654,2019-01-01,01:10,1,night,yes,0,2,,,


## Wrapping up

In [153]:
# save fdf to csv
fdf.to_csv("../data/curated/crime.csv", index=False)