# Imports

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(".."))
from utils import create_geometry_column, find_similar_phrases, normalize_street_names, map_vehicle_type
from mapping import raw_mapping, generalized_mapping

# Data load

In [2]:
# Loading data
df = pd.read_csv("../data/nypd-motor-vehicle-collisions.csv", low_memory=False)

# Dataset information

In [3]:
df.shape

(1612178, 29)

In [4]:
df.columns

Index(['ACCIDENT DATE', 'ACCIDENT TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,ACCIDENT DATE,ACCIDENT TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2019-08-05T00:00:00.000,16:30,QUEENS,11434,40.676052,-73.790184,"{'type': 'Point', 'coordinates': [-73.790184, ...",,,150-08 123 AVENUE,...,Unspecified,,,,4184637,Sedan,Pick-up Truck,,,
1,2019-08-27T00:00:00.000,16:02,BROOKLYN,11225,40.65778,-73.951096,"{'type': 'Point', 'coordinates': [-73.951096, ...",,,288 HAWTHORNE STREET,...,Unspecified,,,,4195773,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
2,2019-08-15T00:00:00.000,17:57,MANHATTAN,10002,40.718143,-73.993835,"{'type': 'Point', 'coordinates': [-73.993835, ...",CHRYSTIE STREET,GRAND STREET,,...,,,,,4202457,Sedan,,,,
3,2019-08-30T00:00:00.000,21:53,BRONX,10460,40.840534,-73.86661,"{'type': 'Point', 'coordinates': [-73.86661, 4...",,,1837 EAST TREMONT AVENUE,...,Unspecified,,,,4198749,Taxi,Station Wagon/Sport Utility Vehicle,,,
4,2019-08-06T00:00:00.000,9:45,MANHATTAN,10016,40.74544,-73.9754,"{'type': 'Point', 'coordinates': [-73.9754, 40...",EAST 35 STREET,2 AVENUE,,...,Driver Inattention/Distraction,,,,4183798,Station Wagon/Sport Utility Vehicle,Bike,,,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1612178 entries, 0 to 1612177
Data columns (total 29 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   ACCIDENT DATE                  1612178 non-null  object 
 1   ACCIDENT TIME                  1612178 non-null  object 
 2   BOROUGH                        1127553 non-null  object 
 3   ZIP CODE                       1127376 non-null  object 
 4   LATITUDE                       1415893 non-null  float64
 5   LONGITUDE                      1415893 non-null  float64
 6   LOCATION                       1415893 non-null  object 
 7   ON STREET NAME                 1298002 non-null  object 
 8   CROSS STREET NAME              1079193 non-null  object 
 9   OFF STREET NAME                219732 non-null   object 
 10  NUMBER OF PERSONS INJURED      1612161 non-null  float64
 11  NUMBER OF PERSONS KILLED       1612145 non-null  float64
 12  NUMBER OF PEDE

In [7]:
df.describe()

Unnamed: 0,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,COLLISION_ID
count,1415893.0,1415893.0,1612161.0,1612145.0,1612178.0,1612178.0,1612178.0,1612178.0,1612178.0,1612178.0,1612178.0
mean,40.68864,-73.86657,0.2631363,0.001185998,0.05060483,0.0006302034,0.0209859,9.242156e-05,0.1916854,0.0004633483,2765946.0
std,1.200403,2.43864,0.6584832,0.03644859,0.2316972,0.02577872,0.1445222,0.009677481,0.6206497,0.02334547,1506373.0
min,0.0,-201.2371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0
25%,40.66882,-73.97746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1014464.0
50%,40.72258,-73.93002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3422826.0
75%,40.76789,-73.86727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3854210.0
max,42.31832,0.0,31.0,8.0,27.0,6.0,4.0,2.0,31.0,5.0,4249104.0


In [8]:
duples = df.duplicated().sum()
print(f"Given dataset has {duples} duplicates")

Given dataset has 394221 duplicates


# Data Cleaning

In [9]:
df_cleaning = df.copy()

## Datetime

In [10]:
df_cleaning["ACCIDENT DATE"]

0          2019-08-05T00:00:00.000
1          2019-08-27T00:00:00.000
2          2019-08-15T00:00:00.000
3          2019-08-30T00:00:00.000
4          2019-08-06T00:00:00.000
                    ...           
1612173    2012-07-21T00:00:00.000
1612174    2012-07-09T00:00:00.000
1612175    2012-07-09T00:00:00.000
1612176    2012-07-18T00:00:00.000
1612177    2012-07-12T00:00:00.000
Name: ACCIDENT DATE, Length: 1612178, dtype: object

In [11]:
df_cleaning["ACCIDENT DATE"] = pd.to_datetime(df_cleaning["ACCIDENT DATE"], format="%Y-%m-%dT%H:%M:%S.%f")
df_cleaning["ACCIDENT DATE FORMATTED"] = df_cleaning["ACCIDENT DATE"].dt.strftime("%d/%m/%Y")
df_cleaning["ACCIDENT DATE"] = df_cleaning["ACCIDENT DATE"].dt.normalize()

In [12]:
df_cleaning["ACCIDENT DATE"].head(5)

0   2019-08-05
1   2019-08-27
2   2019-08-15
3   2019-08-30
4   2019-08-06
Name: ACCIDENT DATE, dtype: datetime64[ns]

In [13]:
df_cleaning["ACCIDENT TIME"].head(5)

0    16:30
1    16:02
2    17:57
3    21:53
4     9:45
Name: ACCIDENT TIME, dtype: object

## Missing boroughs, latitude, longitude and location

In [14]:
df_cleaning["BOROUGH"].unique()

array(['QUEENS', 'BROOKLYN', 'MANHATTAN', 'BRONX', nan, 'STATEN ISLAND'],
      dtype=object)

In [15]:
# Check how many boroughs are missing
missing_boroughs = df_cleaning["BOROUGH"].isna().mean()
print(f"Missing boroughs {round(missing_boroughs*100,2)} %")

Missing boroughs 30.06 %


In [16]:
df_cleaning[["LATITUDE", "LONGITUDE"]].isna().sum()

LATITUDE     196285
LONGITUDE    196285
dtype: int64

In [17]:
# Check if both - latitude and longitude is missing in observation
missing_count = df_cleaning[df_cleaning["LATITUDE"].isna() & df_cleaning["LONGITUDE"].isna()].shape[0]
print(f"Missing both latitude and longitude: {missing_count}")

Missing both latitude and longitude: 196285


## Match coordinates with boroughs from geojson

### Create geometry df

In [18]:
geometry_df = df_cleaning.dropna(subset=["LONGITUDE", "LATITUDE"]).copy()
geometry_df = geometry_df[["LONGITUDE", "LATITUDE", "BOROUGH"]]
# create a column with original index as the original one will be replaced after sjoin
geometry_df["ORIGINAL INDEX"] = geometry_df.index
geometry_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1415893 entries, 0 to 1612177
Data columns (total 4 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   LONGITUDE       1415893 non-null  float64
 1   LATITUDE        1415893 non-null  float64
 2   BOROUGH         1099516 non-null  object 
 3   ORIGINAL INDEX  1415893 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 54.0+ MB


### Create geometry column

In [19]:
create_geometry_column(geometry_df)
geometry_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1415893 entries, 0 to 1612177
Data columns (total 5 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   LONGITUDE       1415893 non-null  float64
 1   LATITUDE        1415893 non-null  float64
 2   BOROUGH         1099516 non-null  object 
 3   ORIGINAL INDEX  1415893 non-null  int64  
 4   GEOMETRY        1415893 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 64.8+ MB


### Convert geometry_df into GeoDataFrame

In [20]:
geometry_gdf = gpd.GeoDataFrame(geometry_df, geometry="GEOMETRY", crs="EPSG:4326")

### Load boroughs from shapefile

In [21]:
# Load borough boundaries
# https://www.nyc.gov/content/planning/pages/resources/datasets/borough-boundaries
boroughs_gdf = gpd.read_file("../data/nybb.shp")

In [22]:
print(boroughs_gdf.columns)
print(boroughs_gdf.crs)

Index(['BoroCode', 'BoroName', 'Shape_Leng', 'Shape_Area', 'geometry'], dtype='object')
EPSG:2263


In [23]:
boroughs_gdf = boroughs_gdf.to_crs(epsg=4326)

### Spatial join

In [24]:
print(boroughs_gdf.columns)

Index(['BoroCode', 'BoroName', 'Shape_Leng', 'Shape_Area', 'geometry'], dtype='object')


In [25]:
geometry_gdf_joined = gpd.sjoin(geometry_gdf, boroughs_gdf[["BoroName", "geometry"]], how = "left", predicate = "intersects")

In [26]:
print(geometry_gdf_joined["BoroName"].unique())

['Queens' 'Brooklyn' 'Manhattan' 'Bronx' 'Staten Island' nan]


In [27]:
geometry_gdf_joined["BoroName"].isna().sum()

np.int64(9256)

In [28]:
geometry_gdf_joined["BoroName"] = geometry_gdf_joined["BoroName"].fillna("UNKNOWN")

In [29]:
geometry_gdf_joined["BoroName"].unique()

array(['Queens', 'Brooklyn', 'Manhattan', 'Bronx', 'Staten Island',
       'UNKNOWN'], dtype=object)

In [30]:
geometry_gdf_joined.head(5)

Unnamed: 0,LONGITUDE,LATITUDE,BOROUGH,ORIGINAL INDEX,GEOMETRY,index_right,BoroName
0,-73.790184,40.676052,QUEENS,0,POINT (-73.79018 40.67605),2.0,Queens
1,-73.951096,40.65778,BROOKLYN,1,POINT (-73.9511 40.65778),4.0,Brooklyn
2,-73.993835,40.718143,MANHATTAN,2,POINT (-73.99384 40.71814),3.0,Manhattan
3,-73.86661,40.840534,BRONX,3,POINT (-73.86661 40.84053),1.0,Bronx
4,-73.9754,40.74544,MANHATTAN,4,POINT (-73.9754 40.74544),3.0,Manhattan


### Fill NaN with mapped boroughs

In [31]:
df_cleaning.loc[geometry_gdf_joined["ORIGINAL INDEX"], "BOROUGH"] = df_cleaning.loc[
    geometry_gdf_joined["ORIGINAL INDEX"], "BOROUGH"
    ].combine_first(geometry_gdf_joined.set_index("ORIGINAL INDEX")["BoroName"])

In [32]:
df_cleaning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1612178 entries, 0 to 1612177
Data columns (total 30 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   ACCIDENT DATE                  1612178 non-null  datetime64[ns]
 1   ACCIDENT TIME                  1612178 non-null  object        
 2   BOROUGH                        1443930 non-null  object        
 3   ZIP CODE                       1127376 non-null  object        
 4   LATITUDE                       1415893 non-null  float64       
 5   LONGITUDE                      1415893 non-null  float64       
 6   LOCATION                       1415893 non-null  object        
 7   ON STREET NAME                 1298002 non-null  object        
 8   CROSS STREET NAME              1079193 non-null  object        
 9   OFF STREET NAME                219732 non-null   object        
 10  NUMBER OF PERSONS INJURED      1612161 non-null  float

In [33]:
df_cleaning['BOROUGH'].unique()

array(['QUEENS', 'BROOKLYN', 'MANHATTAN', 'BRONX', 'Bronx', 'Queens',
       'Brooklyn', 'STATEN ISLAND', nan, 'Manhattan', 'Staten Island',
       'UNKNOWN'], dtype=object)

In [34]:
df_cleaning["BOROUGH"] = df_cleaning["BOROUGH"].str.upper()

## Label NaN as "UNKNOWN"

In [35]:
cols_to_fill = ["BOROUGH","LATITUDE","LONGITUDE","LOCATION"]
df_cleaning[cols_to_fill] = df_cleaning[cols_to_fill].apply(lambda col: col.fillna("UNKNOWN"))

In [36]:
df_cleaning['BOROUGH'].unique()

array(['QUEENS', 'BROOKLYN', 'MANHATTAN', 'BRONX', 'STATEN ISLAND',
       'UNKNOWN'], dtype=object)

In [37]:
df_cleaning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1612178 entries, 0 to 1612177
Data columns (total 30 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   ACCIDENT DATE                  1612178 non-null  datetime64[ns]
 1   ACCIDENT TIME                  1612178 non-null  object        
 2   BOROUGH                        1612178 non-null  object        
 3   ZIP CODE                       1127376 non-null  object        
 4   LATITUDE                       1612178 non-null  object        
 5   LONGITUDE                      1612178 non-null  object        
 6   LOCATION                       1612178 non-null  object        
 7   ON STREET NAME                 1298002 non-null  object        
 8   CROSS STREET NAME              1079193 non-null  object        
 9   OFF STREET NAME                219732 non-null   object        
 10  NUMBER OF PERSONS INJURED      1612161 non-null  float

## Number of persons

In [38]:
df_cleaning["NUMBER OF PERSONS INJURED"].unique()

array([ 0.,  1.,  3.,  2.,  4.,  7.,  5.,  6.,  8.,  9., nan, 11., 27.,
       13., 10., 12., 17., 14., 15., 16., 20., 22., 31., 19., 18., 24.])

In [39]:
df_cleaning["NUMBER OF PERSONS INJURED"] = df_cleaning["NUMBER OF PERSONS INJURED"].fillna(0)

In [40]:
df_cleaning["NUMBER OF PERSONS INJURED"].unique()

array([ 0.,  1.,  3.,  2.,  4.,  7.,  5.,  6.,  8.,  9., 11., 27., 13.,
       10., 12., 17., 14., 15., 16., 20., 22., 31., 19., 18., 24.])

In [41]:
df_cleaning["NUMBER OF PERSONS KILLED"].unique()

array([ 0.,  1.,  2., nan,  8.,  4.,  5.,  3.])

In [42]:
df_cleaning["NUMBER OF PERSONS KILLED"] = df_cleaning["NUMBER OF PERSONS KILLED"].fillna(0)

In [43]:
df_cleaning[["NUMBER OF PERSONS INJURED","NUMBER OF PERSONS KILLED"]] = df_cleaning[["NUMBER OF PERSONS INJURED","NUMBER OF PERSONS KILLED"]].astype(int)

In [44]:
df_cleaning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1612178 entries, 0 to 1612177
Data columns (total 30 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   ACCIDENT DATE                  1612178 non-null  datetime64[ns]
 1   ACCIDENT TIME                  1612178 non-null  object        
 2   BOROUGH                        1612178 non-null  object        
 3   ZIP CODE                       1127376 non-null  object        
 4   LATITUDE                       1612178 non-null  object        
 5   LONGITUDE                      1612178 non-null  object        
 6   LOCATION                       1612178 non-null  object        
 7   ON STREET NAME                 1298002 non-null  object        
 8   CROSS STREET NAME              1079193 non-null  object        
 9   OFF STREET NAME                219732 non-null   object        
 10  NUMBER OF PERSONS INJURED      1612178 non-null  int64

In [45]:
df_cleaning["NUMBER OF MOTORIST KILLED"].unique()

array([0, 1, 2, 5, 3, 4])

## String formatting

In [46]:
cols_to_format = ["CONTRIBUTING FACTOR VEHICLE 1", "CONTRIBUTING FACTOR VEHICLE 2", "CONTRIBUTING FACTOR VEHICLE 3",
                     "CONTRIBUTING FACTOR VEHICLE 4", "CONTRIBUTING FACTOR VEHICLE 5", "VEHICLE TYPE CODE 1", 
                     "VEHICLE TYPE CODE 2","VEHICLE TYPE CODE 3","VEHICLE TYPE CODE 4","VEHICLE TYPE CODE 5",
                 "ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME"]
df_cleaning[cols_to_format] = df_cleaning[cols_to_format].fillna("unknown")
df_cleaning[cols_to_format] = df_cleaning[cols_to_format].astype(str).apply(
    lambda col: col.str.strip().str.lower().replace(["unspecified",""], "unknown"))

## Streets

In [47]:
on_street_typos = find_similar_phrases(df_cleaning["ON STREET NAME"], 70)

In [48]:
streets_to_format = ["ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME"]
# Normalization of common street abbreviations
df_cleaning[streets_to_format] = df_cleaning[streets_to_format].apply(
    lambda x: x.apply(normalize_street_names))

## Contributing factors

In [49]:
cfv_typos_cols_1 = find_similar_phrases(df["CONTRIBUTING FACTOR VEHICLE 1"])
cfv_typos_cols_2 = find_similar_phrases(df["CONTRIBUTING FACTOR VEHICLE 2"])
cfv_typos_cols_3 = find_similar_phrases(df["CONTRIBUTING FACTOR VEHICLE 3"])
cfv_typos_cols_4 = find_similar_phrases(df["CONTRIBUTING FACTOR VEHICLE 4"])
cfv_typos_cols_5 = find_similar_phrases(df["CONTRIBUTING FACTOR VEHICLE 5"])

In [50]:
print(f"CFV 1 typos {cfv_typos_cols_1}")
print(f"CFV 2 typos {cfv_typos_cols_2}")
print(f"CFV 3 typos {cfv_typos_cols_3}")
print(f"CFV 4 typos {cfv_typos_cols_4}")
print(f"CFV 5 typos {cfv_typos_cols_5}")

CFV 1 typos {'Reaction to Uninvolved Vehicle': ['Reaction to Other Uninvolved Vehicle'], 'Drugs (illegal)': ['Drugs (Illegal)'], 'Illnes': ['Illness'], 'Cell Phone (hand-Held)': ['Cell Phone (hand-held)'], 'Reaction to Other Uninvolved Vehicle': ['Reaction to Uninvolved Vehicle'], 'Drugs (Illegal)': ['Drugs (illegal)'], 'Illness': ['Illnes'], 'Cell Phone (hand-held)': ['Cell Phone (hand-Held)']}
CFV 2 typos {'Reaction to Uninvolved Vehicle': ['Reaction to Other Uninvolved Vehicle'], 'Illnes': ['Illness'], 'Cell Phone (hand-Held)': ['Cell Phone (hand-held)'], 'Drugs (illegal)': ['Drugs (Illegal)'], 'Reaction to Other Uninvolved Vehicle': ['Reaction to Uninvolved Vehicle'], 'Illness': ['Illnes'], 'Drugs (Illegal)': ['Drugs (illegal)'], 'Cell Phone (hand-held)': ['Cell Phone (hand-Held)']}
CFV 3 typos {'Reaction to Uninvolved Vehicle': ['Reaction to Other Uninvolved Vehicle'], 'Drugs (illegal)': ['Drugs (Illegal)'], 'Illnes': ['Illness'], 'Illness': ['Illnes'], 'Drugs (Illegal)': ['Drugs 

### Corrections directory

In [51]:
df_cleaning["CONTRIBUTING FACTOR VEHICLE 1"].unique()

array(['passing too closely', 'driver inattention/distraction', 'unknown',
       'unsafe speed', 'reaction to uninvolved vehicle',
       'following too closely', 'passing or lane usage improper',
       'view obstructed/limited', 'unsafe lane changing',
       'aggressive driving/road rage', 'other vehicular',
       'driverless/runaway vehicle', 'turning improperly',
       'failure to yield right-of-way', 'backing unsafely',
       'brakes defective', 'drugs (illegal)', 'pavement defective',
       'traffic control disregarded', 'lane marking improper/inadequate',
       'tire failure/inadequate', 'failure to keep right',
       'obstruction/debris', 'outside car distraction',
       'alcohol involvement', 'driver inexperience', 'pavement slippery',
       'pedestrian/bicyclist/other pedestrian error/confusion',
       'using on board navigation device', 'lost consciousness',
       'passenger distraction', 'oversized vehicle', 'fatigued/drowsy',
       'illnes', 'fell asleep', 'st

In [52]:
corrections_cfv = {"reaction to other uninvolved vehicle":"reaction to uninvolved vehicle", "illnes":"illness", "nan":"unknown", 
                  "80": "unknown", "1": "unknown"}

In [53]:
cfv_cols_to_correct = ["CONTRIBUTING FACTOR VEHICLE 1", "CONTRIBUTING FACTOR VEHICLE 2", "CONTRIBUTING FACTOR VEHICLE 3",
                     "CONTRIBUTING FACTOR VEHICLE 4", "CONTRIBUTING FACTOR VEHICLE 5"]
df_cleaning[cfv_cols_to_correct] = df_cleaning[cfv_cols_to_correct].replace(corrections_cfv)

In [54]:
cfv_typos_cols_1_cleaned = find_similar_phrases(df_cleaning["CONTRIBUTING FACTOR VEHICLE 1"])
print(cfv_typos_cols_1_cleaned)

{}


## Vehicle types

In [55]:
df_cleaning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1612178 entries, 0 to 1612177
Data columns (total 30 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   ACCIDENT DATE                  1612178 non-null  datetime64[ns]
 1   ACCIDENT TIME                  1612178 non-null  object        
 2   BOROUGH                        1612178 non-null  object        
 3   ZIP CODE                       1127376 non-null  object        
 4   LATITUDE                       1612178 non-null  object        
 5   LONGITUDE                      1612178 non-null  object        
 6   LOCATION                       1612178 non-null  object        
 7   ON STREET NAME                 1612178 non-null  object        
 8   CROSS STREET NAME              1612178 non-null  object        
 9   OFF STREET NAME                1612178 non-null  object        
 10  NUMBER OF PERSONS INJURED      1612178 non-null  int64

In [56]:
# df["VEHICLE TYPE CODE 1"].unique()
vtc1 = find_similar_phrases(df_cleaning["VEHICLE TYPE CODE 1"], 80)
vtc2 = find_similar_phrases(df_cleaning["VEHICLE TYPE CODE 2"])
vtc3 = find_similar_phrases(df_cleaning["VEHICLE TYPE CODE 3"])
vtc4 = find_similar_phrases(df_cleaning["VEHICLE TYPE CODE 4"])
vtc5 = find_similar_phrases(df_cleaning["VEHICLE TYPE CODE 5"])

In [57]:
df_cleaning["VEHICLE TYPE CODE 1"].unique()

array(['sedan', 'station wagon/sport utility vehicle', 'taxi',
       'motorcycle', 'pick-up truck', 'bus', 'box truck', 'bike',
       'tractor truck diesel', 'passenger vehicle', 'van',
       'refrigerated van', 'unknown', 'e-bik', 'garbage or refuse',
       'e-sco', 'dump', 'lift boom', 'ambul', 'ambulance', 'convertible',
       'pick-', 'school bus', '35 ft', 'flat bed', 'fire', 'carry all',
       'fork', 'flat rack', 'polic', 'deliv', 'tractor truck gasoline',
       'tow truck / wrecker', 'scoot', 'pk', 'motorscooter', 'tow t',
       'sport utility / station wagon', 'chassis cab', 'moped',
       'bulk agriculture', 'boxtr', 'motorbike', 'tanker', 'comme',
       'stake or rack', 'mini', 'refg', 'e bik', 'mack', 'lunch wagon',
       'trail', 'g spc', 'fdny', '3-door', 'trlr',
       'pickup with mounted camper', 'beverage truck', 'van t', 'cont',
       'motorized home', 'city', 'armored truck', 'power', 'vespa',
       'van camper', 'limo', 'tract', 'util', 'forkl', 'prks'

In [58]:
print(f"VTC 1 typos {vtc1}")

VTC 1 typos {'bus': ['bu', 'bs'], 'bike': ['e bik', 'ebike'], 'van': ['vang', 'vn'], 'unknown': ['unkno', 'unkow'], 'e-bik': ['ebike', 'e/bik'], 'e-sco': ['e sco'], 'dump': ['dumps', 'dumpt'], 'ambul': ['ambu', 'amabu'], 'pick-': ['pick', 'picku'], 'fire': ['firet'], 'fork': ['forkl', 'fork-'], 'deliv': ['delv', 'delvi'], 'scoot': ['scoo', 'scooter', 'schoo'], 'moped': ['mopd', 'mopad'], 'boxtr': ['box t'], 'tanker': ['tanke', 'tank'], 'comme': ['comm', 'comer', 'commm'], 'e bik': ['bike', 'ebike', 'e- bi'], 'g spc': ['spc p'], 'fdny': ['fd ny'], 'trlr': ['trl'], 'van t': ['van ('], 'cont': ['const'], 'motorized home': ['motor home'], 'limo': ['limou'], 'tract': ['trac', 'track', 'trac.'], 'util': ['utili'], 'forkl': ['fork', 'fork-'], 'tk': ['trk'], 'ltr': ['ltrl', 'tr'], 'wineb': ['winne'], 'ambu': ['ambul', 'amabu', 'amb'], 'box t': ['boxtr'], 'nys a': ['ns am', 'nyc a'], 'elect': ['elec.'], 'unkno': ['unknown', 'unkow'], '4 dr sedan': ['2 dr sedan'], 'schoo': ['scoo', 'scoot'], 'ut

### Vehicle type correction using mapping utils.py

In [59]:
type_cols_to_correct = ["VEHICLE TYPE CODE 1", "VEHICLE TYPE CODE 2", "VEHICLE TYPE CODE 3", "VEHICLE TYPE CODE 4", "VEHICLE TYPE CODE 5"]

In [60]:
corrected_types_df = map_vehicle_type(
    df_cleaning,
    type_cols_to_correct,
    generalized=True,
    raw_mapping=raw_mapping,
    generalized_mapping=generalized_mapping
)

In [61]:
corrected_types_df["VEHICLE TYPE CODE 1"].unique()

array(['unknown', 'suv', 'taxi / livery', 'motorcycle', 'pickup', 'truck',
       'car', 'van', 'emergency / government',
       'construction / industrial', 'commercial / special purpose',
       'rv / motorhome', 'bus', 'other'], dtype=object)

In [62]:
def check_dict(list, dict):
    not_included = []
    for i in list: 
        if not i in (dict):
            not_included.append(i)
    return not_included

In [70]:
print(check_dict(corrected_types_df["VEHICLE TYPE CODE 1"].unique(), generalized_mapping))

[]


In [72]:
corrected_types_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1612178 entries, 0 to 1612177
Data columns (total 30 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   ACCIDENT DATE                  1612178 non-null  datetime64[ns]
 1   ACCIDENT TIME                  1612178 non-null  object        
 2   BOROUGH                        1612178 non-null  object        
 3   ZIP CODE                       1127376 non-null  object        
 4   LATITUDE                       1612178 non-null  object        
 5   LONGITUDE                      1612178 non-null  object        
 6   LOCATION                       1612178 non-null  object        
 7   ON STREET NAME                 1612178 non-null  object        
 8   CROSS STREET NAME              1612178 non-null  object        
 9   OFF STREET NAME                1612178 non-null  object        
 10  NUMBER OF PERSONS INJURED      1612178 non-null  int64

## Duplicates

In [73]:
corrected_types_df.duplicated().sum()

np.int64(394221)

In [74]:
corrected_types_df = corrected_types_df.drop_duplicates()

In [75]:
corrected_types_df.duplicated().sum()

np.int64(0)

## Export data

In [76]:
corrected_types_df.to_csv("../data/nypd-motor-vehicle-collisions-cleaned.csv", index=False)