In [140]:
import os
import sys

import geopandas as gpd
# pyplot is the plotting API conventionally aliased as `plt`; the previous
# `import matplotlib as plt` bound the top-level package instead.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# utils.py sits one directory above this notebook. Register that directory only
# once so re-running the cell does not keep appending duplicates to sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# NOTE(review): the saved output of this cell is an ImportError for
# normalize_street_names. Either utils.py does not define it yet, or the kernel
# holds a stale copy of utils - confirm and restart the kernel after editing utils.py.
from utils import create_geometry_column, find_similar_phrases, normalize_street_names

ImportError: cannot import name 'normalize_street_names' from 'utils' (C:\Users\igarl\Desktop\DA_Project\utils.py)

# Data load

In [None]:
# Read the raw collisions export. low_memory=False makes pandas parse the file
# in a single pass instead of chunk by chunk, which avoids mixed-type inference.
collisions_csv = "../data/nypd-motor-vehicle-collisions.csv"
df = pd.read_csv(collisions_csv, low_memory=False)

# Dataset information

In [None]:
# Dimensions of the loaded frame as (rows, columns)
print(df.shape)

In [None]:
# Column labels as read from the CSV header
print(df.columns)

In [None]:
# Preview the first five rows (the DataFrame.head default)
df.head()

In [None]:
# Per-column dtypes and non-null counts of the raw data
df.info()

In [None]:
# Summary statistics; by default describe() covers the numeric columns only
df.describe()

# Data Cleaning

## Deleting unnecessary columns

In [None]:
# Remove the ZIP CODE column. drop(..., errors="ignore") replaces `del` so the cell
# stays re-runnable: `del df["ZIP CODE"]` raises KeyError once the column is gone.
df = df.drop(columns=["ZIP CODE"], errors="ignore")

## Datetime

In [None]:
# Look at the raw ACCIDENT DATE values before parsing them
df["ACCIDENT DATE"]

In [None]:
# Parse the ISO-like timestamps once, then derive both output columns from the result.
accident_timestamps = pd.to_datetime(df["ACCIDENT DATE"], format="%Y-%m-%dT%H:%M:%S.%f")

# Day/month/year text rendering of the date
df["ACCIDENT DATE FORMATTED"] = accident_timestamps.dt.strftime("%d/%m/%Y")
# Datetime column with the time-of-day part reset to midnight
df["ACCIDENT DATE"] = accident_timestamps.dt.normalize()

In [None]:
# First five ACCIDENT DATE values, to verify the datetime conversion
df["ACCIDENT DATE"].head(5)

In [None]:
# First five ACCIDENT TIME values, for comparison with the date column
df["ACCIDENT TIME"].head(5)

## Missing boroughs, latitude, longitude and location

In [None]:
# Distinct BOROUGH values; unique() also reports NaN when values are missing
df["BOROUGH"].unique()

In [None]:
# Share of rows with no borough recorded, printed as a percentage
borough_is_missing = df["BOROUGH"].isna()
missing_boroughs = borough_is_missing.mean()
print(f"Missing boroughs {round(missing_boroughs*100,2)} %")

In [None]:
# Number of missing values in each coordinate column
df[["LATITUDE", "LONGITUDE"]].isna().sum()

In [None]:
# Count the observations in which latitude and longitude are both missing
both_coordinates_missing = df["LATITUDE"].isna() & df["LONGITUDE"].isna()
missing_count = len(df.loc[both_coordinates_missing])
print(f"Missing both latitude and longitude: {missing_count}")

## Match coordinates with borough boundaries from a shapefile

### Create geometry df

In [None]:
# Rows that have both coordinates, restricted to the columns the borough lookup needs.
# Select the three columns first and copy last: the copy then covers only the small
# result (not the whole wide frame), and geometry_df is an independent frame, so the
# column assignment below does not trigger SettingWithCopyWarning.
geometry_df = df.dropna(subset=["LONGITUDE", "LATITUDE"])[
    ["LONGITUDE", "LATITUDE", "BOROUGH"]
].copy()
# Keep df's row labels in an explicit column so joined rows can be mapped back to df
geometry_df["ORIGINAL INDEX"] = geometry_df.index
geometry_df.info()

### Create geometry column

In [None]:
# Project helper from utils.py; the return value is not used, so it presumably adds a
# "GEOMETRY" column to geometry_df in place (the GeoDataFrame cell below reads that
# column) - TODO confirm against utils.create_geometry_column.
create_geometry_column(geometry_df)
# Check the frame's columns and dtypes after the helper ran
geometry_df.info()

### Convert geometry_df into GeoDataFrame

In [None]:
# Wrap the frame as a GeoDataFrame: "GEOMETRY" is the active geometry column and the
# coordinates are declared to be in EPSG:4326.
geometry_gdf = gpd.GeoDataFrame(geometry_df, geometry="GEOMETRY", crs="EPSG:4326")

### Load borough boundaries from the shapefile

In [None]:
# Load the borough boundary polygons from the nybb shapefile.
# Source: https://www.nyc.gov/content/planning/pages/resources/datasets/borough-boundaries
boroughs_gdf = gpd.read_file("../data/nybb.shp")

In [None]:
# Inspect the boundary layer: available columns and its coordinate reference system
print(boroughs_gdf.columns)
print(boroughs_gdf.crs)

In [None]:
# Reproject the polygons to EPSG:4326, the CRS declared for geometry_gdf, so both
# layers share one coordinate system for the spatial join.
boroughs_gdf = boroughs_gdf.to_crs(epsg=4326)

### Spatial join

In [None]:
# Column names of the boundary layer; "BoroName" and "geometry" are used in the join
print(boroughs_gdf.columns)

In [None]:
# Only the borough name and its polygon are needed on the right-hand side.
borough_polygons = boroughs_gdf[["BoroName", "geometry"]]

# Left join: every collision point is kept; points that intersect no polygon get a
# missing BoroName. A point intersecting several polygons yields one row per match.
geometry_gdf_joined = gpd.sjoin(
    geometry_gdf,
    borough_polygons,
    how="left",
    predicate="intersects",
)

In [None]:
# Borough names produced by the join; NaN marks points that matched no polygon
print(geometry_gdf_joined["BoroName"].unique())

In [None]:
# Number of joined rows without a borough match
geometry_gdf_joined["BoroName"].isna().sum()

In [None]:
# Give unmatched points an explicit "Unknown" label instead of NaN
geometry_gdf_joined["BoroName"] = geometry_gdf_joined["BoroName"].fillna("Unknown")

In [None]:
# Distinct borough labels after replacing the missing ones
geometry_gdf_joined["BoroName"].unique()

In [None]:
# Upper-case the joined borough names (cast to str first so the .str accessor applies).
# Presumably this matches the casing used in df["BOROUGH"] - TODO confirm.
geometry_gdf_joined["BoroName"] = geometry_gdf_joined["BoroName"].astype(str).str.upper()

In [None]:
# Preview the first five joined rows
geometry_gdf_joined.head(5)

### Fill NaN with mapped boroughs

In [None]:
# Borough found by the spatial join, keyed by the df row label each point came from.
mapped_boroughs = geometry_gdf_joined.set_index("ORIGINAL INDEX")["BoroName"]

# A left spatial join emits one row per matching polygon, so a point touching two
# boroughs appears twice. Keep the first match per row label: duplicate labels would
# make the index alignment below ambiguous and raise a reindexing error.
mapped_boroughs = mapped_boroughs[~mapped_boroughs.index.duplicated(keep="first")]

# fillna with a Series aligns on the index: only rows whose BOROUGH is missing receive
# the mapped value; recorded boroughs and rows without coordinates are left untouched.
df["BOROUGH"] = df["BOROUGH"].fillna(mapped_boroughs)

In [None]:
# Re-check non-null counts; BOROUGH should have gained values from the spatial join
df.info()

In [None]:
# Distinct BOROUGH values after filling from the spatial join
df['BOROUGH'].unique()

## Label NaN as "UNKNOWN"

In [None]:
cols_to_fill = ["BOROUGH", "LATITUDE", "LONGITUDE", "LOCATION"]

# Replace the remaining missing values column by column with the "UNKNOWN" marker.
# NOTE(review): putting a string into LATITUDE / LONGITUDE turns those numeric columns
# into object dtype, so numeric or spatial operations on them need the marker filtered
# out first - confirm this is intended.
for column_name in cols_to_fill:
    df[column_name] = df[column_name].fillna("UNKNOWN")

In [None]:
# Distinct BOROUGH values after missing entries were labelled "UNKNOWN"
df['BOROUGH'].unique()

In [None]:
# Confirm the filled columns no longer report missing values
df.info()

## Number of persons

In [None]:
# Inspect the distinct injury counts before changing the column dtype
df["NUMBER OF PERSONS INJURED"].unique()

In [None]:
# Store the counts as pandas' nullable integer type: whole numbers stay integers
# and missing entries are kept as <NA>.
injured_column = "NUMBER OF PERSONS INJURED"
injured_counts = df[injured_column]
df[injured_column] = injured_counts.astype("Int64")

In [None]:
# Distinct injury counts after the conversion to nullable Int64
df["NUMBER OF PERSONS INJURED"].unique()

In [None]:
# Inspect the distinct fatality counts before changing the column dtype
df["NUMBER OF PERSONS KILLED"].unique()

In [None]:
# Same treatment as the injured counts: nullable Int64 keeps missing entries as <NA>.
killed_column = "NUMBER OF PERSONS KILLED"
killed_counts = df[killed_column]
df[killed_column] = killed_counts.astype("Int64")

In [None]:
# Verify the dtypes of the two converted person-count columns
df.info()


In [None]:
# Distinct values of the motorist fatality count (inspection only, no conversion here)
df["NUMBER OF MOTORIST KILLED"].unique()

## String formatting

In [None]:
cols_to_format = [
    "CONTRIBUTING FACTOR VEHICLE 1",
    "CONTRIBUTING FACTOR VEHICLE 2",
    "CONTRIBUTING FACTOR VEHICLE 3",
    "CONTRIBUTING FACTOR VEHICLE 4",
    "CONTRIBUTING FACTOR VEHICLE 5",
    "VEHICLE TYPE CODE 1",
    "VEHICLE TYPE CODE 2",
    "VEHICLE TYPE CODE 3",
    "VEHICLE TYPE CODE 4",
    "VEHICLE TYPE CODE 5",
    "ON STREET NAME",
    "CROSS STREET NAME",
    "OFF STREET NAME",
]

# Normalise each text column: cast to str, trim surrounding whitespace, lower-case.
# NOTE: astype(str) turns missing values into the literal string "nan", so these
# columns no longer report NaN afterwards.
for text_column in cols_to_format:
    as_text = df[text_column].astype(str)
    df[text_column] = as_text.str.strip().str.lower()

## Streets

In [None]:
# Project helper from utils.py, run on the ON STREET NAME column.
# NOTE(review): 70 looks like a similarity threshold - confirm its meaning and scale
# against utils.find_similar_phrases.
on_street_typos = find_similar_phrases(df["ON STREET NAME"], 70)

In [None]:
# Show what the similarity scan returned for ON STREET NAME
print(on_street_typos)

In [None]:
streets_to_format = ["ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME"]

# Apply the project helper normalize_street_names to every value of the street columns.
# DataFrame.applymap is deprecated since pandas 2.1 (renamed to DataFrame.map);
# mapping each column with Series.map gives the same element-wise result on old and
# new pandas versions without the deprecation warning.
df[streets_to_format] = df[streets_to_format].apply(
    lambda street_column: street_column.map(normalize_street_names)
)

## Contributing factors

In [None]:
# Run the similarity scan (project helper, default settings) on each of the five
# contributing-factor columns and bind the results to one name per column.
cfv_scan_columns = (
    "CONTRIBUTING FACTOR VEHICLE 1",
    "CONTRIBUTING FACTOR VEHICLE 2",
    "CONTRIBUTING FACTOR VEHICLE 3",
    "CONTRIBUTING FACTOR VEHICLE 4",
    "CONTRIBUTING FACTOR VEHICLE 5",
)
(
    cfv_typos_cols_1,
    cfv_typos_cols_2,
    cfv_typos_cols_3,
    cfv_typos_cols_4,
    cfv_typos_cols_5,
) = (find_similar_phrases(df[factor_column]) for factor_column in cfv_scan_columns)

In [None]:
# Report the scan result of each contributing-factor column, one line per column
print(
    f"CFV 1 typos {cfv_typos_cols_1}",
    f"CFV 2 typos {cfv_typos_cols_2}",
    f"CFV 3 typos {cfv_typos_cols_3}",
    f"CFV 4 typos {cfv_typos_cols_4}",
    f"CFV 5 typos {cfv_typos_cols_5}",
    sep="\n",
)

### Corrections dictionary

In [None]:
# List the distinct factor labels to decide which ones need correcting
df["CONTRIBUTING FACTOR VEHICLE 1"].unique()

In [None]:
# Raw factor label -> label that replaces it. The keys are lower-case strings;
# "nan", "80" and "1" are folded into "unspecified".
corrections_cfv = {
    "reaction to other uninvolved vehicle": "reaction to uninvolved vehicle",
    "illnes": "illness",
    "nan": "unspecified",
    "80": "unspecified",
    "1": "unspecified",
}

In [None]:
cfv_cols_to_correct = [
    "CONTRIBUTING FACTOR VEHICLE 1",
    "CONTRIBUTING FACTOR VEHICLE 2",
    "CONTRIBUTING FACTOR VEHICLE 3",
    "CONTRIBUTING FACTOR VEHICLE 4",
    "CONTRIBUTING FACTOR VEHICLE 5",
]

# Swap every value that exactly matches a key of corrections_cfv for its replacement
corrected_factors = df[cfv_cols_to_correct].replace(to_replace=corrections_cfv)
df[cfv_cols_to_correct] = corrected_factors

In [None]:
# Re-run the similarity scan on the first factor column after the corrections to see
# what it still reports
cfv_typos_cols_1_cleaned = find_similar_phrases(df["CONTRIBUTING FACTOR VEHICLE 1"])
print(cfv_typos_cols_1_cleaned)

## Vehicle types

In [None]:
# Review the column dtypes and non-null counts at the start of the vehicle-type section
df.info()