In [1]:
# Importing libraries
import numpy as np
import pandas as pd

In [2]:
# Load the raw collision CSV; the path is relative to the notebook's working directory
csv_path = 'NYCM_100000.csv'
df = pd.read_csv(csv_path)

In [3]:
# Peek at one randomly chosen row to get a feel for the columns.
# A fixed random_state keeps the displayed row the same across kernel restarts.
df.sample(n=1, random_state=42)

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
81994,2020-04-11T00:00:00.000,14:44,,,40.666588,-73.80809,"(40.666588, -73.80809)",BELT PARKWAY,,,...,Following Too Closely,,,,4308414,Sedan,Sedan,,,


In [4]:
# Inspect the dtype pandas inferred for every column.
# Note any column whose dtype does not match its meaning: in the output below,
# crash_date and crash_time are plain `object` columns rather than datetimes.
df.dtypes

crash_date                        object
crash_time                        object
borough                           object
zip_code                         float64
latitude                         float64
longitude                        float64
location                          object
on_street_name                    object
off_street_name                   object
cross_street_name                 object
number_of_persons_injured          int64
number_of_persons_killed           int64
number_of_pedestrians_injured      int64
number_of_pedestrians_killed       int64
number_of_cyclist_injured          int64
number_of_cyclist_killed           int64
number_of_motorist_injured         int64
number_of_motorist_killed          int64
contributing_factor_vehicle_1     object
contributing_factor_vehicle_2     object
contributing_factor_vehicle_3     object
contributing_factor_vehicle_4     object
contributing_factor_vehicle_5     object
collision_id                       int64
vehicle_type_cod

In [5]:
# Parse the 'crash_date' strings into pandas datetime64 values
df = df.assign(crash_date=pd.to_datetime(df['crash_date']))

In [6]:
# Importing datetime libraries to make use of some of it's power
import datetime

In [7]:
# Parse the 'HH:MM' strings in 'crash_time' into datetime64 values.
# pd.to_datetime is vectorised, so no per-row Python call is needed, and a
# missing entry becomes NaT instead of raising a TypeError.
# The format carries no date, so every value gets the placeholder date
# 1900-01-01; only the time-of-day part is meaningful.
df['crash_time'] = pd.to_datetime(df['crash_time'], format='%H:%M')

In [8]:
# Confirm the conversions took effect: crash_date and crash_time
# should now be reported as datetime64[ns]
df.dtypes

crash_date                       datetime64[ns]
crash_time                       datetime64[ns]
borough                                  object
zip_code                                float64
latitude                                float64
longitude                               float64
location                                 object
on_street_name                           object
off_street_name                          object
cross_street_name                        object
number_of_persons_injured                 int64
number_of_persons_killed                  int64
number_of_pedestrians_injured             int64
number_of_pedestrians_killed              int64
number_of_cyclist_injured                 int64
number_of_cyclist_killed                  int64
number_of_motorist_injured                int64
number_of_motorist_killed                 int64
contributing_factor_vehicle_1            object
contributing_factor_vehicle_2            object
contributing_factor_vehicle_3           

In [9]:
# Next step: missing values.
# Count the missing entries in each column to see how much there is to clean.
df.isna().sum()

crash_date                           0
crash_time                           0
borough                          35026
zip_code                         35034
latitude                          8035
longitude                         8035
location                          8035
on_street_name                   26009
off_street_name                  52875
cross_street_name                74033
number_of_persons_injured            0
number_of_persons_killed             0
number_of_pedestrians_injured        0
number_of_pedestrians_killed         0
number_of_cyclist_injured            0
number_of_cyclist_killed             0
number_of_motorist_injured           0
number_of_motorist_killed            0
contributing_factor_vehicle_1      371
contributing_factor_vehicle_2    19243
contributing_factor_vehicle_3    91239
contributing_factor_vehicle_4    97760
contributing_factor_vehicle_5    99333
collision_id                         0
vehicle_type_code1                 740
vehicle_type_code2       

In [20]:
# Collect the name of every column that contains at least one missing value.
# `any()` also catches a column with a single missing entry, so no column
# with gaps is left out of the cleaning steps that use this list.
null_list = df.columns[df.isna().any()].tolist()

In [21]:
# Three of the columns with missing values are numeric (float64) and need a
# different fill value, so set them aside and keep only the remaining
# (object dtype) columns for the string-based cleaning
b = ['zip_code', 'latitude', 'longitude']
object_null_list = [col for col in null_list if col not in b]

In [22]:
# Show the columns with missing values that are left after excluding
# the numeric ones (zip_code, latitude, longitude)
object_null_list

['borough',
 'location',
 'on_street_name',
 'off_street_name',
 'cross_street_name',
 'contributing_factor_vehicle_1',
 'contributing_factor_vehicle_2',
 'contributing_factor_vehicle_3',
 'contributing_factor_vehicle_4',
 'contributing_factor_vehicle_5',
 'vehicle_type_code1',
 'vehicle_type_code2',
 'vehicle_type_code_3',
 'vehicle_type_code_4',
 'vehicle_type_code_5']

In [25]:
# Replace every missing value in the selected object-dtype columns
# with the placeholder string 'Unspecified'
df[object_null_list] = df[object_null_list].fillna('Unspecified')

In [26]:
# Check the result: the object columns should now report 0 missing values.
# Only the numeric columns (zip_code, latitude, longitude) remain to be handled.
df.isna().sum()

crash_date                           0
crash_time                           0
borough                              0
zip_code                         35034
latitude                          8035
longitude                         8035
location                             0
on_street_name                       0
off_street_name                      0
cross_street_name                    0
number_of_persons_injured            0
number_of_persons_killed             0
number_of_pedestrians_injured        0
number_of_pedestrians_killed         0
number_of_cyclist_injured            0
number_of_cyclist_killed             0
number_of_motorist_injured           0
number_of_motorist_killed            0
contributing_factor_vehicle_1        0
contributing_factor_vehicle_2        0
contributing_factor_vehicle_3        0
contributing_factor_vehicle_4        0
contributing_factor_vehicle_5        0
collision_id                         0
vehicle_type_code1                   0
vehicle_type_code2       

In [28]:
# Fill the missing values in the numeric columns with 0 (the columns stay float64).
# NOTE(review): 0 then acts as a "missing" marker for zip_code/latitude/longitude —
# confirm that downstream code treats it as such rather than as a real value.
df[b] = df[b].fillna(0)

In [29]:
# Check the result: every column should now report 0 missing values
df.isna().sum()

crash_date                       0
crash_time                       0
borough                          0
zip_code                         0
latitude                         0
longitude                        0
location                         0
on_street_name                   0
off_street_name                  0
cross_street_name                0
number_of_persons_injured        0
number_of_persons_killed         0
number_of_pedestrians_injured    0
number_of_pedestrians_killed     0
number_of_cyclist_injured        0
number_of_cyclist_killed         0
number_of_motorist_injured       0
number_of_motorist_killed        0
contributing_factor_vehicle_1    0
contributing_factor_vehicle_2    0
contributing_factor_vehicle_3    0
contributing_factor_vehicle_4    0
contributing_factor_vehicle_5    0
collision_id                     0
vehicle_type_code1               0
vehicle_type_code2               0
vehicle_type_code_3              0
vehicle_type_code_4              0
vehicle_type_code_5 

In [30]:
# Take another look at one randomly chosen row to see the effect of the changes.
# A fixed random_state keeps the displayed row the same across kernel restarts.
df.sample(n=1, random_state=42)

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
57282,2020-08-15,1900-01-01 02:39:00,Unspecified,0.0,40.60715,-73.89837,"(40.60715, -73.89837)",BELT PARKWAY,Unspecified,Unspecified,...,Unspecified,Unspecified,Unspecified,Unspecified,4338950,Sedan,Sedan,Unspecified,Unspecified,Unspecified


In [31]:
# Collect the names of all object-dtype (string) columns so that leading and
# trailing whitespace can be removed from their values.
# select_dtypes avoids the `np.object` alias, which was removed in NumPy 1.24
# and raises AttributeError there.
object_types = df.select_dtypes(include='object').columns.tolist()

In [32]:
# Strip leading and trailing whitespace from every string column.
# The stripped Series has to be assigned back to the column: computing it
# without assignment leaves the DataFrame unchanged.
for i in object_types:
    df[i] = df[i].str.strip()

In [33]:
# Use 'collision_id' as the record key: any row that repeats an earlier
# collision_id is a duplicate, so collect those rows for inspection
is_repeat = df.duplicated(subset=['collision_id'])
dup = df[is_repeat]

In [34]:
# Display the duplicate rows; the output below is empty,
# so the DataFrame contains no duplicate entries
dup

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5


In [None]:
# Write the cleaned DataFrame to a new CSV file, leaving out the index column
output_path = 'Cleaned.csv'
df.to_csv(output_path, index=False)