# Clean_Data_BAD

## Imported libraries

In [230]:
import pandas as pd #To work with dataframes
import numpy as np #To work with np.arrays

## Reading the data

In [231]:
# Load the raw CSV export into a DataFrame.
raw_csv_path = './datasets/data_100000.csv'
df = pd.read_csv(raw_csv_path)

## Preprocessing the data

#### Dropping columns with excessive (or duplicate) information

In [233]:
"""
Columns dropped in this cell, and why:
- contributing_factor_vehicle_1 to contributing_factor_vehicle_5: not necessary to find the most dangerous streets
- vehicle_type_code1, vehicle_type_code2: not necessary to find the most dangerous streets
- vehicle_type_code_3 to vehicle_type_code_5: not necessary to find the most dangerous streets
- location: redundant, because latitude and longitude are already available
- zip_code: assuming that street names are not used more than once across zip codes
- borough: assuming that street names are not used more than once across boroughs
- off_street_name: only focusing on on_street_name
- cross_street_name: only focusing on on_street_name

"""

contributing_factor_columns = [
    'contributing_factor_vehicle_1',
    'contributing_factor_vehicle_2',
    'contributing_factor_vehicle_3',
    'contributing_factor_vehicle_4',
    'contributing_factor_vehicle_5',
]

# Naming quirk in the data: codes 1-2 have no underscore before the digit,
# codes 3-5 do.
vehicle_type_columns = [
    "vehicle_type_code1",
    "vehicle_type_code2",
    "vehicle_type_code_3",
    "vehicle_type_code_4",
    "vehicle_type_code_5",
]

place_columns = [
    'location',
    'zip_code',
    'borough',
    'off_street_name',
    'cross_street_name',
]

unused_columns = contributing_factor_columns + vehicle_type_columns + place_columns
df = df.drop(columns=unused_columns)

#### Replacing empty values ("" or null) with NaN

In [234]:
# Cells that are empty or contain only whitespace count as missing.
blank_cell_pattern = r'^\s*$'
df = df.replace(to_replace=blank_cell_pattern, value=np.nan, regex=True)

# So do cells holding one of these literal spellings of "null".
null_spellings = ["null", "NULL", "Null"]
df = df.replace(to_replace=null_spellings, value=np.nan)

#### Removing all rows that contain NaN values within column on_street_name

In [235]:
# Keep only the rows that have a value in on_street_name.
has_on_street_name = df['on_street_name'].notna()
df = df.loc[has_on_street_name]

#### Changing the dataframe to dtype: "string"

In [236]:
# Convert every cell to its str() form. Note that str() of a missing value
# (np.nan) is the text 'nan', so missing cells become ordinary strings here.
#
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; use the new name when the installed pandas provides it and
# fall back to applymap otherwise. The check is made on the class so that a
# column that happens to be called "map" cannot be mistaken for the method.
_apply_to_each_cell = df.map if hasattr(pd.DataFrame, "map") else df.applymap
df = _apply_to_each_cell(str)

#### Removing all the excessive white spaces

In [237]:
# Strip leading/trailing whitespace from every cell and from every column
# name. str.strip only accepts str values, so every cell must already be a
# string at this point.
#
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; use the new name when the installed pandas provides it and
# fall back to applymap otherwise. The check is made on the class so that a
# column that happens to be called "map" cannot be mistaken for the method.
_apply_to_each_cell = df.map if hasattr(pd.DataFrame, "map") else df.applymap
df = _apply_to_each_cell(str.strip).rename(columns=str.strip)

#### Changing the dtypes of each column to the correct form and consolidating them if necessary

In [239]:
# Parse crash_date into timestamps. The unit is spelled out because a bare
# 'datetime64' dtype (no unit) is rejected by pandas 2.x; older versions
# treated it as nanosecond resolution, so the result is unchanged there.
df['crash_date'] = df['crash_date'].astype('datetime64[ns]')

# Store street names with the pandas 'string' dtype, upper-cased so that
# differently-cased spellings of the same street become identical.
df['on_street_name'] = df['on_street_name'].astype('string').str.upper()

# The casualty counters are cast to int8. Bracket assignment is used so each
# statement unambiguously replaces an existing column.
casualty_count_columns = [
    'number_of_persons_killed',
    'number_of_persons_injured',
    'number_of_pedestrians_killed',
    'number_of_pedestrians_injured',
    'number_of_cyclist_killed',
    'number_of_cyclist_injured',
    'number_of_motorist_killed',
    'number_of_motorist_injured',
]
for column_name in casualty_count_columns:
    df[column_name] = df[column_name].astype('int8')

#### Setting collision_id as index

In [240]:
# Use the collision_id column as the row index of the frame.
index_column = "collision_id"
df = df.set_index(keys=index_column)

#### Replacing all "NAN" string values with np.nan (NaN) values

In [241]:
# Turn textual NaN markers back into real missing values.
# When the frame was converted with str(), missing cells became the
# lower-case text 'nan'; only on_street_name is upper-cased afterwards, so
# matching "NAN" alone would leave 'nan' in every other text column.
# Both spellings are therefore replaced.
nan_markers = ["NAN", "nan"]
df = df.replace(nan_markers, np.nan)

## Saving the preprocessed data as .csv file

In [242]:
# Write the preprocessed frame to a CSV file in the working directory.
output_csv_path = "Clean_data_BAD.csv"
df.to_csv(output_csv_path)