# Prepare

**Remember the goal of the project**
- Find the drivers of collisions and how to reduce deaths

**import**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import sys
sys.path.append("./util_")
import acquire_
import prepare_

### get data

In [2]:
# Load the collision records through the project's acquire_ helper module
# (imported from ./util_) and preview the first five rows.
# NOTE(review): where get_data() reads from (local cache vs. download) is not
# visible in this notebook — confirm in util_/acquire_.py.
vehicle = acquire_.get_data()
vehicle.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
3,09/11/2021,9:35,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",,,1211 LORING AVENUE,...,,,,,4456314,Sedan,,,,
4,12/14/2021,8:13,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,...,,,,,4486609,,,,,


## Rename column names

In [3]:
# Normalise the column names to snake_case:
#   1. lower-case every name
#   2. replace every space with an underscore
snake_case_names = {
    original: original.lower().replace(" ", "_")
    for original in vehicle.columns
}
vehicle = vehicle.rename(columns=snake_case_names)

vehicle.head(1)

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,cross_street_name,off_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,


## Fix column data types

**Object to datetime**

In [4]:
# Convert the crash date/time strings to datetime.
# Explicit formats are given so that:
#   - dates are always read as month/day/year (e.g. "09/11/2021") instead of
#     relying on per-value format inference, and
#   - times (e.g. "2:39") are parsed reproducibly. Without a format pandas
#     attaches the date of the day the notebook is run; with "%H:%M" the date
#     part is always the fixed placeholder 1900-01-01, so only the time of day
#     (e.g. `.dt.hour`) is meaningful in this column.
vehicle["crash_date"] = pd.to_datetime(vehicle["crash_date"], format="%m/%d/%Y")
vehicle["crash_time"] = pd.to_datetime(vehicle["crash_time"], format="%H:%M")

**Float to integer**

Remove the rows with `NaN` in these two columns because they are an insignificant share of the data:
- `number_of_persons_injured`: 18 rows
- `number_of_persons_killed`: 31 rows

In [5]:
# Drop every row where either casualty count is missing
# (number_of_persons_injured or number_of_persons_killed).
vehicle = vehicle.dropna(
    subset=["number_of_persons_injured", "number_of_persons_killed"]
)

In [6]:
# The casualty counts no longer contain NaN, so they can be stored as
# integers instead of floats.
vehicle = vehicle.astype(
    {
        "number_of_persons_injured": "int",
        "number_of_persons_killed": "int",
    }
)

**Object to integer**

In [7]:
# Convert zip_code to an integer column, using 0 to signify an unknown zip code.
#
# Missing values are filled first, then every value is turned into text and
# stripped so that blank / whitespace-only entries collapse to "".
# (Replacing each space with "0" would corrupt a value with stray whitespace,
# e.g. "11208 " -> "112080".)
zip_text = vehicle["zip_code"].fillna("0").astype(str).str.strip()

# A blank entry carries no zip code: encode it as the unknown marker 0.
zip_text = zip_text.replace("", "0")

# to_numeric still raises on genuinely non-numeric text, so bad data is not
# silently hidden; it also accepts values written like "11208.0".
vehicle["zip_code"] = pd.to_numeric(zip_text).astype("int")

**Fill None**

In [8]:
# Every missing value in a text (object) column becomes the literal "UNKNOWN".
object_cols = vehicle.select_dtypes("object").columns
vehicle[object_cols] = vehicle[object_cols].fillna("UNKNOWN")

## Remove columns

Remove `location` because it is not tidy and the same coordinates are already available in the `latitude` and `longitude` columns. `collision_id` is dropped as well.

In [9]:
# Drop redundant columns: `location` repeats the latitude/longitude pair as a
# single string, and `collision_id` is removed together with it.
# NOTE(review): drop_cols is a project helper from util_/prepare_.py; it
# presumably prints the dataframe size before and after — confirm there.
remove_cols = ["location", "collision_id"]
vehicle = prepare_.drop_cols(vehicle, remove_cols)

Original dataframe size: (2004974, 29)
New dataframe size: (2004974, 27)


## Encode categorical variable

In [10]:
# Preview the text (object) columns that are candidates for encoding.
vehicle.select_dtypes(include="object").head(3)

Unnamed: 0,borough,on_street_name,cross_street_name,off_street_name,contributing_factor_vehicle_1,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,UNKNOWN,WHITESTONE EXPRESSWAY,20 AVENUE,UNKNOWN,Aggressive Driving/Road Rage,Unspecified,UNKNOWN,UNKNOWN,UNKNOWN,Sedan,Sedan,UNKNOWN,UNKNOWN,UNKNOWN
1,UNKNOWN,QUEENSBORO BRIDGE UPPER,UNKNOWN,UNKNOWN,Pavement Slippery,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,Sedan,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
2,UNKNOWN,THROGS NECK BRIDGE,UNKNOWN,UNKNOWN,Following Too Closely,Unspecified,UNKNOWN,UNKNOWN,UNKNOWN,Sedan,Pick-up Truck,UNKNOWN,UNKNOWN,UNKNOWN


In [11]:
# NOTE(review): every line of this cell is commented out, so running it has no
# effect. The disabled code would collect into `categorical` the object-dtype
# columns that have at most 10 unique values. The `col != "churn"` check looks
# like a leftover from another project — no "churn" column is visible in the
# previews of this dataset; confirm and remove it before re-enabling.
# TODO: either re-enable this selection or delete the cell.

# # get all columns from dataframe
# all_columns = vehicle.columns

# # containers of different variable types
# categorical = []

# # separate variables
# for col in all_columns:
#     # count number of unique valus in the column
#     len_of_uniq = len(vehicle[col].unique())
    
#     # also checking for only object data types
#     if (col != "churn") and (len_of_uniq <= 10) and (vehicle[col].dtype == "O"):
#         categorical.append(col)
#     else: pass
        
# categorical

## Group alike factors

In [12]:
# Number of distinct values in each text (object) column.
vehicle.select_dtypes(include="object").nunique()

borough                               6
on_street_name                    17902
cross_street_name                 19991
off_street_name                  212710
contributing_factor_vehicle_1        62
contributing_factor_vehicle_2        62
contributing_factor_vehicle_3        52
contributing_factor_vehicle_4        42
contributing_factor_vehicle_5        31
vehicle_type_code_1                1539
vehicle_type_code_2                1702
vehicle_type_code_3                 243
vehicle_type_code_4                  98
vehicle_type_code_5                  66
dtype: int64

In [39]:
# Group similar contributing factors together.
#
# `factors_1` maps each group label to the raw factor strings it absorbs.
# Spelling/case variants found in the data ("Illnes", "Cell Phone (hand-Held)",
# "Drugs (Illegal)") and stray codes ("80", "1", "") are listed explicitly
# because matching is exact.
#
# NOTE(review): a few assignments look semantically questionable and are kept
# unchanged only so that the resulting labels stay the same — please confirm:
#   - "Following Too Closely" / "Traffic Control Disregarded" are filed under
#     "Vehicle Equipment Failure",
#   - "Unsafe Speed" is filed under "Driver Fatigue and Inattention",
#   - "Outside Car Distraction" is filed under
#     "Distractions from Electronic Devices",
#   - "Animals Action" is filed under
#     "Interactions with Pedestrians and Cyclists".
factors_1 = {
    "Visibility and Road Conditions": [
        "Windshield Inadequate",
        "Headlights Defective",
        "Other Lighting Defects",
        "Glare",
        "View Obstructed/Limited",
        "Pavement Slippery",
        "Obstruction/Debris",
        "Pavement Defective",
    ],
    "Distractions from Electronic Devices": [
        "Cell Phone (hand-Held)",
        "Cell Phone (hand-held)",
        "Cell Phone (hands-free)",
        "Other Electronic Device",
        "Outside Car Distraction",
    ],
    "Impairment (Alcohol, Drugs, Medication)": [
        "Alcohol Involvement",
        "Drugs (illegal)",
        "Drugs (Illegal)",
        "Prescription Medication",
    ],
    "Driver Fatigue and Inattention": [
        "Fell Asleep",
        "Lost Consciousness",
        "Fatigued/Drowsy",
        "Illnes",
        "Illness",
        "Unsafe Speed",
        "Driver Inattention/Distraction",
        "80",
    ],
    "Unsafe Driving Maneuvers": [
        "Unsafe Lane Changing",
        "Passing Too Closely",
        "Turning Improperly",
        "Passing or Lane Usage Improper",
        "Failure to Yield Right-of-Way",
        "Failure to Keep Right",
    ],
    "Vehicle Equipment Failure": [
        "Following Too Closely",
        "Traffic Control Disregarded",
        "Accelerator Defective",
        "Brakes Defective",
        "Steering Failure",
        "Tire Failure/Inadequate",
    ],
    "Issues with Traffic Control and Lane Marking": [
        "Traffic Control Device Improper/Non-Working",
        "Lane Marking Improper/Inadequate",
    ],
    "Driver Characteristics and Experience": [
        "Physical Disability",
        "Driver Inexperience",
    ],
    "Reactions to Other Vehicles": [
        "Reaction to Other Uninvolved Vehicle",
        "Reaction to Uninvolved Vehicle",
    ],
    "Distracted Driving": [
        "Listening/Using Headphones",
        "Texting",
        "Eating or Drinking",
        "Distracted Driving",
    ],
    "Vehicle-related Incidents": [
        "Vehicle Vandalism",
        "Tow Hitch Defective",
        "Driverless/Runaway Vehicle",
        "Oversized Vehicle",
        "Other Vehicular",
    ],
    "Interactions with Pedestrians and Cyclists": [
        "Animals Action",
        "Pedestrian/Bicyclist/Other Pedestrian Error/Confusion",
    ],
    "Aggressive Driving and Passenger Distraction": [
        "Aggressive Driving/Road Rage",
        "Passenger Distraction",
    ],
    # "Unsafe Lane Changing", "Passing Too Closely", "Turning Improperly" and
    # "Passing or Lane Usage Improper" used to be repeated here, but they are
    # already claimed by "Unsafe Driving Maneuvers" (which comes first), so
    # this group only ever receives "Backing Unsafely".
    "Unsafe Lane Changes and Backing": [
        "Backing Unsafely",
    ],
    "Other": [
        "Using On Board Navigation Device",
        "Tinted Windows",
        "Shoulders Defective/Improper",
    ],
    "Uncertain or Unspecified Factors": [
        "UNKNOWN",
        "Unspecified",
        "1",
        "",
    ],
}

# Invert the grouping into a flat {raw factor -> group label} lookup.
# setdefault keeps the FIRST group that lists a factor, so a factor that is
# accidentally listed twice is resolved by dictionary order.
factor_lookup = {}
for group_label, raw_factors in factors_1.items():
    for raw_factor in raw_factors:
        factor_lookup.setdefault(raw_factor, group_label)

# Re-label all five contributing-factor columns in one vectorised pass each.
# `map` yields NaN for values that are not in the lookup; `fillna` restores the
# original value there, so unlisted factors are left untouched.
factor_cols = [f"contributing_factor_vehicle_{n}" for n in range(1, 6)]
for factor_col in factor_cols:
    vehicle[factor_col] = (
        vehicle[factor_col].map(factor_lookup).fillna(vehicle[factor_col])
    )
        
        

In [42]:
# Check the grouped labels: frequency of each group in the fifth factor column.
vehicle["contributing_factor_vehicle_5"].value_counts()

Uncertain or Unspecified Factors                2004485
Vehicle-related Incidents                           164
Driver Fatigue and Inattention                      117
Vehicle Equipment Failure                            96
Visibility and Road Conditions                       57
Impairment (Alcohol, Drugs, Medication)              12
Reactions to Other Vehicles                          11
Unsafe Driving Maneuvers                             10
Driver Characteristics and Experience                10
Distractions from Electronic Devices                  9
Aggressive Driving and Passenger Distraction          1
Unsafe Lane Changes and Backing                       1
Issues with Traffic Control and Lane Marking          1
Name: contributing_factor_vehicle_5, dtype: int64

## Split data