# Prepare

**Remeber Goal of project**
- Find drivers of collisions and how to reduce death

**import**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("./util_")
import acquire_
import prepare_

### get data

In [2]:
# get data from acquire file
vehicle = acquire_.get_data()
vehicle.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
3,09/11/2021,9:35,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",,,1211 LORING AVENUE,...,,,,,4456314,Sedan,,,,
4,12/14/2021,8:13,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,...,,,,,4486609,,,,,


## Rename column names

In [3]:
# 1. make every columns lower case
# 2. replace all the spaces with inderscore
vehicle = vehicle.rename(columns=lambda x: x.lower().replace(" ", "_"))

vehicle.head(1)

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,cross_street_name,off_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,


## Fix column data types

**Object to datetime**

In [4]:
# convert string to datetime
vehicle["crash_date"] = pd.to_datetime(vehicle['crash_date'])
vehicle["crash_time"] = pd.to_datetime(vehicle['crash_time'])

**Float to intiger**

Remove `NaN` in these two columns because the very inssignificant
- `number_of_persons_injured`: 18 rows count
- `number_of_persons_killed`: 31 row count

In [5]:
# removeing all the null columns for number_of_persons_injured and number_of_persons_killed
vehicle = vehicle[vehicle.number_of_persons_injured.notna()]
vehicle = vehicle[vehicle.number_of_persons_killed.notna()]

In [6]:
# change column data type from float to int
vehicle.number_of_persons_injured = vehicle.number_of_persons_injured.astype("int")
vehicle.number_of_persons_killed = vehicle.number_of_persons_killed.astype("int")

**Object to intiger**

In [7]:
# replace NaN with 00000 to signify unknown zipcode
vehicle.zip_code = vehicle.zip_code.fillna("0")

# replace empty strings with 0 to signify unknown zipcode
vehicle.zip_code = vehicle.zip_code.str.replace(" ", "0")

# make zipcode column numeric
vehicle.zip_code = vehicle.zip_code.astype("int")

**Fill None**

In [8]:
# replace all None in object columns with Unknown
vehicle[vehicle.select_dtypes("object").columns] = vehicle.select_dtypes("object").fillna("UNKNOWN")

## Group alike objects

In [9]:
# unique counts in object columns
vehicle.select_dtypes("object").nunique()

borough                               6
location                         271797
on_street_name                    17902
cross_street_name                 19991
off_street_name                  212710
contributing_factor_vehicle_1        62
contributing_factor_vehicle_2        62
contributing_factor_vehicle_3        52
contributing_factor_vehicle_4        42
contributing_factor_vehicle_5        31
vehicle_type_code_1                1539
vehicle_type_code_2                1702
vehicle_type_code_3                 243
vehicle_type_code_4                  98
vehicle_type_code_5                  66
dtype: int64

**Contributing factor**

In [10]:
# Group similar factors together   
contributing_factor = {
    "Visibility and Road Conditions":["Windshield Inadequate",
                                      "Headlights Defective",
                                      "Other Lighting Defects",
                                      "Glare",
                                      "View Obstructed/Limited",
                                      "Pavement Slippery",
                                      "Obstruction/Debris",
                                      "Pavement Defective"],
    "Distractions from Electronic Devices":["Cell Phone (hand-Held)", "Cell Phone (hand-held)",
                                            "Cell Phone (hands-free)",
                                            "Other Electronic Device",
                                            "Outside Car Distraction"],
    "Impairment (Alcohol, Drugs, Medication)":["Alcohol Involvement",
                                               "Drugs (illegal)","Drugs (Illegal)",
                                               "Prescription Medication"],
    "Driver Fatigue and Inattention":["Fell Asleep",
                                      "Lost Consciousness",
                                      "Fatigued/Drowsy",
                                      "Illnes", "Illness",
                                      "Unsafe Speed",
                                      "Driver Inattention/Distraction",
                                     "80"],
    "Unsafe Driving Maneuvers":["Unsafe Lane Changing",
                                "Passing Too Closely",
                                "Turning Improperly",
                                "Passing or Lane Usage Improper",
                                "Failure to Yield Right-of-Way",
                                "Failure to Keep Right"],
    "Vehicle Equipment Failure":["Following Too Closely",
                                 "Traffic Control Disregarded",
                                 "Accelerator Defective",
                                 "Brakes Defective",
                                 "Steering Failure",
                                 "Tire Failure/Inadequate"],
    "Issues with Traffic Control and Lane Marking":["Traffic Control Device Improper/Non-Working",
                                                    "Lane Marking Improper/Inadequate"],
    "Driver Characteristics and Experience":["Physical Disability",
                                             "Driver Inexperience"],
    "Reactions to Other Vehicles":["Reaction to Other Uninvolved Vehicle",
                                   "Reaction to Uninvolved Vehicle"],
    "Distracted Driving":["Listening/Using Headphones",
                          "Texting",
                          "Eating or Drinking",
                         "Distracted Driving"],
    "Vehicle-related Incidents":["Vehicle Vandalism",
                                 "Tow Hitch Defective",
                                 "Driverless/Runaway Vehicle",
                                 "Oversized Vehicle",
                                 "Other Vehicular"],
    "Interactions with Pedestrians and Cyclists":["Animals Action",
                                                  "Pedestrian/Bicyclist/Other Pedestrian Error/Confusion"],
    "Aggressive Driving and Passenger Distraction":["Aggressive Driving/Road Rage",
                                                    "Passenger Distraction"],
    "Unsafe Lane Changes and Backing":["Unsafe Lane Changing",
                                       "Passing Too Closely",
                                       "Turning Improperly",
                                       "Passing or Lane Usage Improper",
                                       "Backing Unsafely"],
    "Other":["Using On Board Navigation Device",
            "Tinted Windows",
            "Shoulders Defective/Improper"],
    "Uncertain or Unspecified Factors":["UNKNOWN",
                                        "Unspecified",
                                       "1",
                                       ""]
}

# Re-assigning new names to the feature items.
for k, v in contributing_factor.items(): # iterate trough the keys and values of the dictionary
    for ele in v: # iterate throug only the values 
        # Replace the entire cell with 'replacement_value' if a match is found
        vehicle["contributing_factor_vehicle_1"] = vehicle["contributing_factor_vehicle_1"].apply(lambda x: k if x == ele else x)
        vehicle["contributing_factor_vehicle_2"] = vehicle["contributing_factor_vehicle_2"].apply(lambda x: k if x == ele else x)
        vehicle["contributing_factor_vehicle_3"] = vehicle["contributing_factor_vehicle_3"].apply(lambda x: k if x == ele else x)
        vehicle["contributing_factor_vehicle_4"] = vehicle["contributing_factor_vehicle_4"].apply(lambda x: k if x == ele else x)
        vehicle["contributing_factor_vehicle_5"] = vehicle["contributing_factor_vehicle_5"].apply(lambda x: k if x == ele else x)
        
        

**Vehicle type**

In [11]:
# Separete large vehicle control code from smaller counts
veh_contr_code = vehicle.vehicle_type_code_1.value_counts()[vehicle.vehicle_type_code_1.value_counts() > 20]

print("Count of categories greater than 20:",len(veh_contr_code))
veh_contr_code

Count of categories greater than 20: 130


Sedan                                  544624
Station Wagon/Sport Utility Vehicle    429111
PASSENGER VEHICLE                      416206
SPORT UTILITY / STATION WAGON          180291
Taxi                                    49660
                                        ...  
pick                                       22
unk                                        22
COMMERCIAL                                 21
FDNY FIRE                                  21
FDNY TRUCK                                 21
Name: vehicle_type_code_1, Length: 130, dtype: int64

In [12]:
# Group the categories
vehicle_categories = {
    'SEDAN': ['Sedan', '4 dr sedan', '2 dr sedan', '3-Door'],
    'STATION_WAGON': ['Station Wagon/Sport Utility Vehicle', 'SPORT UTILITY / STATION WAGON'],
    'PASSENGER_VEHICLE': ['PASSENGER VEHICLE'],
    'TAXI': ['Taxi', 'TAXI'],
    'PICKUP_TRUCK': ['Pick-up Truck', 'PICK-UP TRUCK', 'PK', 'Pickup with mounted Camper', 'pick'],
    'UNKNOWN': ['UNKNOWN', 'UNKNO', 'UNK','unknown', 'unko', 'unk'],
    'VAN': ['VAN', 'van', 'Van', 'Van Camper'],
    'BOX_TRUCK': ['Box Truck', 'BOX T', 'BOX TRUCK'],
    'BUS': ['Bus', 'BUS', 'School Bus'],
    'LARGE_COM_VEH': ['LARGE COM VEH(6 OR MORE TIRES)'],
    'SMALL_COM_VEH': ['SMALL COM VEH(4 TIRES)', 'COMME'],
    'LIVERY_VEHICLE': ['LIVERY VEHICLE'],
    'TRACTOR_TRUCK_DIESEL': ['Tractor Truck Diesel', 'TRACT'],
    'MOTORCYCLE': ['Bike', 'MOTORCYCLE', 'Motorscooter', 'Moped', 'Minibike', 'Minicycle'],
    'AMBULANCE': ['Ambulance', 'AMBULANCE', 'AMBUL', 'ambul', 'AMB', 'FDNY AMBUL', 'fdny'],
    'CONVERTIBLE': ['Convertible'],
    'DUMP': ['Dump', 'dump'],
    'E_BIKE': ['E-Bike', 'E-Bik', 'ELECT'],
    'FLAT_BED': ['Flat Bed', 'FLAT'],
    'GARBAGE_OR_REFUSE': ['Garbage or Refuse'],
    'CARRY_ALL': ['Carry All'],
    'E_SCOOTER': ['E-Scooter', 'E-Sco'],
    'TRACTOR_TRUCK_GASOLINE': ['Tractor Truck Gasoline', 'FORD'],
    'TOW_TRUCK': ['Tow Truck / Wrecker', 'Tow Truck', 'TOW T'],
    'FIRE_TRUCK': ['FIRE TRUCK', 'FIRET', 'Fire Truck', 'fire', 'FDNY FIRE', 'FDNY TRUCK', 'FDNY'],
    'CHASSIS_CAB': ['Chassis Cab'],
    'TANKER': ['Tanker', 'TANK'],
    'REFRIGERATED_VAN': ['Refrigerated Van'],
    'CONCRETE_MIXER': ['Concrete Mixer'],
    'FLAT_RACK': ['Flat Rack'],
    'ARMORED_TRUCK': ['Armored Truck'],
    'BEVERAGE_TRUCK': ['Beverage Truck'],
    'SCOOTER': ['SCOOTER', 'SCOOT'],
    'LIMO': ['LIMO'],
    'LIFT_BOOM': ['Lift Boom'],
    'TRUCK': ['TRUCK', 'truck'],
    'TRAILER': ['TRAIL', 'trail', 'TRAILER'],
    'STAKE_OR_RACK': ['Stake or Rack'],
    'LUNCH_WAGON': ['Lunch Wagon'],
    'FORKLIFT': ['FORKL'],
    'MOTORIZED_HOME': ['Motorized Home'],
    'PEDICAB': ['Pedicab'],
    'HOPPER': ['Hopper'],
    'MULTI_WHEELED_VEHICLE': ['Multi-Wheeled Vehicle'],
    'USPS': ['USPS'],
    'DELIVERY': ['DELIV', 'DELV'],
    'UTILITY': ['UTILI', 'UTIL'],
    'OPEN_BODY': ['Open Body'],
    'BULK_AGRICULTURE': ['Bulk Agriculture']
}
    

# Replacing values in the "vehicle_type_code_1" column based on the categories
def replace_category(value):
    for category, codes in vehicle_categories.items():
        if value in codes:
            return category
    return 'OTHER'

# apply the created function to the data
vehicle["vehicle_type_code_1"] = vehicle["vehicle_type_code_1"].apply(replace_category)
vehicle["vehicle_type_code_2"] = vehicle["vehicle_type_code_2"].apply(replace_category)
vehicle["vehicle_type_code_3"] = vehicle["vehicle_type_code_3"].apply(replace_category)
vehicle["vehicle_type_code_4"] = vehicle["vehicle_type_code_4"].apply(replace_category)
vehicle["vehicle_type_code_5"] = vehicle["vehicle_type_code_5"].apply(replace_category)


In [13]:
# unique counts in object columns
vehicle.select_dtypes("object").nunique()

borough                               6
location                         271797
on_street_name                    17902
cross_street_name                 19991
off_street_name                  212710
contributing_factor_vehicle_1        16
contributing_factor_vehicle_2        16
contributing_factor_vehicle_3        16
contributing_factor_vehicle_4        14
contributing_factor_vehicle_5        13
vehicle_type_code_1                  50
vehicle_type_code_2                  50
vehicle_type_code_3                  48
vehicle_type_code_4                  38
vehicle_type_code_5                  32
dtype: int64

## Create my target variable

In [14]:
# if any fatalities are found add 1 in the list
fatalities = []
for row in range(len(vehicle)):
    # locate each row and check if any number of fatalities accured
    if vehicle.number_of_persons_killed.iloc[row] != 0:
        fatalities.append(1)
    elif vehicle.number_of_pedestrians_killed.iloc[row] != 0:
        fatalities.append(1)
    elif vehicle.number_of_cyclist_killed.iloc[row] != 0:
        fatalities.append(1)
    elif vehicle.number_of_motorist_killed.iloc[row] != 0:
        fatalities.append(1)
    else:
        fatalities.append(0)

In [15]:
# add target variable to the data
vehicle["fatality"] = fatalities

In [16]:
vehicle.head(1)

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,cross_street_name,off_street_name,...,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5,fatality
0,2021-09-11,2023-07-05 02:39:00,UNKNOWN,0,,,UNKNOWN,WHITESTONE EXPRESSWAY,20 AVENUE,UNKNOWN,...,Uncertain or Unspecified Factors,Uncertain or Unspecified Factors,Uncertain or Unspecified Factors,4455765,SEDAN,SEDAN,UNKNOWN,UNKNOWN,UNKNOWN,0


## Remove columns

- remove `location` because it's not tidy and there is also longitude and latitude describing this column.
- remove `collision_id` because it won't help identify my target
- remove `number_of_persons, pedestrians, cyclist, and motorist killed` beacuse they have been moved into the `fatality` variable.

In [17]:
# drop redundent columns
remove_cols = ["location", "collision_id", 
               "number_of_persons_killed",
              "number_of_pedestrians_killed",
              "number_of_cyclist_killed",
              "number_of_motorist_killed"]
vehicle = prepare_.drop_cols(vehicle, remove_cols)

Original dataframe size: (2004974, 30)
New dataframe size: (2004974, 24)


## Encode categorical variable

In [18]:
# get all columns from dataframe
all_columns = vehicle.columns

# containers of different variable types
categorical = []

# separate variables
for col in all_columns:
    # count number of unique valus in the column
    len_of_uniq = len(vehicle[col].unique())
    
    # also checking for only object data types
    if (col != "fatality") and (len_of_uniq <= 50) and (vehicle[col].dtype == "O"):
        categorical.append(col)
    else: pass

In [19]:
categorical

['borough',
 'contributing_factor_vehicle_1',
 'contributing_factor_vehicle_2',
 'contributing_factor_vehicle_3',
 'contributing_factor_vehicle_4',
 'contributing_factor_vehicle_5',
 'vehicle_type_code_1',
 'vehicle_type_code_2',
 'vehicle_type_code_3',
 'vehicle_type_code_4',
 'vehicle_type_code_5']

In [20]:
# create dummies of the categorical columns
dummies = pd.get_dummies(vehicle[categorical])

dummies.head(3)

Unnamed: 0,borough_BRONX,borough_BROOKLYN,borough_MANHATTAN,borough_QUEENS,borough_STATEN ISLAND,borough_UNKNOWN,contributing_factor_vehicle_1_Aggressive Driving and Passenger Distraction,contributing_factor_vehicle_1_Distracted Driving,contributing_factor_vehicle_1_Distractions from Electronic Devices,contributing_factor_vehicle_1_Driver Characteristics and Experience,...,vehicle_type_code_5_STATION_WAGON,vehicle_type_code_5_TANKER,vehicle_type_code_5_TAXI,vehicle_type_code_5_TOW_TRUCK,vehicle_type_code_5_TRACTOR_TRUCK_DIESEL,vehicle_type_code_5_TRACTOR_TRUCK_GASOLINE,vehicle_type_code_5_TRAILER,vehicle_type_code_5_TRUCK,vehicle_type_code_5_UNKNOWN,vehicle_type_code_5_VAN
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [21]:
# renmae the dummie columns
dummies = dummies.rename(columns=lambda x: x.lower().replace("-", "_"))
dummies.head(1)

Unnamed: 0,borough_bronx,borough_brooklyn,borough_manhattan,borough_queens,borough_staten island,borough_unknown,contributing_factor_vehicle_1_aggressive driving and passenger distraction,contributing_factor_vehicle_1_distracted driving,contributing_factor_vehicle_1_distractions from electronic devices,contributing_factor_vehicle_1_driver characteristics and experience,...,vehicle_type_code_5_station_wagon,vehicle_type_code_5_tanker,vehicle_type_code_5_taxi,vehicle_type_code_5_tow_truck,vehicle_type_code_5_tractor_truck_diesel,vehicle_type_code_5_tractor_truck_gasoline,vehicle_type_code_5_trailer,vehicle_type_code_5_truck,vehicle_type_code_5_unknown,vehicle_type_code_5_van
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [22]:
# add dummies to the dataset
vehicle[dummies.columns] = dummies

vehicle.head(1)

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,on_street_name,cross_street_name,off_street_name,number_of_persons_injured,...,vehicle_type_code_5_station_wagon,vehicle_type_code_5_tanker,vehicle_type_code_5_taxi,vehicle_type_code_5_tow_truck,vehicle_type_code_5_tractor_truck_diesel,vehicle_type_code_5_tractor_truck_gasoline,vehicle_type_code_5_trailer,vehicle_type_code_5_truck,vehicle_type_code_5_unknown,vehicle_type_code_5_van
0,2021-09-11,2023-07-05 02:39:00,UNKNOWN,0,,,WHITESTONE EXPRESSWAY,20 AVENUE,UNKNOWN,2,...,0,0,0,0,0,0,0,0,1,0


**Save .csv**

In [23]:
# save the dataframe with dummies in a csv for easy access
vehicle.to_csv("./util_/vehicle_encoded.csv", mode="w")

## Split data

In [24]:
# 60/20/20 split
# split test data
train_validate, test = train_test_split(vehicle, 
                                        test_size=0.2, # 20% in the test set
                                        random_state=95)
# split validate data
train, validate = train_test_split(train_validate, 
                                   test_size=0.2/(1-0.2),  # 20% in the validation set
                                    random_state=95)

In [25]:
(train.shape, validate.shape,test.shape)

((1202984, 323), (400995, 323), (400995, 323))