# Chicago Car Crash 

- read in dataset from https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if
- create a model to predict 'REAR END' type crashes
- utilize RandomForestClassifer model to predict target = 'REAR END'

In [1]:
# Importing libraries 
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

from mlxtend.evaluate import feature_importance_permutation

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn

In [2]:
df = pd.read_csv('../data/Traffic_Crashes_-_Crashes.csv')

In [3]:
df

Unnamed: 0,CRASH_RECORD_ID,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,4fd0a3e0897b3335b94cd8d5b2d2b350eb691add56c62d...,JC343143,,07/10/2019 05:56:00 PM,35,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,...,0.0,0.0,3.0,0.0,17,4,7,41.919664,-87.773288,POINT (-87.773287883007 41.919663832993)
1,009e9e67203442370272e1a13d6ee51a4155dac65e583d...,JA329216,,06/30/2017 04:00:00 PM,35,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,0.0,0.0,3.0,0.0,16,6,6,41.741804,-87.740954,POINT (-87.740953581987 41.741803598989)
2,ee9283eff3a55ac50ee58f3d9528ce1d689b1c4180b4c4...,JD292400,,07/10/2020 10:25:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,3.0,0.0,10,6,7,41.773456,-87.585022,POINT (-87.585022352022 41.773455972008)
3,f8960f698e870ebdc60b521b2a141a5395556bc3704191...,JD293602,,07/11/2020 01:00:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,PARKED MOTOR VEHICLE,...,0.0,0.0,3.0,0.0,1,7,7,41.802119,-87.622115,POINT (-87.622114914961 41.802118543011)
4,8eaa2678d1a127804ee9b8c35ddf7d63d913c14eda61d6...,JD290451,,07/08/2020 02:00:00 PM,20,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,...,0.0,0.0,1.0,0.0,14,4,7,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501399,f37d01cdad7c3e9fe01562aa11857405cc2ea38738d8b6...,JE116847,,01/19/2021 07:30:00 AM,15,NO CONTROLS,NO CONTROLS,CLOUDY/OVERCAST,DAYLIGHT,FIXED OBJECT,...,0.0,0.0,2.0,0.0,7,3,1,41.877790,-87.636488,POINT (-87.636487589354 41.877790161619)
501400,f1dba052d8fc8c80d3d693296ff8e0d7cc71d5929677b0...,JE118336,,01/20/2021 05:50:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,...,0.0,0.0,2.0,0.0,17,4,1,41.800697,-87.706358,POINT (-87.706357916164 41.800697419369)
501401,f2962f241a302417913e1b7465e8ae37e6f3161adaa0cf...,JE118196,,01/20/2021 04:15:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,2.0,0.0,16,4,1,41.924709,-87.717371,POINT (-87.717371333013 41.924708659649)
501402,f361f7362d783dc327e5e6b7d2d9bc4bd942284391ad36...,JE118233,,01/20/2021 03:20:00 PM,30,STOP SIGN/FLASHER,NO CONTROLS,CLOUDY/OVERCAST,DAYLIGHT,ANGLE,...,0.0,0.0,4.0,0.0,15,4,1,41.802871,-87.684416,POINT (-87.684416341899 41.802870816721)


In [4]:
# Checking out the dataframe columns
df.columns

Index(['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE_EST_I', 'CRASH_DATE',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
       'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
       'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LA

In [5]:
# filling all na with UNKNOWN string, so we can encode later
df.fillna('UNKOWN', inplace=True)

In [6]:
# dropping the unecessary columns that does not make sense
df.drop(['CRASH_RECORD_ID', 'CRASH_DATE_EST_I', 'RD_NO', 'REPORT_TYPE', 'STREET_NO', 'BEAT_OF_OCCURRENCE', 
         'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'WORKERS_PRESENT_I', 'INJURIES_UNKNOWN', 'LONGITUDE', 'LATITUDE',
         'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL','INJURIES_INCAPACITATING', 
         'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION', 'DAMAGE',
         'DATE_POLICE_NOTIFIED', 'CRASH_TYPE','NUM_UNITS','STREET_DIRECTION','STREET_NAME', 'LANE_CNT'], axis=1, inplace=True)

In [7]:
df

Unnamed: 0,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,...,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LOCATION
0,07/10/2019 05:56:00 PM,35,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,ONE-WAY,STRAIGHT AND LEVEL,DRY,...,UNKOWN,IMPROPER BACKING,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,17,4,7,POINT (-87.773287883007 41.919663832993)
1,06/30/2017 04:00:00 PM,35,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,UNKOWN,FAILING TO YIELD RIGHT-OF-WAY,NOT APPLICABLE,UNKOWN,UNKOWN,UNKOWN,16,6,6,POINT (-87.740953581987 41.741803598989)
2,07/10/2020 10:25:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,...,UNKOWN,FAILING TO YIELD RIGHT-OF-WAY,FAILING TO YIELD RIGHT-OF-WAY,UNKOWN,UNKOWN,UNKOWN,10,6,7,POINT (-87.585022352022 41.773455972008)
3,07/11/2020 01:00:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,DRY,...,Y,UNABLE TO DETERMINE,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,1,7,7,POINT (-87.622114914961 41.802118543011)
4,07/08/2020 02:00:00 PM,20,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,DRIVEWAY,STRAIGHT AND LEVEL,DRY,...,UNKOWN,UNABLE TO DETERMINE,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,14,4,7,UNKOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501399,01/19/2021 07:30:00 AM,15,NO CONTROLS,NO CONTROLS,CLOUDY/OVERCAST,DAYLIGHT,FIXED OBJECT,PARKING LOT,STRAIGHT AND LEVEL,DRY,...,UNKOWN,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,NOT APPLICABLE,UNKOWN,UNKOWN,UNKOWN,7,3,1,POINT (-87.636487589354 41.877790161619)
501400,01/20/2021 05:50:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,Y,UNABLE TO DETERMINE,NOT APPLICABLE,UNKOWN,UNKOWN,UNKOWN,17,4,1,POINT (-87.706357916164 41.800697419369)
501401,01/20/2021 04:15:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,SNOW OR SLUSH,...,UNKOWN,FOLLOWING TOO CLOSELY,FAILING TO REDUCE SPEED TO AVOID CRASH,UNKOWN,UNKOWN,UNKOWN,16,4,1,POINT (-87.717371333013 41.924708659649)
501402,01/20/2021 03:20:00 PM,30,STOP SIGN/FLASHER,NO CONTROLS,CLOUDY/OVERCAST,DAYLIGHT,ANGLE,NOT DIVIDED,STRAIGHT AND LEVEL,WET,...,N,FAILING TO YIELD RIGHT-OF-WAY,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,UNKOWN,UNKOWN,UNKOWN,15,4,1,POINT (-87.684416341899 41.802870816721)


In [8]:
# We have speed limits that are not logged correctly, so we will drop them.
# There wasn't a lot so this will not effect our data
list_ = [3, 9, 99, 39, 1, 2, 32, 33, 6, 24, 11, 34, 18, 12, 36, 7, 14, 16, 38, 31, 22, 23, 63, 4, 26]
for n in list_:
    df.drop(index=df[df['POSTED_SPEED_LIMIT'] == n].index, inplace=True)

In [9]:
df

Unnamed: 0,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,...,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LOCATION
0,07/10/2019 05:56:00 PM,35,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,ONE-WAY,STRAIGHT AND LEVEL,DRY,...,UNKOWN,IMPROPER BACKING,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,17,4,7,POINT (-87.773287883007 41.919663832993)
1,06/30/2017 04:00:00 PM,35,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,UNKOWN,FAILING TO YIELD RIGHT-OF-WAY,NOT APPLICABLE,UNKOWN,UNKOWN,UNKOWN,16,6,6,POINT (-87.740953581987 41.741803598989)
2,07/10/2020 10:25:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,...,UNKOWN,FAILING TO YIELD RIGHT-OF-WAY,FAILING TO YIELD RIGHT-OF-WAY,UNKOWN,UNKOWN,UNKOWN,10,6,7,POINT (-87.585022352022 41.773455972008)
3,07/11/2020 01:00:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,DRY,...,Y,UNABLE TO DETERMINE,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,1,7,7,POINT (-87.622114914961 41.802118543011)
4,07/08/2020 02:00:00 PM,20,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,DRIVEWAY,STRAIGHT AND LEVEL,DRY,...,UNKOWN,UNABLE TO DETERMINE,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,14,4,7,UNKOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501399,01/19/2021 07:30:00 AM,15,NO CONTROLS,NO CONTROLS,CLOUDY/OVERCAST,DAYLIGHT,FIXED OBJECT,PARKING LOT,STRAIGHT AND LEVEL,DRY,...,UNKOWN,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,NOT APPLICABLE,UNKOWN,UNKOWN,UNKOWN,7,3,1,POINT (-87.636487589354 41.877790161619)
501400,01/20/2021 05:50:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,Y,UNABLE TO DETERMINE,NOT APPLICABLE,UNKOWN,UNKOWN,UNKOWN,17,4,1,POINT (-87.706357916164 41.800697419369)
501401,01/20/2021 04:15:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,SNOW OR SLUSH,...,UNKOWN,FOLLOWING TOO CLOSELY,FAILING TO REDUCE SPEED TO AVOID CRASH,UNKOWN,UNKOWN,UNKOWN,16,4,1,POINT (-87.717371333013 41.924708659649)
501402,01/20/2021 03:20:00 PM,30,STOP SIGN/FLASHER,NO CONTROLS,CLOUDY/OVERCAST,DAYLIGHT,ANGLE,NOT DIVIDED,STRAIGHT AND LEVEL,WET,...,N,FAILING TO YIELD RIGHT-OF-WAY,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,UNKOWN,UNKOWN,UNKOWN,15,4,1,POINT (-87.684416341899 41.802870816721)


## OneHotEncoding: FIRST_CRASH_TYPE

In [10]:
ohe = OneHotEncoder(handle_unknown='ignore')

In [11]:
# Creating a new dataframe for FIRST_CRASH_TYPE
# Then we will OneHotEncode the data to categories
crash_type = df['FIRST_CRASH_TYPE']
crash_df = pd.DataFrame(crash_type, columns=['FIRST_CRASH_TYPE'])
crash_df = pd.DataFrame(ohe.fit_transform(crash_df[['FIRST_CRASH_TYPE']]).toarray())
crash_df.columns = ohe.get_feature_names(['FIRST_CRASH_TYPE'])

In [12]:
# # Each column will be a FIRST_CRASH_TYPE, so we will need to label each column
# crash_col = df['FIRST_CRASH_TYPE'].unique()
# crash_df.columns = crash_col

In [13]:
crash_df.columns

Index(['FIRST_CRASH_TYPE_ANGLE', 'FIRST_CRASH_TYPE_ANIMAL',
       'FIRST_CRASH_TYPE_FIXED OBJECT', 'FIRST_CRASH_TYPE_HEAD ON',
       'FIRST_CRASH_TYPE_OTHER NONCOLLISION', 'FIRST_CRASH_TYPE_OTHER OBJECT',
       'FIRST_CRASH_TYPE_OVERTURNED', 'FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE',
       'FIRST_CRASH_TYPE_PEDALCYCLIST', 'FIRST_CRASH_TYPE_PEDESTRIAN',
       'FIRST_CRASH_TYPE_REAR END', 'FIRST_CRASH_TYPE_REAR TO FRONT',
       'FIRST_CRASH_TYPE_REAR TO REAR', 'FIRST_CRASH_TYPE_REAR TO SIDE',
       'FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION',
       'FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION', 'FIRST_CRASH_TYPE_TRAIN',
       'FIRST_CRASH_TYPE_TURNING'],
      dtype='object')

In [14]:
# 97,954 "REAR END" type crashes from 300,332 total types of crashes
# very unbalanced 
crash_df['FIRST_CRASH_TYPE_REAR END'].value_counts()

0.0    381954
1.0    118973
Name: FIRST_CRASH_TYPE_REAR END, dtype: int64

In [15]:
print(f'REAR ENDS are : {(97954*100)/300332}% of total crashes')

REAR ENDS are : 32.61523913535687% of total crashes


## LabelEncoding: FEATURES

In [16]:
# Here we select the features we want to use and LabelEncode them using a for loop
# We will also create a new dataframe for them
text_col = ['TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'TRAFFICWAY_TYPE', 
            'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I',
            'HIT_AND_RUN_I', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE','DOORING_I', 'WORK_ZONE_I', 
            'WORK_ZONE_TYPE']

en_df = pd.DataFrame()
for col in text_col:
    encoder = LabelEncoder()
    en_df[col] = encoder.fit_transform(df[col])
    
en_df

Unnamed: 0,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE
0,4,3,2,3,10,3,0,1,1,1,1,21,36,1,1,3
1,15,1,2,3,8,3,0,1,2,1,1,18,26,1,1,3
2,16,1,2,3,6,3,0,1,1,1,1,18,18,1,1,3
3,4,3,2,0,2,3,0,1,1,1,2,36,36,1,1,3
4,4,3,2,3,4,3,0,1,1,1,1,36,36,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500922,4,3,3,3,12,3,0,1,1,1,1,12,26,1,1,3
500923,4,3,2,1,8,3,0,1,1,1,2,36,26,1,1,3
500924,16,1,2,3,8,3,4,1,1,1,1,19,17,1,1,3
500925,15,3,3,3,8,3,6,1,2,1,0,18,12,1,1,3


In [17]:
# Merge the FIRST_CRASH_TYPE and the LabelEncoded dataframe together
df2 = pd.merge(crash_df, en_df, left_index=True, right_index=True)

In [18]:
# Here we have the Features and all Crash types 
df2.head()

Unnamed: 0,FIRST_CRASH_TYPE_ANGLE,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OTHER NONCOLLISION,FIRST_CRASH_TYPE_OTHER OBJECT,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,...,ROADWAY_SURFACE_COND,ROAD_DEFECT,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,1,1,1,21,36,1,1,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,2,1,1,18,26,1,1,3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,1,1,1,18,18,1,1,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,1,1,1,2,36,36,1,1,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,1,1,1,1,36,36,1,1,3


## TARGET: Rear End

In [19]:
# Defining our Features and Target
# For X, we want to remove all of the other CRASH types and keep the features.
y = df2['FIRST_CRASH_TYPE_REAR END']
X = df2.drop(columns=crash_df.columns.tolist(), axis=1)

In [20]:
y.value_counts()

0.0    381954
1.0    118973
Name: FIRST_CRASH_TYPE_REAR END, dtype: int64

In [21]:
df2.head()

Unnamed: 0,FIRST_CRASH_TYPE_ANGLE,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OTHER NONCOLLISION,FIRST_CRASH_TYPE_OTHER OBJECT,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,...,ROADWAY_SURFACE_COND,ROAD_DEFECT,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,1,1,1,21,36,1,1,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,2,1,1,18,26,1,1,3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,1,1,1,18,18,1,1,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,1,1,1,2,36,36,1,1,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,1,1,1,1,36,36,1,1,3


## Vanilla RandomForestClassifier: Rear End (minor parameter changes)

In [None]:
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
rf.fit(X_train, y_train)

In [None]:
y_test_pred = rf.predict(X_test)

In [None]:
print(f'Accuracy Score Test: {accuracy_score(y_test, y_test_pred)}')
print(f'ROC_AUC Test: {roc_auc_score(y_test, y_test_pred)}')
# ROC_AUC: It tells how much a model is capable of distinguishing between classes.

### Feature Importance: RandomForestClassifier

In [None]:
importances = rf.feature_importances_

#argsort will return sorted indices by sorting array (highest to lowest)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

In [None]:
X.columns[11:13]
#PRIM_CONTRIBUTORY_CAUSE is the most importance feature, provides helpful data.

In [None]:
print(df['PRIM_CONTRIBUTORY_CAUSE'])
print(df['SEC_CONTRIBUTORY_CAUSE'])

## RandomForestClassifier: Hyperparameters (GridSearchCV)

In [None]:
# Here we set a some parameters that the GridSearch will run through.
grid_p = {"n_estimators": [50, 100],
          "criterion": ["gini", "entropy"],
          "max_depth": [4, 6, 10],
          "min_samples_split": [5, 10],
          "min_samples_leaf": [5, 10],
         "max_samples": [400]}

grid_search = GridSearchCV(rf, grid_p, n_jobs=-1, cv=4, scoring='roc_auc')
grid_search.fit(X_train, y_train)

In [None]:
print(f'Best ROC_AUC score: {grid_search.best_score_}\nPrevious score: 0.78')

In [None]:
grid_search.best_params_

In [None]:
grid_p = {"n_estimators": [50, 100],
          "criterion": ["gini", "entropy"],
          "max_depth": [4, 6, 10],
          "min_samples_split": [5, 10],
          "min_samples_leaf": [5, 10],
         "max_samples": [400]}

grid_search = GridSearchCV(rf, grid_p, n_jobs=-1, cv=4, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [None]:
print(f'Best Accuracy score: {grid_search.best_score_}\nPrevious score: 0.80')

## Data Visual: Rear End Crashes

In [None]:
# Selecting the columns with only "REAR END" crashes
rear_end = df[df['FIRST_CRASH_TYPE'] == 'REAR END']

In [None]:
# Bar graph of the top 10 PRIMARY_CONTRIBUTORY

plt.figure(figsize=(8,8))

y= rear_end.PRIM_CONTRIBUTORY_CAUSE.value_counts().values[:10]
x=rear_end.PRIM_CONTRIBUTORY_CAUSE.value_counts().index[:10]

sn.barplot(y, x)
plt.title('Common Attributes Of REAR END Crashes', size=20)
plt.ylabel("(Contributory)", size=20, rotation=0)
plt.xlabel("(Quantity)", size=20)
plt.xticks(size=10)
plt.yticks(size=10);