In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### Import, explore, and clean Crash Data

In [2]:
#import Crash DataFrame 
crash_df = pd.read_csv('data/Traffic_Crashes_-_Crashes.csv')

In [3]:
crash_df

Unnamed: 0,CRASH_RECORD_ID,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,0001dc2c34878baec9b7223e7ead101e0487e2e994c977...,JF221668,,04/27/2022 09:30:00 AM,20,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,...,1.0,0.0,2.0,0.0,9,4,4,41.926951,-87.661559,POINT (-87.661558949813 41.926951230142)
1,00554edcbf68c6eb4d438e92ce71a593e858971fd885a4...,JF228356,,05/03/2022 06:40:00 AM,15,OTHER REG. SIGN,FUNCTIONING PROPERLY,RAIN,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,...,0.0,0.0,5.0,0.0,6,3,5,41.927526,-87.765423,POINT (-87.765422741103 41.927525861297)
2,01143c127253f877ec850422012fae34b6b0e58bf678ae...,JD337690,,08/19/2020 09:30:00 AM,25,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,...,0.0,0.0,2.0,0.0,9,4,8,41.932352,-87.688045,POINT (-87.688044974908 41.932351848527)
3,00bf78dfa54ff84306859dc5d220341f1891eaf5fcf6a6...,JF213955,,04/20/2022 04:32:00 PM,15,OTHER,OTHER,CLEAR,DAYLIGHT,REAR TO FRONT,...,0.0,0.0,2.0,0.0,16,4,4,41.794998,-87.622620,POINT (-87.622620128003 41.794997895717)
4,0161c604b1fd2e187d5f4239be87a8b2d8be36b193f01b...,JF221562,Y,04/27/2022 07:00:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,...,0.0,0.0,1.0,0.0,7,4,4,41.752961,-87.550746,POINT (-87.550746027529 41.752960600041)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617341,8f081a009f98c6fe3ffa5968b81d24607b38ef78da4838...,JF224470,,04/29/2022 04:25:00 PM,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,3.0,0.0,16,6,4,41.876198,-87.686155,POINT (-87.686155352745 41.876198079481)
617342,a990787c46a181e9611488f24a7e8453931c9d9057206f...,JF230804,,05/01/2022 09:00:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLOUDY/OVERCAST,DAYLIGHT,REAR END,...,0.0,0.0,2.0,0.0,9,1,5,41.980961,-87.839166,POINT (-87.839165640216 41.980961196676)
617343,cf0c350753ba62e99a06f77de5947b775b9e55917bd66b...,JF230794,,05/05/2022 07:27:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,0.0,0.0,3.0,0.0,7,5,5,41.814837,-87.743501,POINT (-87.743501480634 41.81483675036)
617344,d33181def1c0c3a57ecabef00a109440b361c01fb392bf...,JF230806,,05/05/2022 07:40:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,0.0,0.0,2.0,0.0,7,5,5,41.707680,-87.584816,POINT (-87.584816108862 41.707680407853)


In [4]:
crash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617346 entries, 0 to 617345
Data columns (total 49 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                617346 non-null  object 
 1   RD_NO                          613078 non-null  object 
 2   CRASH_DATE_EST_I               46685 non-null   object 
 3   CRASH_DATE                     617346 non-null  object 
 4   POSTED_SPEED_LIMIT             617346 non-null  int64  
 5   TRAFFIC_CONTROL_DEVICE         617346 non-null  object 
 6   DEVICE_CONDITION               617346 non-null  object 
 7   WEATHER_CONDITION              617346 non-null  object 
 8   LIGHTING_CONDITION             617346 non-null  object 
 9   FIRST_CRASH_TYPE               617346 non-null  object 
 10  TRAFFICWAY_TYPE                617346 non-null  object 
 11  LANE_CNT                       198984 non-null  float64
 12  ALIGNMENT                     

In [5]:
crash_df.describe()

Unnamed: 0,POSTED_SPEED_LIMIT,LANE_CNT,STREET_NO,BEAT_OF_OCCURRENCE,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE
count,617346.0,198984.0,617346.0,617341.0,617346.0,616067.0,616067.0,616067.0,616067.0,616067.0,616067.0,616067.0,617346.0,617346.0,617346.0,613604.0,613604.0
mean,28.346645,13.33148,3678.675046,1238.055031,2.034235,0.182629,0.001156,0.019678,0.102456,0.059339,2.012385,0.0,13.223039,4.127933,6.596502,41.854468,-87.673255
std,6.329037,2961.787,2903.917903,705.722836,0.450624,0.554216,0.036907,0.163796,0.412342,0.312482,1.161061,0.0,5.537447,1.979825,3.450357,0.331937,0.674144
min,0.0,0.0,0.0,111.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-87.936193
25%,30.0,2.0,1230.0,712.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,2.0,4.0,41.78096,-87.721366
50%,30.0,2.0,3200.0,1135.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,14.0,4.0,7.0,41.874571,-87.673567
75%,30.0,4.0,5600.0,1822.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,17.0,6.0,10.0,41.923961,-87.632862
max,99.0,1191625.0,451100.0,6100.0,18.0,21.0,4.0,7.0,21.0,15.0,61.0,0.0,23.0,7.0,12.0,42.02278,0.0


In [6]:
#Drop Irrelevant columns 
crash_df.drop(['RD_NO', 'LANE_CNT','TRAFFIC_CONTROL_DEVICE','DEVICE_CONDITION', 'SEC_CONTRIBUTORY_CAUSE', 'CRASH_DATE_EST_I','TRAFFICWAY_TYPE','ALIGNMENT','ROAD_DEFECT','REPORT_TYPE','DATE_POLICE_NOTIFIED','STREET_NO','STREET_DIRECTION','STREET_NAME','PHOTOS_TAKEN_I','STATEMENTS_TAKEN_I','DOORING_I','WORK_ZONE_I','BEAT_OF_OCCURRENCE','WORK_ZONE_TYPE','WORKERS_PRESENT_I','INJURIES_TOTAL','INJURIES_FATAL','INJURIES_REPORTED_NOT_EVIDENT','INJURIES_NON_INCAPACITATING','INJURIES_NO_INDICATION','INJURIES_UNKNOWN','LATITUDE','LONGITUDE','LOCATION'], axis=1, inplace=True)

In [7]:
crash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617346 entries, 0 to 617345
Data columns (total 19 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CRASH_RECORD_ID          617346 non-null  object 
 1   CRASH_DATE               617346 non-null  object 
 2   POSTED_SPEED_LIMIT       617346 non-null  int64  
 3   WEATHER_CONDITION        617346 non-null  object 
 4   LIGHTING_CONDITION       617346 non-null  object 
 5   FIRST_CRASH_TYPE         617346 non-null  object 
 6   ROADWAY_SURFACE_COND     617346 non-null  object 
 7   CRASH_TYPE               617346 non-null  object 
 8   INTERSECTION_RELATED_I   141110 non-null  object 
 9   NOT_RIGHT_OF_WAY_I       29078 non-null   object 
 10  HIT_AND_RUN_I            189199 non-null  object 
 11  DAMAGE                   617346 non-null  object 
 12  PRIM_CONTRIBUTORY_CAUSE  617346 non-null  object 
 13  NUM_UNITS                617346 non-null  int64  
 14  MOST

In [8]:
#Fill/Drop relevant nulls 
crash_df["INTERSECTION_RELATED_I"].fillna("Unknown", inplace=True)
crash_df["NOT_RIGHT_OF_WAY_I"].fillna("Unknown", inplace=True)
crash_df["HIT_AND_RUN_I"].fillna("Unknown", inplace=True)
crash_df["MOST_SEVERE_INJURY"].fillna("Unknown", inplace=True)
crash_df.dropna(subset=["INJURIES_INCAPACITATING"], inplace=True)

### Import, explore, and clean People DataFrame

In [9]:
#import People DataFrame 
people_df = pd.read_csv('data/Traffic_Crashes_-_People.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
#people_df

In [11]:
#people_df.info()

In [12]:
#Drop irrelevant columns
people_df.drop(['RD_NO', 'CRASH_DATE', 'SEAT_NO','CITY','STATE','ZIPCODE','DRIVERS_LICENSE_STATE','DRIVERS_LICENSE_CLASS','EJECTION','INJURY_CLASSIFICATION','HOSPITAL','EMS_AGENCY','EMS_RUN_NO','PEDPEDAL_ACTION','PEDPEDAL_VISIBILITY','PEDPEDAL_LOCATION','BAC_RESULT','BAC_RESULT VALUE','CELL_PHONE_USE'], axis=1, inplace=True)

In [13]:
#Remove nulls from relevant rows 
people_df.dropna(subset=["VEHICLE_ID"], inplace=True)
people_df.dropna(subset=["SEX"], inplace=True)
people_df.dropna(subset=["SAFETY_EQUIPMENT"], inplace=True)
people_df.dropna(subset=["AIRBAG_DEPLOYED"], inplace=True)
people_df.dropna(subset=["DRIVER_ACTION"], inplace=True)
people_df.dropna(subset=["DRIVER_VISION"], inplace=True)
people_df.dropna(subset=["PHYSICAL_CONDITION"], inplace=True)
people_df.dropna(subset=["AGE"], inplace=True)

In [14]:
people_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 777348 entries, 0 to 1364184
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   PERSON_ID           777348 non-null  object 
 1   PERSON_TYPE         777348 non-null  object 
 2   CRASH_RECORD_ID     777348 non-null  object 
 3   VEHICLE_ID          777348 non-null  float64
 4   SEX                 777348 non-null  object 
 5   AGE                 777348 non-null  float64
 6   SAFETY_EQUIPMENT    777348 non-null  object 
 7   AIRBAG_DEPLOYED     777348 non-null  object 
 8   DRIVER_ACTION       777348 non-null  object 
 9   DRIVER_VISION       777348 non-null  object 
 10  PHYSICAL_CONDITION  777348 non-null  object 
dtypes: float64(2), object(9)
memory usage: 71.2+ MB


### Import, explore, and clean Car DataFrame

In [15]:
car_df = pd.read_csv('data/Traffic_Crashes_-_Vehicles.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [16]:
#car_df

In [17]:
#car_df.info()

In [19]:
#Create new Car DataFrame with relevant columns 
clean_car_df = car_df[['CRASH_RECORD_ID','UNIT_TYPE','MAKE','MODEL','VEHICLE_YEAR','VEHICLE_DEFECT','VEHICLE_TYPE','VEHICLE_USE','MANEUVER', 'TOWED_I','EXCEED_SPEED_LIMIT_I']]

In [20]:
#clean_car_df

In [21]:
#clean_car_df.info()

In [22]:
#Drop nulls 
clean_car_df.dropna(subset=["UNIT_TYPE"], inplace=True)
clean_car_df.dropna(subset=["MAKE"], inplace=True)
clean_car_df.dropna(subset=["MODEL"], inplace=True)
clean_car_df.dropna(subset=["VEHICLE_YEAR"], inplace=True)
clean_car_df.dropna(subset=["VEHICLE_DEFECT"], inplace=True)
clean_car_df.dropna(subset=["VEHICLE_USE"], inplace=True)
clean_car_df.dropna(subset=["MANEUVER"], inplace=True)
clean_car_df["TOWED_I"].fillna("Unknown", inplace=True)
clean_car_df["EXCEED_SPEED_LIMIT_I"].fillna("Unknown", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [23]:
clean_car_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1035864 entries, 0 to 1266485
Data columns (total 11 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   CRASH_RECORD_ID       1035864 non-null  object 
 1   UNIT_TYPE             1035864 non-null  object 
 2   MAKE                  1035864 non-null  object 
 3   MODEL                 1035864 non-null  object 
 4   VEHICLE_YEAR          1035864 non-null  float64
 5   VEHICLE_DEFECT        1035864 non-null  object 
 6   VEHICLE_TYPE          1035864 non-null  object 
 7   VEHICLE_USE           1035864 non-null  object 
 8   MANEUVER              1035864 non-null  object 
 9   TOWED_I               1035864 non-null  object 
 10  EXCEED_SPEED_LIMIT_I  1035864 non-null  object 
dtypes: float64(1), object(10)
memory usage: 94.8+ MB


### Merge Crash, People, and Car DataFrames, then explore and clean new DataFrame

In [24]:
#merge crash data and people data 
crash_people_df = pd.merge(crash_df,people_df, how='left',left_on = 'CRASH_RECORD_ID', right_on = "CRASH_RECORD_ID", indicator=True)

#remove duplicates 
crash_people_df.drop_duplicates(subset = 'CRASH_RECORD_ID', inplace = True)

In [25]:
#rename '_merge' column to 'Check', necessary for second merge 
crash_people_df.rename(columns = {'_merge':'Check'}, inplace = True)

In [26]:
#Merge crash, people, and car DataFrames together(CPC) 
cpc_df = pd.merge(crash_people_df, clean_car_df, how='left',left_on = 'CRASH_RECORD_ID', right_on = "CRASH_RECORD_ID", indicator=True)

#Drop duplicates 
cpc_df.drop_duplicates(subset = 'CRASH_RECORD_ID', inplace = True)

In [27]:
pd.set_option('display.max_columns', None)

In [28]:
cpc_df

Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE,POSTED_SPEED_LIMIT,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,ROADWAY_SURFACE_COND,CRASH_TYPE,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,DAMAGE,PRIM_CONTRIBUTORY_CAUSE,NUM_UNITS,MOST_SEVERE_INJURY,INJURIES_INCAPACITATING,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,PERSON_ID,PERSON_TYPE,VEHICLE_ID,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,DRIVER_ACTION,DRIVER_VISION,PHYSICAL_CONDITION,Check,UNIT_TYPE,MAKE,MODEL,VEHICLE_YEAR,VEHICLE_DEFECT,VEHICLE_TYPE,VEHICLE_USE,MANEUVER,TOWED_I,EXCEED_SPEED_LIMIT_I,_merge
0,0001dc2c34878baec9b7223e7ead101e0487e2e994c977...,04/27/2022 09:30:00 AM,20,CLEAR,DAYLIGHT,ANGLE,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Unknown,"OVER $1,500",UNABLE TO DETERMINE,2,NONINCAPACITATING INJURY,0.0,9,4,4,O1324770,DRIVER,1258370.0,M,24.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN,UNKNOWN,NORMAL,both,DRIVER,TOYOTA,RAV4,2007.0,UNKNOWN,PASSENGER,PERSONAL,STRAIGHT AHEAD,Unknown,Unknown,both
2,00554edcbf68c6eb4d438e92ce71a593e858971fd885a4...,05/03/2022 06:40:00 AM,15,RAIN,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,WET,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Unknown,"OVER $1,500",FAILING TO YIELD RIGHT-OF-WAY,3,NO INDICATION OF INJURY,0.0,6,3,5,O1328703,DRIVER,1262176.0,F,20.0,SAFETY BELT USED,DID NOT DEPLOY,FAILED TO YIELD,UNKNOWN,NORMAL,both,DRIVER,NISSAN,VERSA,2007.0,NONE,PASSENGER,PERSONAL,ENTERING TRAFFIC LANE FROM PARKING,Y,Unknown,both
5,01143c127253f877ec850422012fae34b6b0e58bf678ae...,08/19/2020 09:30:00 AM,25,CLEAR,DAYLIGHT,ANGLE,DRY,NO INJURY / DRIVE AWAY,Unknown,Y,Unknown,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,9,4,8,O941437,DRIVER,892450.0,M,26.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN,UNKNOWN,UNKNOWN,both,,,,,,,,,,,left_only
6,00bf78dfa54ff84306859dc5d220341f1891eaf5fcf6a6...,04/20/2022 04:32:00 PM,15,CLEAR,DAYLIGHT,REAR TO FRONT,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,16,4,4,O1320543,DRIVER,1254277.0,M,30.0,USAGE UNKNOWN,DID NOT DEPLOY,IMPROPER BACKING,UNKNOWN,UNKNOWN,both,DRIVER,JEEP,COMPASS,2012.0,WINDOWS,SPORT UTILITY VEHICLE (SUV),PERSONAL,BACKING,Unknown,Unknown,both
8,0161c604b1fd2e187d5f4239be87a8b2d8be36b193f01b...,04/27/2022 07:00:00 AM,30,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Y,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,7,4,4,,,,,,,,,,,left_only,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1059017,8f081a009f98c6fe3ffa5968b81d24607b38ef78da4838...,04/29/2022 04:25:00 PM,30,CLEAR,DAYLIGHT,REAR END,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Y,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,16,6,4,O1326355,DRIVER,1260025.0,M,34.0,USAGE UNKNOWN,NOT APPLICABLE,NONE,NOT OBSCURED,NORMAL,both,DRIVER,CHEVROLET,EQUINOX,2006.0,UNKNOWN,PASSENGER,PERSONAL,STRAIGHT AHEAD,Y,Unknown,both
1059019,a990787c46a181e9611488f24a7e8453931c9d9057206f...,05/01/2022 09:00:00 AM,30,CLOUDY/OVERCAST,DAYLIGHT,REAR END,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,9,1,5,O1330192,DRIVER,1263607.0,M,36.0,USAGE UNKNOWN,DEPLOYMENT UNKNOWN,OTHER,UNKNOWN,NORMAL,both,DRIVER,MERCEDES-BENZ,OTHER (EXPLAIN IN NARRATIVE),2012.0,UNKNOWN,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,Unknown,Unknown,both
1059021,cf0c350753ba62e99a06f77de5947b775b9e55917bd66b...,05/05/2022 07:27:00 AM,30,CLEAR,DAYLIGHT,TURNING,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,"OVER $1,500",IMPROPER TURNING/NO SIGNAL,2,NO INDICATION OF INJURY,0.0,7,5,5,O1330248,DRIVER,1263658.0,F,40.0,SAFETY BELT USED,NOT APPLICABLE,IMPROPER TURN,NOT OBSCURED,NORMAL,both,DRIVER,HONDA,CR-V,2013.0,NONE,PASSENGER,PERSONAL,TURNING RIGHT,Unknown,Unknown,both
1059023,d33181def1c0c3a57ecabef00a109440b361c01fb392bf...,05/05/2022 07:40:00 AM,30,CLEAR,DAYLIGHT,TURNING,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Unknown,"OVER $1,500",IMPROPER TURNING/NO SIGNAL,2,NO INDICATION OF INJURY,0.0,7,5,5,O1330195,DRIVER,1263606.0,M,38.0,USAGE UNKNOWN,NOT APPLICABLE,UNKNOWN,UNKNOWN,NORMAL,both,DRIVER,CHEVROLET,CAMARO,2018.0,UNKNOWN,PASSENGER,PERSONAL,STRAIGHT AHEAD,Unknown,Unknown,both


In [29]:
cpc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 616067 entries, 0 to 1059025
Data columns (total 41 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   CRASH_RECORD_ID          616067 non-null  object  
 1   CRASH_DATE               616067 non-null  object  
 2   POSTED_SPEED_LIMIT       616067 non-null  int64   
 3   WEATHER_CONDITION        616067 non-null  object  
 4   LIGHTING_CONDITION       616067 non-null  object  
 5   FIRST_CRASH_TYPE         616067 non-null  object  
 6   ROADWAY_SURFACE_COND     616067 non-null  object  
 7   CRASH_TYPE               616067 non-null  object  
 8   INTERSECTION_RELATED_I   616067 non-null  object  
 9   NOT_RIGHT_OF_WAY_I       616067 non-null  object  
 10  HIT_AND_RUN_I            616067 non-null  object  
 11  DAMAGE                   616067 non-null  object  
 12  PRIM_CONTRIBUTORY_CAUSE  616067 non-null  object  
 13  NUM_UNITS                616067 non-null  i

In [30]:
#create a new column with only the top 100 makes, and an 'Other' category for all the others 
#chanaged from 100 to 150
TOP_MAKES = cpc_df['MAKE'].value_counts()
threshold = 150
cpc_df['TOP_MAKES'] = np.where(cpc_df['MAKE'].isin(TOP_MAKES.index[TOP_MAKES >= threshold ]), cpc_df['MAKE'], 'other')

In [31]:
#Use map function to create a binary target column 
#helps to create more balanced dataset 
map = {"OVER $1,500":1,"$501 - $1,500": 0, "$500 OR LESS": 0}

cpc_df["Target"] = cpc_df["DAMAGE"].map(map)

In [32]:
#check for balanced dataset 
cpc_df["Target"].value_counts(normalize=True)

1    0.597682
0    0.402318
Name: Target, dtype: float64

In [None]:
#cpc_df.info()

In [33]:
#drop irrelevant columns 
#removed 'maneuvers' from this list (so it is now included in the data)
cpc_df.drop(['PERSON_ID','CRASH_RECORD_ID','DAMAGE','CRASH_DATE','PERSON_TYPE', 'VEHICLE_ID','SAFETY_EQUIPMENT','DRIVER_VISION','Check','_merge','MODEL','MAKE','VEHICLE_DEFECT','VEHICLE_USE','EXCEED_SPEED_LIMIT_I'], axis=1, inplace=True)

In [34]:
#drop nulls 
cpc_df.dropna(subset=["SEX"], inplace=True)
cpc_df.dropna(subset=["VEHICLE_YEAR"], inplace=True)

In [35]:
cpc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 481160 entries, 0 to 1059025
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   POSTED_SPEED_LIMIT       481160 non-null  int64  
 1   WEATHER_CONDITION        481160 non-null  object 
 2   LIGHTING_CONDITION       481160 non-null  object 
 3   FIRST_CRASH_TYPE         481160 non-null  object 
 4   ROADWAY_SURFACE_COND     481160 non-null  object 
 5   CRASH_TYPE               481160 non-null  object 
 6   INTERSECTION_RELATED_I   481160 non-null  object 
 7   NOT_RIGHT_OF_WAY_I       481160 non-null  object 
 8   HIT_AND_RUN_I            481160 non-null  object 
 9   PRIM_CONTRIBUTORY_CAUSE  481160 non-null  object 
 10  NUM_UNITS                481160 non-null  int64  
 11  MOST_SEVERE_INJURY       481160 non-null  object 
 12  INJURIES_INCAPACITATING  481160 non-null  float64
 13  CRASH_HOUR               481160 non-null  int64  
 14  CRA

In [84]:
X = cpc_df.drop(columns='Target')
y = cpc_df["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [85]:
#get categorical columns
cat_cols = [c for c in cpc_df.columns if cpc_df[c].dtype == "O"]
cat_cols

['WEATHER_CONDITION',
 'LIGHTING_CONDITION',
 'FIRST_CRASH_TYPE',
 'ROADWAY_SURFACE_COND',
 'CRASH_TYPE',
 'INTERSECTION_RELATED_I',
 'NOT_RIGHT_OF_WAY_I',
 'HIT_AND_RUN_I',
 'PRIM_CONTRIBUTORY_CAUSE',
 'MOST_SEVERE_INJURY',
 'SEX',
 'AIRBAG_DEPLOYED',
 'DRIVER_ACTION',
 'PHYSICAL_CONDITION',
 'UNIT_TYPE',
 'VEHICLE_TYPE',
 'MANEUVER',
 'TOWED_I',
 'TOP_MAKES']

In [86]:
from sklearn.compose import ColumnTransformer

#create encoder object - to help convert cat. variables to new columns
encoder = OneHotEncoder(handle_unknown = 'error',
                       drop = 'first',
                       categories='auto', sparse=False)

#create columntransformer object - to help merge transformed columns
#with the rest of the dataset

ct = ColumnTransformer(transformers=[('ohe', encoder, cat_cols)],
                      remainder='drop')

ct.fit(X_train)
X_train_enc = ct.transform(X_train)
X_test_enc = ct.transform(X_test)

In [87]:
X_train_enc

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [88]:
pd.DataFrame(X_train_enc, columns= ct.get_feature_names_out())

Unnamed: 0,ohe__WEATHER_CONDITION_BLOWING SNOW,ohe__WEATHER_CONDITION_CLEAR,ohe__WEATHER_CONDITION_CLOUDY/OVERCAST,ohe__WEATHER_CONDITION_FOG/SMOKE/HAZE,ohe__WEATHER_CONDITION_FREEZING RAIN/DRIZZLE,ohe__WEATHER_CONDITION_OTHER,ohe__WEATHER_CONDITION_RAIN,ohe__WEATHER_CONDITION_SEVERE CROSS WIND GATE,ohe__WEATHER_CONDITION_SLEET/HAIL,ohe__WEATHER_CONDITION_SNOW,ohe__WEATHER_CONDITION_UNKNOWN,"ohe__LIGHTING_CONDITION_DARKNESS, LIGHTED ROAD",ohe__LIGHTING_CONDITION_DAWN,ohe__LIGHTING_CONDITION_DAYLIGHT,ohe__LIGHTING_CONDITION_DUSK,ohe__LIGHTING_CONDITION_UNKNOWN,ohe__FIRST_CRASH_TYPE_ANIMAL,ohe__FIRST_CRASH_TYPE_FIXED OBJECT,ohe__FIRST_CRASH_TYPE_HEAD ON,ohe__FIRST_CRASH_TYPE_OTHER NONCOLLISION,ohe__FIRST_CRASH_TYPE_OTHER OBJECT,ohe__FIRST_CRASH_TYPE_OVERTURNED,ohe__FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,ohe__FIRST_CRASH_TYPE_PEDALCYCLIST,ohe__FIRST_CRASH_TYPE_PEDESTRIAN,ohe__FIRST_CRASH_TYPE_REAR END,ohe__FIRST_CRASH_TYPE_REAR TO FRONT,ohe__FIRST_CRASH_TYPE_REAR TO REAR,ohe__FIRST_CRASH_TYPE_REAR TO SIDE,ohe__FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,ohe__FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,ohe__FIRST_CRASH_TYPE_TRAIN,ohe__FIRST_CRASH_TYPE_TURNING,ohe__ROADWAY_SURFACE_COND_ICE,ohe__ROADWAY_SURFACE_COND_OTHER,"ohe__ROADWAY_SURFACE_COND_SAND, MUD, DIRT",ohe__ROADWAY_SURFACE_COND_SNOW OR SLUSH,ohe__ROADWAY_SURFACE_COND_UNKNOWN,ohe__ROADWAY_SURFACE_COND_WET,ohe__CRASH_TYPE_NO INJURY / DRIVE AWAY,ohe__INTERSECTION_RELATED_I_Unknown,ohe__INTERSECTION_RELATED_I_Y,ohe__NOT_RIGHT_OF_WAY_I_Unknown,ohe__NOT_RIGHT_OF_WAY_I_Y,ohe__HIT_AND_RUN_I_Unknown,ohe__HIT_AND_RUN_I_Y,ohe__PRIM_CONTRIBUTORY_CAUSE_BICYCLE ADVANCING LEGALLY ON RED LIGHT,ohe__PRIM_CONTRIBUTORY_CAUSE_CELL PHONE USE OTHER THAN TEXTING,ohe__PRIM_CONTRIBUTORY_CAUSE_DISREGARDING OTHER TRAFFIC SIGNS,ohe__PRIM_CONTRIBUTORY_CAUSE_DISREGARDING ROAD MARKINGS,ohe__PRIM_CONTRIBUTORY_CAUSE_DISREGARDING STOP SIGN,ohe__PRIM_CONTRIBUTORY_CAUSE_DISREGARDING TRAFFIC SIGNALS,ohe__PRIM_CONTRIBUTORY_CAUSE_DISREGARDING YIELD SIGN,ohe__PRIM_CONTRIBUTORY_CAUSE_DISTRACTION - FROM INSIDE VEHICLE,ohe__PRIM_CONTRIBUTORY_CAUSE_DISTRACTION - FROM OUTSIDE VEHICLE,"ohe__PRIM_CONTRIBUTORY_CAUSE_DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)",ohe__PRIM_CONTRIBUTORY_CAUSE_DRIVING ON WRONG SIDE/WRONG WAY,ohe__PRIM_CONTRIBUTORY_CAUSE_DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,ohe__PRIM_CONTRIBUTORY_CAUSE_EQUIPMENT - VEHICLE CONDITION,"ohe__PRIM_CONTRIBUTORY_CAUSE_EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST",ohe__PRIM_CONTRIBUTORY_CAUSE_EXCEEDING AUTHORIZED SPEED LIMIT,ohe__PRIM_CONTRIBUTORY_CAUSE_EXCEEDING SAFE SPEED FOR CONDITIONS,ohe__PRIM_CONTRIBUTORY_CAUSE_FAILING TO REDUCE SPEED TO AVOID CRASH,ohe__PRIM_CONTRIBUTORY_CAUSE_FAILING TO YIELD RIGHT-OF-WAY,ohe__PRIM_CONTRIBUTORY_CAUSE_FOLLOWING TOO CLOSELY,ohe__PRIM_CONTRIBUTORY_CAUSE_HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE),ohe__PRIM_CONTRIBUTORY_CAUSE_IMPROPER BACKING,ohe__PRIM_CONTRIBUTORY_CAUSE_IMPROPER LANE USAGE,ohe__PRIM_CONTRIBUTORY_CAUSE_IMPROPER OVERTAKING/PASSING,ohe__PRIM_CONTRIBUTORY_CAUSE_IMPROPER TURNING/NO SIGNAL,ohe__PRIM_CONTRIBUTORY_CAUSE_MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT,ohe__PRIM_CONTRIBUTORY_CAUSE_NOT APPLICABLE,ohe__PRIM_CONTRIBUTORY_CAUSE_OBSTRUCTED CROSSWALKS,"ohe__PRIM_CONTRIBUTORY_CAUSE_OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER",ohe__PRIM_CONTRIBUTORY_CAUSE_PASSING STOPPED SCHOOL BUS,ohe__PRIM_CONTRIBUTORY_CAUSE_PHYSICAL CONDITION OF DRIVER,ohe__PRIM_CONTRIBUTORY_CAUSE_RELATED TO BUS STOP,ohe__PRIM_CONTRIBUTORY_CAUSE_ROAD CONSTRUCTION/MAINTENANCE,ohe__PRIM_CONTRIBUTORY_CAUSE_ROAD ENGINEERING/SURFACE/MARKING DEFECTS,ohe__PRIM_CONTRIBUTORY_CAUSE_TEXTING,ohe__PRIM_CONTRIBUTORY_CAUSE_TURNING RIGHT ON RED,ohe__PRIM_CONTRIBUTORY_CAUSE_UNABLE TO DETERMINE,ohe__PRIM_CONTRIBUTORY_CAUSE_UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED),"ohe__PRIM_CONTRIBUTORY_CAUSE_VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)",ohe__PRIM_CONTRIBUTORY_CAUSE_WEATHER,ohe__MOST_SEVERE_INJURY_INCAPACITATING INJURY,ohe__MOST_SEVERE_INJURY_NO INDICATION OF INJURY,ohe__MOST_SEVERE_INJURY_NONINCAPACITATING INJURY,"ohe__MOST_SEVERE_INJURY_REPORTED, NOT EVIDENT",ohe__SEX_M,ohe__SEX_X,"ohe__AIRBAG_DEPLOYED_DEPLOYED, COMBINATION","ohe__AIRBAG_DEPLOYED_DEPLOYED, FRONT","ohe__AIRBAG_DEPLOYED_DEPLOYED, SIDE",ohe__AIRBAG_DEPLOYED_DEPLOYMENT UNKNOWN,ohe__AIRBAG_DEPLOYED_DID NOT DEPLOY,ohe__AIRBAG_DEPLOYED_NOT APPLICABLE,ohe__DRIVER_ACTION_DISREGARDED CONTROL DEVICES,ohe__DRIVER_ACTION_EMERGENCY VEHICLE ON CALL,ohe__DRIVER_ACTION_EVADING POLICE VEHICLE,ohe__DRIVER_ACTION_FAILED TO YIELD,ohe__DRIVER_ACTION_FOLLOWED TOO CLOSELY,ohe__DRIVER_ACTION_IMPROPER BACKING,ohe__DRIVER_ACTION_IMPROPER LANE CHANGE,ohe__DRIVER_ACTION_IMPROPER PARKING,ohe__DRIVER_ACTION_IMPROPER PASSING,ohe__DRIVER_ACTION_IMPROPER TURN,ohe__DRIVER_ACTION_LICENSE RESTRICTIONS,ohe__DRIVER_ACTION_NONE,ohe__DRIVER_ACTION_OTHER,ohe__DRIVER_ACTION_OVERCORRECTED,ohe__DRIVER_ACTION_STOPPED SCHOOL BUS,ohe__DRIVER_ACTION_TEXTING,ohe__DRIVER_ACTION_TOO FAST FOR CONDITIONS,ohe__DRIVER_ACTION_UNKNOWN,ohe__DRIVER_ACTION_WRONG WAY/SIDE,ohe__PHYSICAL_CONDITION_FATIGUED/ASLEEP,ohe__PHYSICAL_CONDITION_HAD BEEN DRINKING,ohe__PHYSICAL_CONDITION_ILLNESS/FAINTED,ohe__PHYSICAL_CONDITION_IMPAIRED - ALCOHOL,ohe__PHYSICAL_CONDITION_IMPAIRED - ALCOHOL AND DRUGS,ohe__PHYSICAL_CONDITION_IMPAIRED - DRUGS,ohe__PHYSICAL_CONDITION_MEDICATED,ohe__PHYSICAL_CONDITION_NORMAL,ohe__PHYSICAL_CONDITION_OTHER,ohe__PHYSICAL_CONDITION_REMOVED BY EMS,ohe__PHYSICAL_CONDITION_UNKNOWN,ohe__UNIT_TYPE_DRIVER,ohe__UNIT_TYPE_DRIVERLESS,ohe__UNIT_TYPE_NON-CONTACT VEHICLE,ohe__UNIT_TYPE_PARKED,ohe__VEHICLE_TYPE_ALL-TERRAIN VEHICLE (ATV),ohe__VEHICLE_TYPE_AUTOCYCLE,ohe__VEHICLE_TYPE_BUS OVER 15 PASS.,ohe__VEHICLE_TYPE_BUS UP TO 15 PASS.,ohe__VEHICLE_TYPE_FARM EQUIPMENT,ohe__VEHICLE_TYPE_MOPED OR MOTORIZED BICYCLE,ohe__VEHICLE_TYPE_MOTOR DRIVEN CYCLE,ohe__VEHICLE_TYPE_MOTORCYCLE (OVER 150CC),ohe__VEHICLE_TYPE_OTHER,ohe__VEHICLE_TYPE_OTHER VEHICLE WITH TRAILER,ohe__VEHICLE_TYPE_PASSENGER,ohe__VEHICLE_TYPE_PICKUP,ohe__VEHICLE_TYPE_RECREATIONAL OFF-HIGHWAY VEHICLE (ROV),ohe__VEHICLE_TYPE_SINGLE UNIT TRUCK WITH TRAILER,ohe__VEHICLE_TYPE_SPORT UTILITY VEHICLE (SUV),ohe__VEHICLE_TYPE_TRACTOR W/ SEMI-TRAILER,ohe__VEHICLE_TYPE_TRACTOR W/O SEMI-TRAILER,ohe__VEHICLE_TYPE_TRUCK - SINGLE UNIT,ohe__VEHICLE_TYPE_UNKNOWN/NA,ohe__VEHICLE_TYPE_VAN/MINI-VAN,ohe__MANEUVER_BACKING,ohe__MANEUVER_CHANGING LANES,ohe__MANEUVER_DISABLED,ohe__MANEUVER_DIVERGING,ohe__MANEUVER_DRIVERLESS,ohe__MANEUVER_DRIVING WRONG WAY,ohe__MANEUVER_ENTER FROM DRIVE/ALLEY,ohe__MANEUVER_ENTERING TRAFFIC LANE FROM PARKING,ohe__MANEUVER_LEAVING TRAFFIC LANE TO PARK,ohe__MANEUVER_MERGING,ohe__MANEUVER_NEGOTIATING A CURVE,ohe__MANEUVER_OTHER,ohe__MANEUVER_PARKED,ohe__MANEUVER_PARKED IN TRAFFIC LANE,ohe__MANEUVER_PASSING/OVERTAKING,ohe__MANEUVER_SKIDDING/CONTROL LOSS,ohe__MANEUVER_SLOW/STOP - LEFT TURN,ohe__MANEUVER_SLOW/STOP - LOAD/UNLOAD,ohe__MANEUVER_SLOW/STOP - RIGHT TURN,ohe__MANEUVER_SLOW/STOP IN TRAFFIC,ohe__MANEUVER_STARTING IN TRAFFIC,ohe__MANEUVER_STRAIGHT AHEAD,ohe__MANEUVER_TURNING LEFT,ohe__MANEUVER_TURNING ON RED,ohe__MANEUVER_TURNING RIGHT,ohe__MANEUVER_U-TURN,ohe__MANEUVER_UNKNOWN/NA,ohe__TOWED_I_Unknown,ohe__TOWED_I_Y,ohe__TOP_MAKES_ACURA (DIV. OF AMERICAN HONDA MOTOR CO.),ohe__TOP_MAKES_AUDI,ohe__TOP_MAKES_AUTOCAR,ohe__TOP_MAKES_BLUE BIRD BODY CO.,ohe__TOP_MAKES_BLUEBIRD INTERNATIONAL,ohe__TOP_MAKES_BMW,ohe__TOP_MAKES_BUICK,ohe__TOP_MAKES_CADILLAC,ohe__TOP_MAKES_CHEVROLET,ohe__TOP_MAKES_CHRYSLER,ohe__TOP_MAKES_DODGE,ohe__TOP_MAKES_FIAT,ohe__TOP_MAKES_FORD,ohe__TOP_MAKES_FREIGHTLINER CORP.,ohe__TOP_MAKES_FREIGHTLINER CORPORATION,ohe__TOP_MAKES_GENERAL MOTORS CORP.,ohe__TOP_MAKES_GENERAL MOTORS CORPORATION (GMC),ohe__TOP_MAKES_HARLEY-DAVIDSON,ohe__TOP_MAKES_HINO,ohe__TOP_MAKES_HONDA,ohe__TOP_MAKES_HYUNDAI,ohe__TOP_MAKES_INFINITI,ohe__TOP_MAKES_INTERNATIONAL COACH MFG.,ohe__TOP_MAKES_INTERNATIONAL HARVESTER,ohe__TOP_MAKES_INTERNATIONAL TRAILER CORP.,ohe__TOP_MAKES_INTERNATIONAL TRAILER CORPORATION,ohe__TOP_MAKES_ISUZU,ohe__TOP_MAKES_JAGUAR,ohe__TOP_MAKES_JEEP,ohe__TOP_MAKES_KENWORTH MOTOR TRUCK CO.,ohe__TOP_MAKES_KENWORTH MOTOR TRUCK COMPANY,ohe__TOP_MAKES_KIA,ohe__TOP_MAKES_KIA MOTORS CORP,ohe__TOP_MAKES_LAND ROVER,ohe__TOP_MAKES_LEXUS,ohe__TOP_MAKES_LINCOLN,ohe__TOP_MAKES_LINCOLN-CONTINENTAL,"ohe__TOP_MAKES_MACK TRUCKS, INC.",ohe__TOP_MAKES_MAZDA,ohe__TOP_MAKES_MERCEDES-BENZ,ohe__TOP_MAKES_MERCURY,ohe__TOP_MAKES_MINI,ohe__TOP_MAKES_MITSUBISHI,ohe__TOP_MAKES_NEW FLYER,ohe__TOP_MAKES_NISSAN,ohe__TOP_MAKES_NOVA BUS,ohe__TOP_MAKES_OLDSMOBILE,"ohe__TOP_MAKES_PETERBILT MOTORS CO., DIVISION PACCAR, INC.","ohe__TOP_MAKES_PETERBILT MOTORS COMPANY (DIVISION OF PACCAR, INC.)",ohe__TOP_MAKES_PLYMOUTH,ohe__TOP_MAKES_PONTIAC,ohe__TOP_MAKES_PORSCHE,ohe__TOP_MAKES_RANGE ROVER OF NORTH AMERICA,ohe__TOP_MAKES_SAAB,ohe__TOP_MAKES_SATURN,ohe__TOP_MAKES_SCION,ohe__TOP_MAKES_STERLING,ohe__TOP_MAKES_SUBARU,ohe__TOP_MAKES_SUZUKI,ohe__TOP_MAKES_TESLA,ohe__TOP_MAKES_TESLA MOTORS,ohe__TOP_MAKES_THOMAS BUILT BUS CO.,ohe__TOP_MAKES_TOYOTA,"ohe__TOP_MAKES_TOYOTA MOTOR COMPANY, LTD.",ohe__TOP_MAKES_UNKNOWN,ohe__TOP_MAKES_VOLKSWAGEN,ohe__TOP_MAKES_VOLVO,ohe__TOP_MAKES_YAMAHA,ohe__TOP_MAKES_other
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384923,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
384924,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
384925,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384926,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### Now lets scale 

In [89]:
#import sclaer 
from sklearn.preprocessing import StandardScaler 
#instantiate our scalar
scaler = StandardScaler()

#train on train data
scaler.fit(X_train_enc)

#transform both train and test data
X_train_scaled = scaler.transform(X_train_enc)
X_test_scaled = scaler.transform(X_test_enc)

In [90]:
pd.DataFrame(X_train_enc, columns=ct.get_feature_names())



Unnamed: 0,ohe__x0_BLOWING SNOW,ohe__x0_CLEAR,ohe__x0_CLOUDY/OVERCAST,ohe__x0_FOG/SMOKE/HAZE,ohe__x0_FREEZING RAIN/DRIZZLE,ohe__x0_OTHER,ohe__x0_RAIN,ohe__x0_SEVERE CROSS WIND GATE,ohe__x0_SLEET/HAIL,ohe__x0_SNOW,ohe__x0_UNKNOWN,"ohe__x1_DARKNESS, LIGHTED ROAD",ohe__x1_DAWN,ohe__x1_DAYLIGHT,ohe__x1_DUSK,ohe__x1_UNKNOWN,ohe__x2_ANIMAL,ohe__x2_FIXED OBJECT,ohe__x2_HEAD ON,ohe__x2_OTHER NONCOLLISION,ohe__x2_OTHER OBJECT,ohe__x2_OVERTURNED,ohe__x2_PARKED MOTOR VEHICLE,ohe__x2_PEDALCYCLIST,ohe__x2_PEDESTRIAN,ohe__x2_REAR END,ohe__x2_REAR TO FRONT,ohe__x2_REAR TO REAR,ohe__x2_REAR TO SIDE,ohe__x2_SIDESWIPE OPPOSITE DIRECTION,ohe__x2_SIDESWIPE SAME DIRECTION,ohe__x2_TRAIN,ohe__x2_TURNING,ohe__x3_ICE,ohe__x3_OTHER,"ohe__x3_SAND, MUD, DIRT",ohe__x3_SNOW OR SLUSH,ohe__x3_UNKNOWN,ohe__x3_WET,ohe__x4_NO INJURY / DRIVE AWAY,ohe__x5_Unknown,ohe__x5_Y,ohe__x6_Unknown,ohe__x6_Y,ohe__x7_Unknown,ohe__x7_Y,ohe__x8_BICYCLE ADVANCING LEGALLY ON RED LIGHT,ohe__x8_CELL PHONE USE OTHER THAN TEXTING,ohe__x8_DISREGARDING OTHER TRAFFIC SIGNS,ohe__x8_DISREGARDING ROAD MARKINGS,ohe__x8_DISREGARDING STOP SIGN,ohe__x8_DISREGARDING TRAFFIC SIGNALS,ohe__x8_DISREGARDING YIELD SIGN,ohe__x8_DISTRACTION - FROM INSIDE VEHICLE,ohe__x8_DISTRACTION - FROM OUTSIDE VEHICLE,"ohe__x8_DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)",ohe__x8_DRIVING ON WRONG SIDE/WRONG WAY,ohe__x8_DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,ohe__x8_EQUIPMENT - VEHICLE CONDITION,"ohe__x8_EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST",ohe__x8_EXCEEDING AUTHORIZED SPEED LIMIT,ohe__x8_EXCEEDING SAFE SPEED FOR CONDITIONS,ohe__x8_FAILING TO REDUCE SPEED TO AVOID CRASH,ohe__x8_FAILING TO YIELD RIGHT-OF-WAY,ohe__x8_FOLLOWING TOO CLOSELY,ohe__x8_HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE),ohe__x8_IMPROPER BACKING,ohe__x8_IMPROPER LANE USAGE,ohe__x8_IMPROPER OVERTAKING/PASSING,ohe__x8_IMPROPER TURNING/NO SIGNAL,ohe__x8_MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT,ohe__x8_NOT APPLICABLE,ohe__x8_OBSTRUCTED CROSSWALKS,"ohe__x8_OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER",ohe__x8_PASSING STOPPED SCHOOL BUS,ohe__x8_PHYSICAL CONDITION OF DRIVER,ohe__x8_RELATED TO BUS STOP,ohe__x8_ROAD CONSTRUCTION/MAINTENANCE,ohe__x8_ROAD ENGINEERING/SURFACE/MARKING DEFECTS,ohe__x8_TEXTING,ohe__x8_TURNING RIGHT ON RED,ohe__x8_UNABLE TO DETERMINE,ohe__x8_UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED),"ohe__x8_VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)",ohe__x8_WEATHER,ohe__x9_INCAPACITATING INJURY,ohe__x9_NO INDICATION OF INJURY,ohe__x9_NONINCAPACITATING INJURY,"ohe__x9_REPORTED, NOT EVIDENT",ohe__x10_M,ohe__x10_X,"ohe__x11_DEPLOYED, COMBINATION","ohe__x11_DEPLOYED, FRONT","ohe__x11_DEPLOYED, SIDE",ohe__x11_DEPLOYMENT UNKNOWN,ohe__x11_DID NOT DEPLOY,ohe__x11_NOT APPLICABLE,ohe__x12_DISREGARDED CONTROL DEVICES,ohe__x12_EMERGENCY VEHICLE ON CALL,ohe__x12_EVADING POLICE VEHICLE,ohe__x12_FAILED TO YIELD,ohe__x12_FOLLOWED TOO CLOSELY,ohe__x12_IMPROPER BACKING,ohe__x12_IMPROPER LANE CHANGE,ohe__x12_IMPROPER PARKING,ohe__x12_IMPROPER PASSING,ohe__x12_IMPROPER TURN,ohe__x12_LICENSE RESTRICTIONS,ohe__x12_NONE,ohe__x12_OTHER,ohe__x12_OVERCORRECTED,ohe__x12_STOPPED SCHOOL BUS,ohe__x12_TEXTING,ohe__x12_TOO FAST FOR CONDITIONS,ohe__x12_UNKNOWN,ohe__x12_WRONG WAY/SIDE,ohe__x13_FATIGUED/ASLEEP,ohe__x13_HAD BEEN DRINKING,ohe__x13_ILLNESS/FAINTED,ohe__x13_IMPAIRED - ALCOHOL,ohe__x13_IMPAIRED - ALCOHOL AND DRUGS,ohe__x13_IMPAIRED - DRUGS,ohe__x13_MEDICATED,ohe__x13_NORMAL,ohe__x13_OTHER,ohe__x13_REMOVED BY EMS,ohe__x13_UNKNOWN,ohe__x14_DRIVER,ohe__x14_DRIVERLESS,ohe__x14_NON-CONTACT VEHICLE,ohe__x14_PARKED,ohe__x15_ALL-TERRAIN VEHICLE (ATV),ohe__x15_AUTOCYCLE,ohe__x15_BUS OVER 15 PASS.,ohe__x15_BUS UP TO 15 PASS.,ohe__x15_FARM EQUIPMENT,ohe__x15_MOPED OR MOTORIZED BICYCLE,ohe__x15_MOTOR DRIVEN CYCLE,ohe__x15_MOTORCYCLE (OVER 150CC),ohe__x15_OTHER,ohe__x15_OTHER VEHICLE WITH TRAILER,ohe__x15_PASSENGER,ohe__x15_PICKUP,ohe__x15_RECREATIONAL OFF-HIGHWAY VEHICLE (ROV),ohe__x15_SINGLE UNIT TRUCK WITH TRAILER,ohe__x15_SPORT UTILITY VEHICLE (SUV),ohe__x15_TRACTOR W/ SEMI-TRAILER,ohe__x15_TRACTOR W/O SEMI-TRAILER,ohe__x15_TRUCK - SINGLE UNIT,ohe__x15_UNKNOWN/NA,ohe__x15_VAN/MINI-VAN,ohe__x16_BACKING,ohe__x16_CHANGING LANES,ohe__x16_DISABLED,ohe__x16_DIVERGING,ohe__x16_DRIVERLESS,ohe__x16_DRIVING WRONG WAY,ohe__x16_ENTER FROM DRIVE/ALLEY,ohe__x16_ENTERING TRAFFIC LANE FROM PARKING,ohe__x16_LEAVING TRAFFIC LANE TO PARK,ohe__x16_MERGING,ohe__x16_NEGOTIATING A CURVE,ohe__x16_OTHER,ohe__x16_PARKED,ohe__x16_PARKED IN TRAFFIC LANE,ohe__x16_PASSING/OVERTAKING,ohe__x16_SKIDDING/CONTROL LOSS,ohe__x16_SLOW/STOP - LEFT TURN,ohe__x16_SLOW/STOP - LOAD/UNLOAD,ohe__x16_SLOW/STOP - RIGHT TURN,ohe__x16_SLOW/STOP IN TRAFFIC,ohe__x16_STARTING IN TRAFFIC,ohe__x16_STRAIGHT AHEAD,ohe__x16_TURNING LEFT,ohe__x16_TURNING ON RED,ohe__x16_TURNING RIGHT,ohe__x16_U-TURN,ohe__x16_UNKNOWN/NA,ohe__x17_Unknown,ohe__x17_Y,ohe__x18_ACURA (DIV. OF AMERICAN HONDA MOTOR CO.),ohe__x18_AUDI,ohe__x18_AUTOCAR,ohe__x18_BLUE BIRD BODY CO.,ohe__x18_BLUEBIRD INTERNATIONAL,ohe__x18_BMW,ohe__x18_BUICK,ohe__x18_CADILLAC,ohe__x18_CHEVROLET,ohe__x18_CHRYSLER,ohe__x18_DODGE,ohe__x18_FIAT,ohe__x18_FORD,ohe__x18_FREIGHTLINER CORP.,ohe__x18_FREIGHTLINER CORPORATION,ohe__x18_GENERAL MOTORS CORP.,ohe__x18_GENERAL MOTORS CORPORATION (GMC),ohe__x18_HARLEY-DAVIDSON,ohe__x18_HINO,ohe__x18_HONDA,ohe__x18_HYUNDAI,ohe__x18_INFINITI,ohe__x18_INTERNATIONAL COACH MFG.,ohe__x18_INTERNATIONAL HARVESTER,ohe__x18_INTERNATIONAL TRAILER CORP.,ohe__x18_INTERNATIONAL TRAILER CORPORATION,ohe__x18_ISUZU,ohe__x18_JAGUAR,ohe__x18_JEEP,ohe__x18_KENWORTH MOTOR TRUCK CO.,ohe__x18_KENWORTH MOTOR TRUCK COMPANY,ohe__x18_KIA,ohe__x18_KIA MOTORS CORP,ohe__x18_LAND ROVER,ohe__x18_LEXUS,ohe__x18_LINCOLN,ohe__x18_LINCOLN-CONTINENTAL,"ohe__x18_MACK TRUCKS, INC.",ohe__x18_MAZDA,ohe__x18_MERCEDES-BENZ,ohe__x18_MERCURY,ohe__x18_MINI,ohe__x18_MITSUBISHI,ohe__x18_NEW FLYER,ohe__x18_NISSAN,ohe__x18_NOVA BUS,ohe__x18_OLDSMOBILE,"ohe__x18_PETERBILT MOTORS CO., DIVISION PACCAR, INC.","ohe__x18_PETERBILT MOTORS COMPANY (DIVISION OF PACCAR, INC.)",ohe__x18_PLYMOUTH,ohe__x18_PONTIAC,ohe__x18_PORSCHE,ohe__x18_RANGE ROVER OF NORTH AMERICA,ohe__x18_SAAB,ohe__x18_SATURN,ohe__x18_SCION,ohe__x18_STERLING,ohe__x18_SUBARU,ohe__x18_SUZUKI,ohe__x18_TESLA,ohe__x18_TESLA MOTORS,ohe__x18_THOMAS BUILT BUS CO.,ohe__x18_TOYOTA,"ohe__x18_TOYOTA MOTOR COMPANY, LTD.",ohe__x18_UNKNOWN,ohe__x18_VOLKSWAGEN,ohe__x18_VOLVO,ohe__x18_YAMAHA,ohe__x18_other
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384923,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
384924,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
384925,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384926,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### Create Model 

In [91]:
#create baseline model 
baseline_model = LogisticRegression(random_state=42)

# Use cross_val_score with scoring="neg_log_loss" to evaluate the model
# on X_train and y_train
baseline_neg_log_loss_cv = cross_val_score(baseline_model, X_train, y_train, scoring="neg_log_loss")

baseline_log_loss = -(baseline_neg_log_loss_cv.mean())
baseline_log_loss

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/bryankeating/miniconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bryankeating/miniconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1508, in fit
    X, y = self._validate_data(
  File "/Users/bryankeating/miniconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/bryankeating/miniconda3/envs/learn

nan

In [77]:
#see what our log-loss would be if our model choose the majority class 
log_loss(y_train, np.zeros(len(y_train)))

In [79]:
#evaluate 
from sklearn.metrics import r2_score

r2_score(y_train, train_preds)


0.138536904607646

In [80]:
r2_score(y_train, train_preds)

0.138536904607646

In [83]:
#visualize our residuals
import matplotlib.pyplot as plt 
import seaborn as sns
from yellowbrick.regressor import ResidualsPlot

import yellowbrick 
visualizer = ResidualsPlot(lr)

visualizer.fit(X_train_scaled, y_train)
visualizer.score(X_test_scaled, y_test)
visualizer.show()
plt.show() 

ModuleNotFoundError: No module named 'yellowbrick'

### Create numeric feature DF for DecisionTreeClassifier 

In [41]:
#create new DF with just our numeric values
numeric_df = cpc_df[['POSTED_SPEED_LIMIT','NUM_UNITS','INJURIES_INCAPACITATING','CRASH_HOUR','CRASH_DAY_OF_WEEK','CRASH_MONTH','AGE','VEHICLE_YEAR', 'Target']]

In [42]:
numeric_df

Unnamed: 0,POSTED_SPEED_LIMIT,NUM_UNITS,INJURIES_INCAPACITATING,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,AGE,VEHICLE_YEAR,Target
0,20,2,0.0,9,4,4,24.0,2007.0,1
2,15,3,0.0,6,3,5,20.0,2007.0,1
6,15,2,0.0,16,4,4,30.0,2012.0,1
10,30,2,0.0,2,3,5,41.0,2013.0,1
11,30,2,0.0,15,4,4,57.0,2012.0,1
...,...,...,...,...,...,...,...,...,...
1059017,30,2,0.0,16,6,4,34.0,2006.0,1
1059019,30,2,0.0,9,1,5,36.0,2012.0,1
1059021,30,2,0.0,7,5,5,40.0,2013.0,1
1059023,30,2,0.0,7,5,5,38.0,2018.0,1


In [43]:
cpc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 481160 entries, 0 to 1059025
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   POSTED_SPEED_LIMIT       481160 non-null  int64  
 1   WEATHER_CONDITION        481160 non-null  object 
 2   LIGHTING_CONDITION       481160 non-null  object 
 3   FIRST_CRASH_TYPE         481160 non-null  object 
 4   ROADWAY_SURFACE_COND     481160 non-null  object 
 5   CRASH_TYPE               481160 non-null  object 
 6   INTERSECTION_RELATED_I   481160 non-null  object 
 7   NOT_RIGHT_OF_WAY_I       481160 non-null  object 
 8   HIT_AND_RUN_I            481160 non-null  object 
 9   PRIM_CONTRIBUTORY_CAUSE  481160 non-null  object 
 10  NUM_UNITS                481160 non-null  int64  
 11  MOST_SEVERE_INJURY       481160 non-null  object 
 12  INJURIES_INCAPACITATING  481160 non-null  float64
 13  CRASH_HOUR               481160 non-null  int64  
 14  CRA

#### Train Test Split for numeric features 

In [44]:
# Split df into X and y
X = numeric_df.drop("Target", axis=1)
y = numeric_df["Target"]

# Perform train-test split with random_state=42 and stratify=y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [45]:
#assert X and Y have same number of rows and columns 
assert X_train.shape[0] == y_train.shape[0] 

# X and y testing data should have the same number of rows
assert X_test.shape[0] == y_test.shape[0] 

# Both X should have 52 columns
assert X_train.shape[1] == X_test.shape[1] 

# Both y should have 1 column
assert len(y_train.shape) == len(y_test.shape)

#### Decision Tree Classifier 

In [46]:
#Decision Tree Classifier 
numeric_tree_1 = DecisionTreeClassifier(max_depth=8, random_state=42)

numeric_tree_1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=8, random_state=42)

In [47]:
#accuracy on training data
numeric_tree_1.score(X_train, y_train)

0.6097846870064012

In [48]:
# Accuracy on test data
numeric_tree_1.score(X_test, y_test)

0.6076897497713858

In [49]:
# Allow it to run the full default hyperparameters
numeric_tree_1 = DecisionTreeClassifier(random_state=42)
numeric_tree_1.fit(X_train, y_train)

# Accuracy on training data & test data
print('Training:', numeric_tree_1.score(X_train, y_train))
print('Testing:', numeric_tree_1.score(X_test, y_test))

Training: 0.9762601490841577
Testing: 0.5350236927425389


In [50]:
# Add in a stop function - min_impurity_decrease
numeric_tree_1 = DecisionTreeClassifier(min_impurity_decrease=0.3, random_state=42)
numeric_tree_1.fit(X_train, y_train)

# Accuracy on training data & test data
print('Training:', numeric_tree_1.score(X_train, y_train))
print('Testing:', numeric_tree_1.score(X_test, y_test))

Training: 0.6061878238700917
Testing: 0.6061850527890931


In [52]:
feature_used = ['POSTED_SPEED_LIMIT','NUM_UNITS','INJURIES_INCAPACITATING','CRASH_HOUR','CRASH_DAY_OF_WEEK','CRASH_MONTH','AGE','VEHICLE_YEAR'] 
X = numeric_df.drop("Target", axis=1)
y = numeric_df["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

X_train.shape, X_test.shape

((384928, 8), (96232, 8))

In [53]:
numeric_tree_1 = DecisionTreeClassifier(max_depth = 10, random_state=42)

numeric_tree_1.fit(X, y)

for fi, feature in zip(numeric_tree_1.feature_importances_, feature_used):
    print(fi, feature)

0.12438274228250057 POSTED_SPEED_LIMIT
0.5054602398837356 NUM_UNITS
0.04035832003946407 INJURIES_INCAPACITATING
0.16137167324895446 CRASH_HOUR
0.011935469604048803 CRASH_DAY_OF_WEEK
0.01734885785502332 CRASH_MONTH
0.08440392034248823 AGE
0.05473877674378491 VEHICLE_YEAR


### Logistic Regression on numeric data


In [54]:
#create baseline model 
baseline_model = LogisticRegression(random_state=42)

# Use cross_val_score with scoring="neg_log_loss" to evaluate the model
# on X_train and y_train
baseline_neg_log_loss_cv = cross_val_score(baseline_model, X_train, y_train, scoring="neg_log_loss")

baseline_log_loss = -(baseline_neg_log_loss_cv.mean())
baseline_log_loss

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6636212745678178

In [55]:
#see what our log-loss would be if our model choose the majority class 
log_loss(y_train, np.zeros(len(y_train)))

20.938989624794132

##### The lower the Log-Loss the better. So, although log-loss isn't the most translational metric, we can see that our model us definitely better than just guesing the majority every time. 

### Longer way of doing what we did above, by hand 

In [None]:
# Negative log loss doesn't exist as something we can import,
# but we can create it
neg_log_loss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
# Instantiate the model (same as previous example)
baseline_model = LogisticRegression(random_state=42)
# Create a list to hold the score from each fold
kfold_scores = np.ndarray(5)

# Instantiate a splitter object and loop over its result
kfold = StratifiedKFold()
for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
    # Extract train and validation subsets using the provided indices
    X_t, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_t, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Clone the baseline model and fit it on the train subset
    train_model_1 = clone(baseline_model)
    train_model_1.fit(X_t, y_t)
    
    # Evaluate the provided model on the validation subset
    neg_log_loss_score = neg_log_loss(train_model_1, X_val, y_val)
    kfold_scores[fold] = neg_log_loss_score
    
-(kfold_scores.mean())

In [None]:
print(baseline_neg_log_loss_cv)
print(kfold_scores)

### Writing a Custom Cross Validation Function with StratifiedKFold - Trying Something


In [None]:
# #Import relevant sklearn and imblearn classes
# from sklearn.preprocessing import StandardScaler
# from imblearn.over_sampling import SMOTE

# def custom_cross_val_score(estimator, X, y):
#     # Create a list to hold the scores from each fold
#     kfold_train_scores = np.ndarray(5)
#     kfold_val_scores = np.ndarray(5)

#     # Instantiate a splitter object and loop over its result
#     kfold = StratifiedKFold(n_splits=5)
#     for fold, (train_index, val_index) in enumerate(kfold.split(X, y)):
#         # Extract train and validation subsets using the provided indices
#         X_t, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_t, y_val = y.iloc[train_index], y.iloc[val_index]
        
#         # Instantiate StandardScaler
#         scaler = StandardScaler()
#         # Fit and transform X_t
#         X_t_scaled = scaler.fit_transform(X_t)
#         # Transform X_val
#         X_val_scaled = scaler.transform(X_val)
        
#         # Instantiate SMOTE with random_state=42 and sampling_strategy=0.78
#         sm = SMOTE(random_state=42, sampling_strategy=0.78)
#         # Fit and transform X_t_scaled and y_t using sm
#         X_t_oversampled, y_t_oversampled = sm.fit_resample(X_t_scaled, y_t)
        

        
#         # Clone the provided model and fit it on the train subset
#         temp_model = clone(estimator)
#         temp_model.fit(X_t_oversampled, y_t_oversampled)
        
#         # Evaluate the provided model on the train and validation subsets
#         neg_log_loss_score_train = neg_log_loss(temp_model, X_t_oversampled, y_t_oversampled)
#         neg_log_loss_score_val = neg_log_loss(temp_model, X_val_scaled, y_val)
#         kfold_train_scores[fold] = neg_log_loss_score_train
#         kfold_val_scores[fold] = neg_log_loss_score_val
        
#     return kfold_train_scores, kfold_val_scores

# model_with_preprocessing = LogisticRegression(random_state=42, class_weight={1: 0.78})
# preprocessed_train_scores, preprocessed_neg_log_loss_cv = custom_cross_val_score(model_with_preprocessing, X_train, y_train)
# - (preprocessed_neg_log_loss_cv.mean())

In [None]:
# print(-baseline_neg_log_loss_cv.mean())
# print(-preprocessed_neg_log_loss_cv.mean())

In [None]:
# print("Train:     ", -preprocessed_train_scores)
# print("Validation:", -preprocessed_neg_log_loss_cv)

In [None]:
# model_with_preprocessing.get_params()

### Trying something

In [None]:
# Import relevant sklearn and imblearn classes
# from sklearn.preprocessing import StandardScaler
# from imblearn.over_sampling import SMOTE

# def custom_cross_val_score(estimator, X, y):
#     # Create a list to hold the scores from each fold
#     kfold_train_scores = np.ndarray(5)
#     kfold_val_scores = np.ndarray(5)

#     # Instantiate a splitter object and loop over its result
#     kfold = StratifiedKFold(n_splits=5)
#     for fold, (train_index, val_index) in enumerate(kfold.split(X, y)):
#         # Extract train and validation subsets using the provided indices
#         X_t, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_t, y_val = y.iloc[train_index], y.iloc[val_index]
        
#         # Instantiate StandardScaler
#         scaler = StandardScaler()
#         # Fit and transform X_t
#         X_t_scaled = scaler.fit_transform(X_t)
#         # Transform X_val
#         X_val_scaled = scaler.transform(X_val)
        
#         # Instantiate SMOTE with random_state=42 and sampling_strategy=0.28
#         sm = SMOTE(random_state=42, sampling_strategy=0.78)
#         # Fit and transform X_t_scaled and y_t using sm
#         X_t_oversampled, y_t_oversampled = sm.fit_resample(X_t_scaled, y_t)
        

        
#         # Clone the provided model and fit it on the train subset
#         temp_model = clone(estimator)
#         temp_model.fit(X_t_oversampled, y_t_oversampled)
        
#         # Evaluate the provided model on the train and validation subsets
#         neg_log_loss_score_train = neg_log_loss(temp_model, X_t_oversampled, y_t_oversampled)
#         neg_log_loss_score_val = neg_log_loss(temp_model, X_val_scaled, y_val)
#         kfold_train_scores[fold] = neg_log_loss_score_train
#         kfold_val_scores[fold] = neg_log_loss_score_val
        
#     return kfold_train_scores, kfold_val_scores

# model_with_preprocessing = LogisticRegression(random_state=42, class_weight={1: 0.28})
# preprocessed_train_scores, preprocessed_neg_log_loss_cv = custom_cross_val_score(model_with_preprocessing, X_train, y_train)
# - (preprocessed_neg_log_loss_cv.mean())

In [None]:
# model_less_regularization = LogisticRegression(
#     random_state=42,
#     class_weight={1: 0.28},
#     C=1e5
# )

In [None]:
# # Check variable type
# assert type(model_less_regularization) == LogisticRegression

# # Check params
# assert model_less_regularization.get_params()["random_state"] == 42
# assert model_less_regularization.get_params()["class_weight"] == {1: 0.28}
# assert model_less_regularization.get_params()["C"] != 1.0

In [None]:
# less_regularization_train_scores, less_regularization_val_scores = custom_cross_val_score(
#     model_less_regularization,
#     X_train,
#     y_train
# )

# print("Previous Model")
# print("Train average:     ", -preprocessed_train_scores.mean())
# print("Validation average:", -preprocessed_neg_log_loss_cv.mean())
# print("Current Model")
# print("Train average:     ", -less_regularization_train_scores.mean())
# print("Validation average:", -less_regularization_val_scores.mean())

In [None]:
# # One hot Encode state
# ohe = OneHotEncoder(sparse = False, handle_unknown= "ignore")

# # fit ohe on small train data
# ohe.fit(X_train[['state']])
# ohe.fit(X_test[['state']])

# # access the column names of the states
# col_names = ohe.categories_[0]

# # make a df with encoded states
# train_state_encoded = pd.DataFrame(ohe.transform(X_train[["state"]]), 
#                                index = X_train.index, 
#                                columns = col_names)
# train_test_encoded = pd.DataFrame(ohe.transform(X_test[["state"]]), 
#                                index = X_test.index, 
#                                columns = col_names)
# # combine encoded states with X_t and drop old 'state' column
# X_train = pd.concat([X_train.drop("state", axis = 1), train_state_encoded], axis = 1)
# X_test = pd.concat([X_test.drop("state", axis = 1), train_test_encoded], axis = 1)

In [None]:
cpc_df

### Train Test Split on full dataset


In [None]:
X = cpc_df.drop(columns='Target')
y = cpc_df["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [None]:
# One hot Encode state
# ohe = OneHotEncoder(sparse = False, handle_unknown= "ignore")

# # fit ohe on small train data
# ohe.fit(X_train[['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE',
#                  'ROADWAY_SURFACE_COND','CRASH_TYPE',
#                  'INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I',
#                  'PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED',
#                  'DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER',
#                  'TOWED_I','TOP_MAKES']])
# ohe.fit(X_test[['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE',
#                  'ROADWAY_SURFACE_COND','CRASH_TYPE',
#                  'INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I',
#                  'PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED',
#                  'DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER',
#                  'TOWED_I','TOP_MAKES']])

# # access the column names of the states
# col_names = ohe.categories_[0:19]

# #make a df with encoded states
# train_state_encoded = pd.DataFrame(ohe.transform(X_train[['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE',
#                  'ROADWAY_SURFACE_COND','CRASH_TYPE',
#                  'INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I',
#                  'PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED',
#                  'DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER',
#                  'TOWED_I','TOP_MAKES']]), 
#                                index = X_train.index, 
#                                columns = col_names)
# train_test_encoded = pd.DataFrame(ohe.transform(X_test[['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE',
#                  'ROADWAY_SURFACE_COND','CRASH_TYPE',
#                  'INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I',
#                  'PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED',
#                  'DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER',
#                  'TOWED_I','TOP_MAKES']]), 
#                                index = X_test.index, 
#                                columns = col_names)
# # combine encoded states with X_t and drop old 'state' column
# X_train = pd.concat([X_train.drop([['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE',
#                  'ROADWAY_SURFACE_COND','CRASH_TYPE',
#                  'INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I',
#                  'PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED',
#                  'DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER',
#                  'TOWED_I','TOP_MAKES']], axis = 1), train_state_encoded], axis = 1)
# X_test = pd.concat([X_test.drop([['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE',
#                  'ROADWAY_SURFACE_COND','CRASH_TYPE',
#                  'INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I',
#                  'PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED',
#                  'DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER',
#                  'TOWED_I','TOP_MAKES']], axis = 1), train_test_encoded], axis = 1)

In [None]:
X_train.index

In [None]:
col_names

In [None]:
ohe.categories_[0:1]

In [None]:
ohe.categories_[0:5]

In [None]:
X = cpc_df.drop(columns='Target')
y = cpc_df["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [None]:
#get categorical columns
cat_cols = [c for c in cpc_df.columns if cpc_df[c].dtype == "O"]
cat_cols

In [None]:
from sklearn.compose import ColumnTransformer

#create encoder object - to help convert cat. variables to new columns
encoder = OneHotEncoder(handle_unknown = 'error',
                       drop = 'if_binary',
                       categories='auto')

#create columntransformer object - to help merge transformed columns
#with the rest of the dataset

ct = ColumnTransformer(transformers=[('ohe', encoder, cat_cols)],
                      remainder='passthrough')

ct.fit(X_train)
X_train_enc = ct.transform(X_train)
X_test_enc = ct.transform(X_test)

In [None]:
#X_train

In [None]:
X_train_enc

In [None]:
pd.DataFrame(X_train_enc[0:275], columns=ct.get_feature_names()).head()

In [None]:
X_train_enc

In [None]:
print(X_train_enc.column)

In [None]:
#method from google 

categorical_cols = ['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE','ROADWAY_SURFACE_COND','CRASH_TYPE','INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I','PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED','DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER','TOWED_I','TOP_MAKES']

from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
le = LabelEncoder()

# apply le on categorical feature columns
cpc_df[categorical_cols] = cpc_df[categorical_cols].apply(lambda col: le.fit_transform(col))    
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

#One-hot-encode the categorical columns.
#Unfortunately outputs an array instead of dataframe.
array_hot_encoded = ohe.fit_transform(cpc_df[categorical_cols])

#Convert it to df
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=cpc_df.index)

#Extract only the columns that didnt need to be encoded
data_other_cols = cpc_df.drop(columns=categorical_cols)

#Concatenate the two dataframes : 
data_out = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [None]:
cpc_df.info()

In [None]:
cpc_df['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE','ROADWAY_SURFACE_COND','ROADWAY_SURFACE_COND','CRASH_TYPE','INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I','PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED','DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER','TOWED_I','TOP_MAKES']

In [None]:
cpc_cat_df = cpc_df[['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE','ROADWAY_SURFACE_COND','ROADWAY_SURFACE_COND','CRASH_TYPE','INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I','PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED','DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER','TOWED_I','TOP_MAKES']]

In [None]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()

transformed_data = onehotencoder.fit_transform(cpc_cat_df)

# the above transformed_data is an array so convert it to dataframe
encoded_data = pd.DataFrame(transformed_data, index=cpc_cat_df.index)

# now concatenate the original data and the encoded data using pandas
concatenated_data = pd.concat([cpc_cat_df, encoded_data], axis=1)

In [None]:
concatenated_data.info()