In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Load the crash records from the CSV export into `df`, the frame every later cell works on.
# The path is relative, so the file has to be in the notebook's working directory.
# NOTE(review): this cell's saved execution count (28) is higher than that of the cells
# that follow it (3-8), so the saved outputs come from an out-of-order run — re-run the
# notebook top to bottom before trusting them.
df = pd.read_csv('Traffic_Crashes_-_Crashes.csv')

In [3]:
# Parse the CRASH_DATE strings into timestamps and store them in a new 'datetime' column.
# NOTE(review): the later df.columns output does not list 'datetime', and no visible cell
# drops it; on a fresh top-to-bottom run it will still be in the frame when X is built.
df['datetime'] = pd.to_datetime(df['CRASH_DATE'])

In [4]:
# Show the month number of each crash, derived from the parsed timestamp (display only).
df['datetime'].dt.month

0          3
1          3
2         10
3          1
4         10
          ..
398685     7
398686     7
398687     4
398688     4
398689    12
Name: datetime, Length: 398690, dtype: int64

In [5]:
# Show the hour of day of each crash, derived from the parsed timestamp (display only).
df['datetime'].dt.hour

0         16
1         15
2         14
3         16
4         12
          ..
398685    18
398686    16
398687    16
398688    19
398689    15
Name: datetime, Length: 398690, dtype: int64

In [6]:
# Show the weekday of each crash (display only). pandas numbers days Monday=0 ... Sunday=6.
df['datetime'].dt.dayofweek

0         2
1         1
2         1
3         1
4         4
         ..
398685    0
398686    4
398687    0
398688    4
398689    0
Name: datetime, Length: 398690, dtype: int64

In [7]:
# Summarise dtypes and non-null counts per column to see which fields have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398690 entries, 0 to 398689
Data columns (total 50 columns):
CRASH_RECORD_ID                  398690 non-null object
RD_NO                            396907 non-null object
CRASH_DATE_EST_I                 29500 non-null object
CRASH_DATE                       398690 non-null object
POSTED_SPEED_LIMIT               398690 non-null int64
TRAFFIC_CONTROL_DEVICE           398690 non-null object
DEVICE_CONDITION                 398690 non-null object
WEATHER_CONDITION                398690 non-null object
LIGHTING_CONDITION               398690 non-null object
FIRST_CRASH_TYPE                 398690 non-null object
TRAFFICWAY_TYPE                  398690 non-null object
LANE_CNT                         198551 non-null float64
ALIGNMENT                        398690 non-null object
ROADWAY_SURFACE_COND             398690 non-null object
ROAD_DEFECT                      398690 non-null object
REPORT_TYPE                      389281 non-null o

In [8]:
# List every column name of the freshly loaded frame.
df.columns

Index(['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE_EST_I', 'CRASH_DATE',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
       'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
       'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LA

In [29]:
# Treat a missing lane count as 0.0.
# The result is assigned back to the column instead of calling fillna(inplace=True) on the
# selected Series: with pandas Copy-on-Write the chained in-place form only changes a
# temporary object and leaves `df` untouched, after which the frame-wide fill in the next
# cell would put the string 'UNKNOWN' into this numeric column.
df['LANE_CNT'] = df['LANE_CNT'].fillna(0.0)

In [30]:
# Replace every remaining missing value in the frame with the placeholder string 'UNKNOWN'.
df = df.fillna(value='UNKNOWN')

In [31]:
# Remove the columns that are not used further on: record identifiers, report metadata,
# street/coordinate fields, and the INJURIES_* / DAMAGE / CRASH_TYPE outcome fields.
# Re-running this cell on the same frame raises KeyError because the columns are already gone.
df = df.drop(
    columns=[
        'CRASH_RECORD_ID',
        'CRASH_DATE_EST_I',
        'RD_NO',
        'REPORT_TYPE',
        'STREET_NO',
        'BEAT_OF_OCCURRENCE',
        'PHOTOS_TAKEN_I',
        'STATEMENTS_TAKEN_I',
        'WORKERS_PRESENT_I',
        'INJURIES_UNKNOWN',
        'LONGITUDE',
        'LATITUDE',
        'MOST_SEVERE_INJURY',
        'INJURIES_TOTAL',
        'INJURIES_FATAL',
        'INJURIES_INCAPACITATING',
        'INJURIES_NON_INCAPACITATING',
        'INJURIES_REPORTED_NOT_EVIDENT',
        'INJURIES_NO_INDICATION',
        'DAMAGE',
        'DATE_POLICE_NOTIFIED',
        'CRASH_TYPE',
        'NUM_UNITS',
        'STREET_DIRECTION',
        'STREET_NAME',
    ]
)

In [32]:
# Posted-speed-limit values whose rows are discarded.
# NOTE(review): presumably these are atypical or erroneous postings — confirm the selection rule.
excluded_speed_limits = [3, 9, 99, 39, 1, 2, 32, 33, 6, 24, 11, 34, 18, 12, 36, 7, 14, 16, 38, 31, 22, 23, 63, 4, 26]

# One boolean mask replaces 25 separate in-place drops, and the values no longer live in a
# variable called `list`, which hid the builtin for every later cell.
# The surviving rows keep their original index labels, so the index has gaps afterwards.
keep_rows = ~df['POSTED_SPEED_LIMIT'].isin(excluded_speed_limits)
df = df.loc[keep_rows].copy()

In [37]:
# Check which columns remain after the column drop.
# NOTE(review): the saved output still lists STREET_DIRECTION and STREET_NAME, which the
# drop cell removes — the output predates the current drop list and is stale.
df.columns

Index(['CRASH_DATE', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE',
       'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
       'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT',
       'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'INTERSECTION_RELATED_I',
       'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION', 'STREET_NAME',
       'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'LOCATION'],
      dtype='object')

In [35]:
# Names of the categorical (text) columns that are integer-encoded in the next step.
text_col = [
    'TRAFFIC_CONTROL_DEVICE',
    'DEVICE_CONDITION',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'TRAFFICWAY_TYPE',
    'ALIGNMENT',
    'ROADWAY_SURFACE_COND',
    'ROAD_DEFECT',
    'INTERSECTION_RELATED_I',
    'NOT_RIGHT_OF_WAY_I',
    'HIT_AND_RUN_I',
    'PRIM_CONTRIBUTORY_CAUSE',
    'SEC_CONTRIBUTORY_CAUSE',
    'DOORING_I',
    'WORK_ZONE_I',
    'WORK_ZONE_TYPE',
]

In [39]:
# Integer-encode each categorical column with its own freshly fitted LabelEncoder and
# collect the encoded columns in `new_df`. The frame is built from plain arrays, so it
# gets a 0..n-1 index rather than the index labels of `df`.
new_df = pd.DataFrame(
    {label: LabelEncoder().fit_transform(df[label]) for label in text_col}
)
new_df

Unnamed: 0,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE
0,4,3,2,3,8,3,0,1,1,1,1,18,26,1,1,2
1,4,3,2,3,8,3,0,1,1,1,1,36,36,1,1,2
2,16,6,2,3,8,3,0,1,1,1,2,36,36,1,1,2
3,4,3,2,3,6,3,0,1,2,1,1,36,36,1,1,2
4,16,1,2,3,8,3,0,1,2,1,1,19,24,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398281,4,3,2,3,8,3,0,2,1,1,1,36,36,1,1,2
398282,16,1,2,3,8,3,0,1,1,1,1,19,17,1,1,2
398283,4,3,2,3,12,3,0,5,1,1,2,36,26,1,1,2
398284,4,3,2,3,2,3,0,1,1,1,1,19,26,1,1,2


In [None]:
# Encoder for the crash type; categories not seen during fit are ignored at transform time.
ohe = OneHotEncoder(handle_unknown='ignore')

# Pull out the crash-type column and wrap it in a one-column frame for the encoder.
crash_type = df['FIRST_CRASH_TYPE']
crash_df = crash_type.to_frame(name='FIRST_CRASH_TYPE')

In [None]:
# Fit the encoder on the crash-type column and turn the sparse result into a dense frame.
# `crash_df` is overwritten here, so the cell cannot be re-run without re-running the
# previous one first. Columns are numbered 0..k-1 and the index restarts at 0.
crash_onehot = ohe.fit_transform(crash_df[['FIRST_CRASH_TYPE']])
crash_df = pd.DataFrame(crash_onehot.toarray())
crash_df

In [None]:
# Labels for the one-hot columns, taken from the fitted encoder so that they are in the
# same order as the columns it produced. Series.unique() returns values in order of first
# appearance, while OneHotEncoder orders its output by sorted category, so using unique()
# attached the wrong crash-type name to most columns.
crash_col = ohe.categories_[0]

In [None]:
# Name the one-hot columns with the crash-type labels held in crash_col. The labels are
# applied positionally, so crash_col must be in the same order as the encoder's columns.
crash_df.columns = crash_col

In [None]:
# Display the one-hot frame again, now with named columns.
crash_df

In [None]:
# Put the one-hot crash-type columns next to the original features.
# concat(axis=1) aligns rows by index label. `df` still carries its original labels (with
# gaps where rows were dropped) while `crash_df` is numbered 0..n-1, so concatenating them
# directly pairs up the wrong rows and pads the rest with NaN. Both frames have the same
# row order, so renumbering both makes the join positional.
assert len(df) == len(crash_df), "df and crash_df must have one row per crash"
result = pd.concat(
    [df.reset_index(drop=True), crash_df.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Target: 1.0 where the crash type is 'REAR END', 0.0 otherwise.
rear_end = result['REAR END']
y = rear_end

# Features: everything except the raw crash type, the raw date string, and all one-hot
# crash-type columns. The one-hot columns include the target itself, and the remaining
# ones determine it, so leaving them in X leaks the answer to the model.
# NOTE(review): X still contains the text columns of `df`; the label-encoded `new_df`
# is not merged in by any visible cell — confirm how the categorical features should enter.
X = result.drop(columns=['FIRST_CRASH_TYPE', 'CRASH_DATE', *crash_df.columns])

In [None]:
# Show how many samples fall into each value of the target to check the class balance.
y.value_counts()

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# train_test_split returns the splits array by array — (X_train, X_test, y_train, y_test) —
# so the names must be unpacked in that order. The previous order bound the test features
# to `y_train` and the training labels to `X_test`.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.20)

In [None]:
# Fit a decision tree on the training split; random_state=1 is passed to the estimator.
# fit() returns the estimator itself, so construction and fitting are chained.
dtc = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
dtc

In [None]:
# Predict on the same data the tree was fitted on (in-sample predictions).
y_train_pred = dtc.predict(X_train)