## problem statement

Stakeholder: City planner / City

Problem to solve: What factors contribute to traffic incidents that result in death in Chicago at night during a full moon?

single drivers? multiple vehicles?

What factors contribute to traffic incidents that result in death/incapacitation in Chicago during rush hour (5pm-7pm)

What will our model produce?: Our model will determine which primary contributors are most relevant in a car accident in Chicago

In [1]:
import pandas as pd

In [2]:
crash_df = pd.read_csv('data/crashes.csv')

In [3]:
people_df = pd.read_csv('data/people.csv', low_memory=False)

In [4]:
vehicle_df = pd.read_csv('data/vehicles.csv', low_memory=False)

## dropping columns

In [25]:
crash_df_dropped = crash_df[['CRASH_RECORD_ID', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 
                                          'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'LANE_CNT', 
                                          'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'NOT_RIGHT_OF_WAY_I',
                                          'HIT_AND_RUN_I', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 
                                          'DOORING_I','WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 
                                          'INJURIES_FATAL', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 
                                          'LATITUDE', 'LONGITUDE']]

In [26]:
people_df_dropped = people_df[['CRASH_RECORD_ID', 'PERSON_TYPE', 'AGE', 'DRIVERS_LICENSE_STATE',
                                'DRIVER_ACTION', 'DRIVER_VISION', 'BAC_RESULT VALUE', 
                                'INJURY_CLASSIFICATION']]

In [27]:
vehicle_df_dropped = vehicle_df[['CRASH_RECORD_ID', 'VEHICLE_YEAR', 'MANEUVER']]

## subset of crashes between 10pm and 5am

In [28]:
night_time_df = crash_df_dropped.copy()

In [29]:
night_time_df['INJURIES_FATAL'].value_counts()

0.0    552535
1.0       526
2.0        31
3.0         5
4.0         1
Name: INJURIES_FATAL, dtype: int64

In [30]:
night_time_df = night_time_df[(night_time_df['CRASH_HOUR'] >= 22) | (night_time_df['CRASH_HOUR'] <= 5)]
night_time_df.columns

Index(['CRASH_RECORD_ID', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'LANE_CNT',
       'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'NOT_RIGHT_OF_WAY_I',
       'HIT_AND_RUN_I', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE',
       'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I',
       'INJURIES_FATAL', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LATITUDE', 'LONGITUDE'],
      dtype='object')

## joining

In [31]:
night_time_df.shape, people_df.shape

((81180, 22), (1224613, 30))

In [32]:
people_df['CRASH_RECORD_ID'].is_unique

False

In [33]:
merge1 = pd.merge(night_time_df, people_df, how='left', on='CRASH_RECORD_ID')

In [34]:
merge1

Unnamed: 0,CRASH_RECORD_ID,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,LANE_CNT,ROADWAY_SURFACE_COND,ROAD_DEFECT,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,...,EMS_RUN_NO,DRIVER_ACTION,DRIVER_VISION,PHYSICAL_CONDITION,PEDPEDAL_ACTION,PEDPEDAL_VISIBILITY,PEDPEDAL_LOCATION,BAC_RESULT,BAC_RESULT VALUE,CELL_PHONE_USE
0,f8960f698e870ebdc60b521b2a141a5395556bc3704191...,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,,DRY,NO DEFECTS,,Y,...,,UNKNOWN,UNKNOWN,UNKNOWN,,,,TEST NOT OFFERED,,
1,f8960f698e870ebdc60b521b2a141a5395556bc3704191...,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,,DRY,NO DEFECTS,,Y,...,,,,,,,,,,
2,f8960f698e870ebdc60b521b2a141a5395556bc3704191...,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,,DRY,NO DEFECTS,,Y,...,,,,,,,,,,
3,00e47f189660cd8ba1e85fc63061bf1d8465184393f134...,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",4.0,DRY,NO DEFECTS,,,...,,UNKNOWN,NOT OBSCURED,NORMAL,,,,TEST NOT OFFERED,,
4,00e47f189660cd8ba1e85fc63061bf1d8465184393f134...,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",4.0,DRY,NO DEFECTS,,,...,,UNKNOWN,NOT OBSCURED,NORMAL,,,,TEST NOT OFFERED,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164914,c18e3ac27d367f6c0a7c71cfff3a9f21acc6557d4b04a4...,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",,DRY,NO DEFECTS,,Y,...,,,,,,,,,,
164915,c18e3ac27d367f6c0a7c71cfff3a9f21acc6557d4b04a4...,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",,DRY,NO DEFECTS,,Y,...,,,,,,,,,,
164916,c18e3ac27d367f6c0a7c71cfff3a9f21acc6557d4b04a4...,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",,DRY,NO DEFECTS,,Y,...,,,,,,,,,,
164917,c18e3ac27d367f6c0a7c71cfff3a9f21acc6557d4b04a4...,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",,DRY,NO DEFECTS,,Y,...,,,,,,,,,,


## exploring columns

### injury in people df

In [35]:
# includes all people including cyclists, passengers, drivers, etc

In [36]:
merge1['INJURY_CLASSIFICATION'].value_counts()

NO INDICATION OF INJURY     145253
NONINCAPACITATING INJURY     11441
REPORTED, NOT EVIDENT         4822
INCAPACITATING INJURY         2523
FATAL                          300
Name: INJURY_CLASSIFICATION, dtype: int64

In [37]:
merge1['INJURY_CLASSIFICATION'].count()

164339

In [38]:
# exploring driver vision
merge1['DRIVER_VISION'].value_counts()

UNKNOWN                   69104
NOT OBSCURED              56238
OTHER                      1086
WINDSHIELD (WATER/ICE)      503
MOVING VEHICLES             289
PARKED VEHICLES             210
TREES, PLANTS                51
BLINDED - HEADLIGHTS         28
BUILDINGS                    18
HILLCREST                    14
EMBANKMENT                   12
BLOWING MATERIALS             9
SIGNBOARD                     4
BLINDED - SUNLIGHT            2
Name: DRIVER_VISION, dtype: int64

In [39]:
# exploring road surface cond
merge1['ROADWAY_SURFACE_COND'].value_counts()

DRY                118868
WET                 27039
UNKNOWN             11689
SNOW OR SLUSH        5543
ICE                  1267
OTHER                 445
SAND, MUD, DIRT        68
Name: ROADWAY_SURFACE_COND, dtype: int64

In [40]:
# exploring lighting cond
merge1['LIGHTING_CONDITION'].value_counts()

DARKNESS, LIGHTED ROAD    126219
DARKNESS                   19429
DAYLIGHT                    7312
UNKNOWN                     5929
DAWN                        3962
DUSK                        2068
Name: LIGHTING_CONDITION, dtype: int64

## create feature columns

In [49]:
# bin unknown with darkness
merge1.loc[merge1['LIGHTING_CONDITION'] == 'UNKNOWN', 'LIGHTING_CONDITION'] = 'DARKNESS'

In [48]:
merge1['LIGHTING_CONDITION'].value_counts()

DARKNESS, LIGHTED ROAD    126219
DARKNESS                   25358
DAYLIGHT                    7312
DAWN                        3962
DUSK                        2068
Name: LIGHTING_CONDITION, dtype: int64

## create target column

In [209]:
# fatal / incapacitate = 1
merge1.loc[(merge1['INJURY_CLASSIFICATION'] == 'FATAL') | 
           (merge1['INJURY_CLASSIFICATION'] == 'INCAPACITATING INJURY'), 'INJURY_CLASSIFICATION'] = 1

# else = 0
merge1.loc[(merge1['INJURY_CLASSIFICATION'] == 'NO INDICATION OF INJURY') | 
           (merge1['INJURY_CLASSIFICATION'] == 'NONINCAPACITATING INJURY') |
           (merge1['INJURY_CLASSIFICATION'] == 'REPORTED, NOT EVIDENT'), 'INJURY_CLASSIFICATION'] = 0

merge1['INJURY_CLASSIFICATION'].fillna(0, inplace=True)

In [210]:
merge1['INJURY_CLASSIFICATION'].value_counts() # total 164,339

0    162096
1      2823
Name: INJURY_CLASSIFICATION, dtype: int64

## first model?

In [112]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [211]:
X = pd.DataFrame(merge1['LIGHTING_CONDITION'])
y = merge1['INJURY_CLASSIFICATION']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

In [212]:
y_train.isna().value_counts()

False    131935
Name: INJURY_CLASSIFICATION, dtype: int64

In [213]:
y_test.isna().value_counts()

False    32984
Name: INJURY_CLASSIFICATION, dtype: int64

In [214]:
ohe = OneHotEncoder(sparse=True)

ohe.fit_transform(X_train).toarray()

array([[0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [217]:
column_names_lighting = X_train.value_counts().index.tolist()

X_train = pd.DataFrame(ohe.fit_transform(X_train).toarray())

In [218]:
ohe.get_feature_names()

array(['x0_DARKNESS', 'x0_DARKNESS, LIGHTED ROAD', 'x0_DAWN',
       'x0_DAYLIGHT', 'x0_DUSK'], dtype=object)

In [219]:
y_train.value_counts()

0    129667
1      2268
Name: INJURY_CLASSIFICATION, dtype: int64

In [220]:
X_train.value_counts()

0    1    2    3    4  
0.0  1.0  0.0  0.0  0.0    101137
1.0  0.0  0.0  0.0  0.0     20126
0.0  0.0  0.0  1.0  0.0      5868
          1.0  0.0  0.0      3160
          0.0  0.0  1.0      1644
dtype: int64

In [221]:
X_train.isna().value_counts()

0      1      2      3      4    
False  False  False  False  False    131935
dtype: int64

In [222]:
y_train.isna().value_counts()

False    131935
Name: INJURY_CLASSIFICATION, dtype: int64

In [223]:
lr = LogisticRegression(random_state=24)

lr.fit(X_train, y_train)

LogisticRegression(random_state=24)