# Data Sources

Build and explain a model that predicts the primary contributory cause of car accidents using a dataset from the City of Chicago. Aim for a proof of concept, focusing on model interpretability and iterating to find the best approach. Evaluation involves choosing an appropriate metric and using cross-validation to validate the results.

[crashes](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if)

[vehicles](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Vehicles/68nd-jvt3)

[people](https://data.cityofchicago.org/Transportation/Traffic-Crashes-People/u6pd-qa9d)

Don't forget about Tableau visualizations.

In [40]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [41]:
# Load the raw crash-level and vehicle-level CSV extracts from the local data/ folder.
# NOTE(review): the saved output of this cell contains a truncated warning; if it is a
# pandas DtypeWarning, consider passing explicit dtype= arguments — confirm on re-run.
crashes_csv_path = 'data/Traffic_Crashes_-_Crashes.csv'
vehicles_csv_path = 'data/Traffic_Crashes_-_Vehicles.csv'

d_crashes = pd.read_csv(crashes_csv_path)
d_vehicles = pd.read_csv(vehicles_csv_path)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [42]:
# Write the per-column missing-value counts of the crashes table to disk for offline review.
crashes_null_counts = d_crashes.isna().sum()
crashes_null_counts.to_csv('data/crashes_nulls.csv')

In [43]:
# (rows, columns) of the raw crashes table.
d_crashes.shape

(732329, 49)

In [44]:
# Number of missing values in each column of the raw crashes table.
d_crashes.isna().sum()

CRASH_RECORD_ID                       0
RD_NO                              4669
CRASH_DATE_EST_I                 677005
CRASH_DATE                            0
POSTED_SPEED_LIMIT                    0
TRAFFIC_CONTROL_DEVICE                0
DEVICE_CONDITION                      0
WEATHER_CONDITION                     0
LIGHTING_CONDITION                    0
FIRST_CRASH_TYPE                      0
TRAFFICWAY_TYPE                       0
LANE_CNT                         533326
ALIGNMENT                             0
ROADWAY_SURFACE_COND                  0
ROAD_DEFECT                           0
REPORT_TYPE                       20665
CRASH_TYPE                            0
INTERSECTION_RELATED_I           564347
NOT_RIGHT_OF_WAY_I               698193
HIT_AND_RUN_I                    504200
DAMAGE                                0
DATE_POLICE_NOTIFIED                  0
PRIM_CONTRIBUTORY_CAUSE               0
SEC_CONTRIBUTORY_CAUSE                0
STREET_NO                             0


In [45]:
# (rows, columns) of the raw vehicles table.
d_vehicles.shape

(1491796, 72)

In [46]:
# Number of missing values in each column of the raw vehicles table
# (the displayed Series is truncated; the full list is written to CSV in a later cell).
d_vehicles.isna().sum()

CRASH_UNIT_ID                    0
CRASH_RECORD_ID                  0
RD_NO                         9693
CRASH_DATE                       0
UNIT_NO                          0
                            ...   
CARGO_BODY_TYPE            1479527
LOAD_TYPE                  1480065
HAZMAT_OUT_OF_SERVICE_I    1481133
MCS_OUT_OF_SERVICE_I       1480888
HAZMAT_CLASS               1490765
Length: 72, dtype: int64

In [47]:
# Write the per-column missing-value counts of the vehicles table to disk for offline review.
vehicles_null_counts = d_vehicles.isna().sum()
vehicles_null_counts.to_csv('data/vehicles_nulls.csv')

In [48]:
# Size check: total number of rows across the two raw tables
# (this is not the row count of the merged frame).
print(len(d_crashes) + len(d_vehicles))

2224125


In [49]:
# Size check: total number of columns across the two raw tables, counting shared names twice.
print(len(d_crashes.columns) + len(d_vehicles.columns))

121


In [50]:
# Attach crash-level attributes to every vehicle/unit record by joining on CRASH_RECORD_ID.
#
# - how='inner' is the pandas default, written out so it is obvious that rows
#   without a match on either side are discarded.
# - validate='one_to_many' makes pandas raise MergeError if CRASH_RECORD_ID is
#   duplicated in d_crashes; without the check such duplicates would silently
#   multiply rows in the merged frame.
# - Non-key columns that exist in both inputs keep the default '_x' (crashes) /
#   '_y' (vehicles) suffixes, e.g. RD_NO_x / RD_NO_y.
df = pd.merge(
    d_crashes,
    d_vehicles,
    on='CRASH_RECORD_ID',
    how='inner',
    validate='one_to_many',
)

In [51]:
# (rows, columns) of the merged frame; the join key CRASH_RECORD_ID appears only once,
# so the column count is one less than the sum of the two input tables.
df.shape

(1491794, 120)

In [52]:
# Write the per-column missing-value counts of the merged frame to disk for offline review.
merged_null_counts = df.isna().sum()
merged_null_counts.to_csv('data/df_nulls.csv')

In [53]:
# Columns removed from the merged frame before modelling (60 in total).
# The null-count outputs earlier in this notebook show that several of them are
# almost entirely empty (e.g. HAZMAT_CLASS, CARGO_BODY_TYPE, LANE_CNT).
# NOTE(review): the rationale for the remaining names is presumably the same —
# confirm against data/df_nulls.csv.
# The blank-line grouping mirrors the original cell.
COLUMNS_TO_DROP = [
    'CMV_ID',
    'USDOT_NO',
    'CCMC_NO',
    'ILCC_NO',
    'COMMERCIAL_SRC',
    'GVWR',
    'CARRIER_NAME',
    'CARRIER_STATE',
    'CARRIER_CITY',
    'HAZMAT_PLACARDS_I',
    'HAZMAT_NAME',
    'UN_NO',
    'HAZMAT_PRESENT_I',
    'HAZMAT_REPORT_I',
    'HAZMAT_REPORT_NO',
    'MCS_REPORT_I',
    'MCS_REPORT_NO',
    'HAZMAT_VIO_CAUSE_CRASH_I',
    'MCS_VIO_CAUSE_CRASH_I',
    'IDOT_PERMIT_NO',
    'WIDE_LOAD_I',
    'TRAILER1_WIDTH',
    'TRAILER2_WIDTH',
    'TRAILER1_LENGTH',
    'TRAILER2_LENGTH',
    'TOTAL_VEHICLE_LENGTH',
    'AXLE_CNT',
    'VEHICLE_CONFIG',
    'CARGO_BODY_TYPE',
    'LOAD_TYPE',
    'HAZMAT_OUT_OF_SERVICE_I',
    'MCS_OUT_OF_SERVICE_I',
    'HAZMAT_CLASS',

    'PHOTOS_TAKEN_I',
    'STATEMENTS_TAKEN_I',
    'DOORING_I',
    'WORK_ZONE_I',
    'WORK_ZONE_TYPE',
    'WORKERS_PRESENT_I',

    'TOWED_I',
    'FIRE_I',

    'TOWED_BY',
    'TOWED_TO',
    'AREA_00_I',
    'AREA_01_I',
    'AREA_02_I',
    'AREA_03_I',
    'AREA_04_I',
    'AREA_05_I',
    'AREA_06_I',
    'AREA_07_I',
    'AREA_08_I',
    'AREA_09_I',
    'AREA_10_I',
    'AREA_11_I',
    'AREA_12_I',
    'AREA_99_I',

    'BEAT_OF_OCCURRENCE',
    'VEHICLE_ID',
    'LANE_CNT',
]

# Rebind `df` to a new frame instead of mutating in place, and skip names that are
# already absent, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [54]:
# (rows, columns) of the merged frame after the column drop; the row count is unchanged.
df.shape

(1491794, 60)

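The 60 columns judged not useful for this model have now been dropped, leaving 60.

The remaining columns still need to be cleaned up before modeling — for example, the missing values and the duplicated `_x`/`_y` columns (`RD_NO`, `CRASH_DATE`) produced by the merge.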

In [55]:
# Save the first rows of df to CSV, transposed so that each original column
# becomes one row of the file (easier to scan with this many columns).
df.head().transpose().to_csv('data/df_head.csv')

In [56]:
# Partition df by dtype: object columns (treated as categorical) go to df_cat,
# every non-object column goes to df_num.
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

In [57]:
# Number of missing values in each object-dtype column of the merged frame.
df_cat.isna().sum()

CRASH_RECORD_ID                  0
RD_NO_x                       8978
CRASH_DATE_EST_I           1379456
CRASH_DATE_x                     0
TRAFFIC_CONTROL_DEVICE           0
DEVICE_CONDITION                 0
WEATHER_CONDITION                0
LIGHTING_CONDITION               0
FIRST_CRASH_TYPE                 0
TRAFFICWAY_TYPE                  0
ALIGNMENT                        0
ROADWAY_SURFACE_COND             0
ROAD_DEFECT                      0
REPORT_TYPE                  47778
CRASH_TYPE                       0
INTERSECTION_RELATED_I     1144582
NOT_RIGHT_OF_WAY_I         1430860
HIT_AND_RUN_I              1017966
DAMAGE                           0
DATE_POLICE_NOTIFIED             0
PRIM_CONTRIBUTORY_CAUSE          0
SEC_CONTRIBUTORY_CAUSE           0
STREET_DIRECTION                 8
STREET_NAME                      2
MOST_SEVERE_INJURY            2663
LOCATION                      9373
RD_NO_y                       9691
CRASH_DATE_y                     0
UNIT_TYPE           

In [58]:
# Number of missing values in each non-object column of the merged frame.
df_num.isna().sum()

POSTED_SPEED_LIMIT                     0
STREET_NO                              0
NUM_UNITS                              0
INJURIES_TOTAL                      2642
INJURIES_FATAL                      2642
INJURIES_INCAPACITATING             2642
INJURIES_NON_INCAPACITATING         2642
INJURIES_REPORTED_NOT_EVIDENT       2642
INJURIES_NO_INDICATION              2642
INJURIES_UNKNOWN                    2642
CRASH_HOUR                             0
CRASH_DAY_OF_WEEK                      0
CRASH_MONTH                            0
LATITUDE                            9373
LONGITUDE                           9373
CRASH_UNIT_ID                          0
UNIT_NO                                0
NUM_PASSENGERS                   1270885
VEHICLE_YEAR                      269637
OCCUPANT_CNT                       33607
dtype: int64

# Pipelines:

In [None]:
# Two-step preprocessing: fill every missing value with the constant 0, then
# standardize the features.
# NOTE(review): a 0 fill for columns such as LATITUDE, LONGITUDE or VEHICLE_YEAR
# lies far outside their real range and will shift the scaler's statistics —
# confirm that a constant fill is intended before fitting.
zero_fill_imputer = SimpleImputer(strategy='constant', fill_value=0)
standard_scaler = StandardScaler()

preprocessing_pipeline = Pipeline(steps=[
    ('imputer', zero_fill_imputer),
    ('scaler', standard_scaler),
])