# Data Cleaning

## Data Sources

[crashes](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if)

[vehicles](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Vehicles/68nd-jvt3)

[people](https://data.cityofchicago.org/Transportation/Traffic-Crashes-People/u6pd-qa9d)

## Libraries

In [25]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

## Importing The Data

In [26]:
# Load the crashes and vehicles CSV exports.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which prevents mixed-type columns.
# NOTE(review): the saved output of this cell shows that a warning was raised
# during the load; presumably pandas' mixed-type DtypeWarning — confirm it is
# gone after re-running.
d_crashes = pd.read_csv('data/Traffic_Crashes_-_Crashes.csv', low_memory=False)
d_vehicles = pd.read_csv('data/Traffic_Crashes_-_Vehicles.csv', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [57]:
# (rows, columns) of the raw crashes table
d_crashes.shape

(732329, 49)

In [58]:
# (rows, columns) of the raw vehicles table
d_vehicles.shape

(1491796, 72)

### Merging The Frames

In [35]:
# Join the vehicle records onto the crash records via the shared CRASH_RECORD_ID key.
# No `how` is given, so pandas' default inner join applies; non-key columns that
# exist in both frames receive the default _x / _y suffixes.
df = d_crashes.merge(d_vehicles, on='CRASH_RECORD_ID')

In [59]:
# Size of the merged frame.
# NOTE(review): the saved output was produced by execution 59, i.e. after the
# column-drop cell (execution 38) had already run, so it apparently shows the
# post-drop column count rather than the raw merge result.
df.shape

(1491794, 60)

In [38]:
# Columns removed from the merged frame before any cleaning.
# The blank-line groups follow the original ordering of this list.
COLUMNS_TO_DROP = [
    'CMV_ID',
    'USDOT_NO',
    'CCMC_NO',
    'ILCC_NO',
    'COMMERCIAL_SRC',
    'GVWR',
    'CARRIER_NAME',
    'CARRIER_STATE',
    'CARRIER_CITY',
    'HAZMAT_PLACARDS_I',
    'HAZMAT_NAME',
    'UN_NO',
    'HAZMAT_PRESENT_I',
    'HAZMAT_REPORT_I',
    'HAZMAT_REPORT_NO',
    'MCS_REPORT_I',
    'MCS_REPORT_NO',
    'HAZMAT_VIO_CAUSE_CRASH_I',
    'MCS_VIO_CAUSE_CRASH_I',
    'IDOT_PERMIT_NO',
    'WIDE_LOAD_I',
    'TRAILER1_WIDTH',
    'TRAILER2_WIDTH',
    'TRAILER1_LENGTH',
    'TRAILER2_LENGTH',
    'TOTAL_VEHICLE_LENGTH',
    'AXLE_CNT',
    'VEHICLE_CONFIG',
    'CARGO_BODY_TYPE',
    'LOAD_TYPE',
    'HAZMAT_OUT_OF_SERVICE_I',
    'MCS_OUT_OF_SERVICE_I',
    'HAZMAT_CLASS',

    'PHOTOS_TAKEN_I',
    'STATEMENTS_TAKEN_I',
    'DOORING_I',
    'WORK_ZONE_I',
    'WORK_ZONE_TYPE',
    'WORKERS_PRESENT_I',

    'TOWED_I',
    'FIRE_I',

    'TOWED_BY',
    'TOWED_TO',
    'AREA_00_I',
    'AREA_01_I',
    'AREA_02_I',
    'AREA_03_I',
    'AREA_04_I',
    'AREA_05_I',
    'AREA_06_I',
    'AREA_07_I',
    'AREA_08_I',
    'AREA_09_I',
    'AREA_10_I',
    'AREA_11_I',
    'AREA_12_I',
    'AREA_99_I',

    'BEAT_OF_OCCURRENCE',
    'VEHICLE_ID',
    'LANE_CNT',
]

# Reassign instead of dropping in place, and tolerate labels that are already
# gone, so this cell can be re-run without raising KeyError.
# Trade-off: errors='ignore' also silences a misspelled label, so check the
# resulting df.shape after running.
df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [39]:
# Confirm the frame's (rows, columns) after the column drop
df.shape

(1491794, 60)

Okay, so all the columns we don't need have been dropped.

The remaining columns still need to be cleaned up.

ugh

In [40]:
# Save a transposed preview of df.head() (one row per column) to CSV
head_preview = df.head().transpose()
head_preview.to_csv('data/df_head.csv')

In [41]:
# Partition the columns by dtype: object-typed columns go to df_cat,
# every other dtype goes to df_num
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

# Looking into the categorical columns

In [42]:
# Number of missing values in each object-typed column
df_cat.isna().sum()

CRASH_RECORD_ID                  0
RD_NO_x                       8978
CRASH_DATE_EST_I           1379456
CRASH_DATE_x                     0
TRAFFIC_CONTROL_DEVICE           0
DEVICE_CONDITION                 0
WEATHER_CONDITION                0
LIGHTING_CONDITION               0
FIRST_CRASH_TYPE                 0
TRAFFICWAY_TYPE                  0
ALIGNMENT                        0
ROADWAY_SURFACE_COND             0
ROAD_DEFECT                      0
REPORT_TYPE                  47778
CRASH_TYPE                       0
INTERSECTION_RELATED_I     1144582
NOT_RIGHT_OF_WAY_I         1430860
HIT_AND_RUN_I              1017966
DAMAGE                           0
DATE_POLICE_NOTIFIED             0
PRIM_CONTRIBUTORY_CAUSE          0
SEC_CONTRIBUTORY_CAUSE           0
STREET_DIRECTION                 8
STREET_NAME                      2
MOST_SEVERE_INJURY            2663
LOCATION                      9373
RD_NO_y                       9691
CRASH_DATE_y                     0
UNIT_TYPE           

In [55]:
# Dump the frequency table of every column in df_cat: the column name, its
# value counts, then an empty line as separator
for column_name in df_cat.columns:
    print(column_name, df_cat[column_name].value_counts(), '', sep='\n')

CRASH_RECORD_ID
645cdd6fd3ed2f043eefdc48230464a8ad66127691a2a2781240f5c7570ff5a87aaa6cf887bc1f76e93cfcb770359fb8136d59a40d8c18b62133fec286670279    18
e4f07da854d2b7be0c4c0903296e6f1a1f0109ddebca9d5b5dc399ffc698f2118f0f658ad5b1e3559714e0a3e2550720e3cb88e9753ce9a93ca3a20fa553fed7    18
313777c940c68d531b001269eab36bde9d156e2423cc256d7df03cb2405b50c7e079c26ffc29cc338275d71ed333b069fb95056fa81364c64e71cc27d9f1bc49    18
fb8d94334f5987710b7aa9cb4b0302f8e4c8f5f6e6d9f50ff0502a143737b3bfe95c37d970bcab190390797ad5c8fb6a9b6909f73186488c9b91267a4b844c65    16
d4c3f39de39d4f08abf19fb11df98628adc0f6024c7d10d266df234e93f6241fd3f993e1cf002806cced9c88c0cf839fd7cc91ec161e544c4273ed2c7186fd97    16
                                                                                                                                    ..
70b3b935ff4c138023dca2e4af16910e5368fc2766fe4e614f41f8e9fcdea99f9364dfa3072486cf0d1467c66cc3e00950adadca0b99ae82dcbe6cc2a350c1af     1
39a3717359a2eb8eaf98764f790ca9a4acd6ab2

In [None]:
# Object-typed columns to leave out of the categorical frame.
# NOTE(review): this cell has no execution count in the saved notebook —
# confirm that every label below exists in df_cat before relying on it.
cat_cols_to_drop = [
    'RD_NO_x',
    'CRASH_DATE_EST_I',
    'REPORT_TYPE',
    'STREET_DIRECTION',
    'STREET_NAME',
    'LOCATION',
    'RD_NO_y',
    'CMRC_VEH_I',
    'LIC_PLATE_STATE',
    'TRAVEL_DIRECTION',
]
df_cat_dropped_cols = df_cat.drop(columns=cat_cols_to_drop)

# Preprocessing Pipelines:

In [44]:
# Numeric preprocessing: fill missing values with the constant 0,
# then standardize every feature
numeric_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
]
preprocessing_pipeline_num = Pipeline(steps=numeric_steps)

In [45]:
# put df_num through preprocessing_pipeline_num
# The pipeline is fitted on and applied to the same frame; the result is an
# array, not a DataFrame.
# NOTE(review): the zero-fill and scaling apply to every column of df_num,
# which includes LATITUDE / LONGITUDE and id-like columns such as
# CRASH_UNIT_ID — confirm that is intended.
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [46]:
# Inspect the transformed values (an array at this point, so no column labels)
df_num_processed

array([[ 0.25409834, -1.09426593, -0.23036555, ...,  1.18327123,
         0.46773941,  1.19695084],
       [ 0.25409834, -1.09426593, -0.23036555, ..., -0.32943142,
         0.45882345, -0.06903189],
       [ 0.25409834,  0.39233957, -0.23036555, ..., -0.32943142,
         0.44863378, -0.06903189],
       ...,
       [-1.38371563,  0.0578967 , -0.23036555, ...,  1.18327123,
         0.45500232,  1.19695084],
       [-1.38371563,  0.0578967 , -0.23036555, ..., -0.32943142,
         0.46009716, -0.06903189],
       [-1.38371563,  0.0578967 , -0.23036555, ..., -0.32943142,
         0.44226524, -1.33501463]])

In [47]:
# Turn df_num_processed back into a DataFrame, restoring df_num's column names.
# Passing df_num's index as well keeps the rows aligned with df_num and df_cat
# (both selected from df) instead of resetting to a fresh 0..n-1 index.
df_num_processed = pd.DataFrame(
    df_num_processed,
    columns=df_num.columns,
    index=df_num.index,
)

In [48]:
# Check df_num_processed for nulls
# (the imputer step fills missing values with 0, so every count should be 0)
df_num_processed.isna().sum()

POSTED_SPEED_LIMIT               0
STREET_NO                        0
NUM_UNITS                        0
INJURIES_TOTAL                   0
INJURIES_FATAL                   0
INJURIES_INCAPACITATING          0
INJURIES_NON_INCAPACITATING      0
INJURIES_REPORTED_NOT_EVIDENT    0
INJURIES_NO_INDICATION           0
INJURIES_UNKNOWN                 0
CRASH_HOUR                       0
CRASH_DAY_OF_WEEK                0
CRASH_MONTH                      0
LATITUDE                         0
LONGITUDE                        0
CRASH_UNIT_ID                    0
UNIT_NO                          0
NUM_PASSENGERS                   0
VEHICLE_YEAR                     0
OCCUPANT_CNT                     0
dtype: int64