# Data Cleaning

## Data Sources

[crashes](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if)

[vehicles](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Vehicles/68nd-jvt3)

[people](https://data.cityofchicago.org/Transportation/Traffic-Crashes-People/u6pd-qa9d)

## Libraries

In [1]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [2]:
# imort 'data/merged.csv'
df = pd.read_csv('data/merged.csv')

  df = pd.read_csv('data/merged.csv')


## Initial Drop

In [3]:
'''
drom from df 
CMV_ID
USDOT_NO
CCMC_NO
ILCC_NO
COMMERCIAL_SRC
GVWR
CARRIER_NAME
CARRIER_STATE
CARRIER_CITY
HAZMAT_PLACARDS_I
HAZMAT_NAME
UN_NO
HAZMAT_PRESENT_I
HAZMAT_REPORT_I
HAZMAT_REPORT_NO
MCS_REPORT_I
MCS_REPORT_NO
HAZMAT_VIO_CAUSE_CRASH_I
MCS_VIO_CAUSE_CRASH_I
IDOT_PERMIT_NO
WIDE_LOAD_I
TRAILER1_WIDTH
TRAILER2_WIDTH
TRAILER1_LENGTH
TRAILER2_LENGTH
TOTAL_VEHICLE_LENGTH
AXLE_CNT
VEHICLE_CONFIG
CARGO_BODY_TYPE
LOAD_TYPE
HAZMAT_OUT_OF_SERVICE_I
MCS_OUT_OF_SERVICE_I
HAZMAT_CLASS
PHOTOS_TAKEN_I
STATEMENTS_TAKEN_I
DOORING_I
WORK_ZONE_I
WORK_ZONE_TYPE
WORKERS_PRESENT_I
TOWED_I
FIRE_I
TOWED_BY
TOWED_TO
AREA_00_I
AREA_01_I
AREA_02_I
AREA_03_I
AREA_04_I
AREA_05_I
AREA_06_I
AREA_07_I
AREA_08_I
AREA_09_I
AREA_10_I
AREA_11_I
AREA_12_I
AREA_99_I
'''

df.drop(columns=['CMV_ID',
                 'USDOT_NO',
                 'CCMC_NO',
                 'ILCC_NO',
                 'COMMERCIAL_SRC',
                 'GVWR',
                 'CARRIER_NAME',
                 'CARRIER_STATE',
                 'CARRIER_CITY',
                 'HAZMAT_PLACARDS_I',
                 'HAZMAT_NAME',
                 'UN_NO',
                 'HAZMAT_PRESENT_I',
                 'HAZMAT_REPORT_I',
                 'HAZMAT_REPORT_NO',
                 'MCS_REPORT_I',
                 'MCS_REPORT_NO',
                 'HAZMAT_VIO_CAUSE_CRASH_I',
                 'MCS_VIO_CAUSE_CRASH_I',
                 'IDOT_PERMIT_NO',
                 'WIDE_LOAD_I',
                 'TRAILER1_WIDTH',
                 'TRAILER2_WIDTH',
                 'TRAILER1_LENGTH',
                 'TRAILER2_LENGTH',
                 'TOTAL_VEHICLE_LENGTH',
                 'AXLE_CNT',
                 'VEHICLE_CONFIG',
                 'CARGO_BODY_TYPE',
                 'LOAD_TYPE',
                 'HAZMAT_OUT_OF_SERVICE_I',
                 'MCS_OUT_OF_SERVICE_I',
                 'HAZMAT_CLASS',
                 
                 'PHOTOS_TAKEN_I',
                 'STATEMENTS_TAKEN_I',
                 'DOORING_I',
                 'WORK_ZONE_I',
                 'WORK_ZONE_TYPE',
                 'WORKERS_PRESENT_I',
                 
                 'TOWED_I',
                 'FIRE_I',
                 
                 'TOWED_BY',
                 'TOWED_TO',
                 'AREA_00_I',
                 'AREA_01_I',
                 'AREA_02_I',
                 'AREA_03_I',
                 'AREA_04_I',
                 'AREA_05_I',
                 'AREA_06_I',
                 'AREA_07_I',
                 'AREA_08_I',
                 'AREA_09_I',
                 'AREA_10_I',
                 'AREA_11_I',
                 'AREA_12_I',
                 'AREA_99_I',
                 
                 'BEAT_OF_OCCURRENCE',
                 'VEHICLE_ID',
                 'LANE_CNT',
                 ], inplace=True)

In [4]:
df.shape

(1491794, 60)

In [5]:
# remove all rows where PRIM_CONTRIBUTORY_CAUSE is UNABLE TO DETERMINE and NOT APPLICABLE
df = df[df['PRIM_CONTRIBUTORY_CAUSE'] != 'NOT APPLICABLE']
df = df[df['PRIM_CONTRIBUTORY_CAUSE'] != 'UNABLE TO DETERMINE']



In [6]:
# check
df['PRIM_CONTRIBUTORY_CAUSE'].value_counts()

FAILING TO YIELD RIGHT-OF-WAY                                                       164915
FOLLOWING TOO CLOSELY                                                               151936
IMPROPER OVERTAKING/PASSING                                                          73063
FAILING TO REDUCE SPEED TO AVOID CRASH                                               66695
IMPROPER BACKING                                                                     58478
IMPROPER LANE USAGE                                                                  54856
IMPROPER TURNING/NO SIGNAL                                                           49008
DRIVING SKILLS/KNOWLEDGE/EXPERIENCE                                                  47434
DISREGARDING TRAFFIC SIGNALS                                                         30303
WEATHER                                                                              21949
OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER     20053

In [7]:
df.shape

(845294, 60)

In [8]:
df.head().T

Unnamed: 0,0,1,2,3,6
CRASH_RECORD_ID,79c7a2ce89f446262efd86df3d72d18b04ba487024b7c4...,79c7a2ce89f446262efd86df3d72d18b04ba487024b7c4...,792b539deaaad65ee5b4a9691d927a34d298eb33d42af0...,792b539deaaad65ee5b4a9691d927a34d298eb33d42af0...,05b1982cdba5d8a00e7e76ad1ecdab0e598429f78481d2...
RD_NO_x,JC199149,JC199149,JB422857,JB422857,JF378711
CRASH_DATE_EST_I,,,,,
CRASH_DATE_x,03/25/2019 02:43:00 PM,03/25/2019 02:43:00 PM,09/05/2018 08:40:00 AM,09/05/2018 08:40:00 AM,08/29/2022 11:30:00 AM
POSTED_SPEED_LIMIT,30,30,30,30,30
TRAFFIC_CONTROL_DEVICE,TRAFFIC SIGNAL,TRAFFIC SIGNAL,NO CONTROLS,NO CONTROLS,TRAFFIC SIGNAL
DEVICE_CONDITION,FUNCTIONING PROPERLY,FUNCTIONING PROPERLY,NO CONTROLS,NO CONTROLS,FUNCTIONING PROPERLY
WEATHER_CONDITION,CLEAR,CLEAR,CLEAR,CLEAR,CLEAR
LIGHTING_CONDITION,DAYLIGHT,DAYLIGHT,DAYLIGHT,DAYLIGHT,DAYLIGHT
FIRST_CRASH_TYPE,TURNING,TURNING,ANGLE,ANGLE,REAR END


# fixme I had to move this code to later, and it's redundant, so I just removed the dataframe from being merged back later

In [9]:
# '''Also drop the following columns:
# EXCEED_SPEED_LIMIT_I
# HIT_AND_RUN_I
# NOT_RIGHT_OF_WAY_I
# INTERSECTION_RELATED_I	
# '''

# df.drop(columns=['EXCEED_SPEED_LIMIT_I',
#                  'HIT_AND_RUN_I',
#                  'NOT_RIGHT_OF_WAY_I',
#                  'INTERSECTION_RELATED_I',
#                  ], inplace=True)

# df.shape

## Chopping down the data

There's just too much

Do after the initial clean to get a better sample. 

In [10]:
df.shape

(845294, 60)

In [11]:
df = df.sample(frac=0.25, random_state=42)

In [12]:
df.shape

(211324, 60)

In [13]:
# Define the target groups
target_groups = {
    'Failures to Follow Traffic Rules': [
        'FAILING TO YIELD RIGHT-OF-WAY',
        'FOLLOWING TOO CLOSELY',
        'IMPROPER OVERTAKING/PASSING',
        'FAILING TO REDUCE SPEED TO AVOID CRASH',
        'IMPROPER BACKING',
        'IMPROPER LANE USAGE',
        'IMPROPER TURNING/NO SIGNAL',
        'DISREGARDING TRAFFIC SIGNALS',
        'DISREGARDING STOP SIGN',
        'DISREGARDING OTHER TRAFFIC SIGNS',
        'DISREGARDING YIELD SIGN',
        'EXCEEDING SAFE SPEED FOR CONDITIONS', 
        'DRIVING ON WRONG SIDE/WRONG WAY', 
        'EXCEEDING AUTHORIZED SPEED LIMIT',
        'TURNING RIGHT ON RED', 
        'DISREGARDING ROAD MARKINGS'
    ],
    'Driver Behavior and Awareness': [
        'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE',
        'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER',
        'DISTRACTION - FROM INSIDE VEHICLE',
        'PHYSICAL CONDITION OF DRIVER',
        'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)',
        'DISTRACTION - FROM OUTSIDE VEHICLE',
        'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)',
        'CELL PHONE USE OTHER THAN TEXTING',
        'TEXTING', 
        'DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)'
    ],
    'Environmental Factors': [
        'WEATHER',
        'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)',
        'ROAD CONSTRUCTION/MAINTENANCE',
        'ROAD ENGINEERING/SURFACE/MARKING DEFECTS'
    ],
    'Vehicle-related Issues': [
        'EQUIPMENT - VEHICLE CONDITION'
    ],
    'Other Specific Circumstances': [
        'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST',
        'ANIMAL',
        'RELATED TO BUS STOP',
        'BICYCLE ADVANCING LEGALLY ON RED LIGHT',
        'OBSTRUCTED CROSSWALKS',
        'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT',
        'PASSING STOPPED SCHOOL BUS'
    ]
}

# Group the targets based on the defined groups
for group, targets in target_groups.items():
    df.loc[df['PRIM_CONTRIBUTORY_CAUSE'].isin(targets), 'PRIM_CONTRIBUTORY_CAUSE'] = group

# Cut out the df_targets with the grouped targets
df_targets = df[['PRIM_CONTRIBUTORY_CAUSE']]

# Print the unique grouped targets
print(df_targets['PRIM_CONTRIBUTORY_CAUSE'].unique())


['Driver Behavior and Awareness' 'Failures to Follow Traffic Rules'
 'Vehicle-related Issues' 'Other Specific Circumstances'
 'Environmental Factors']


## Cleaning Leftover Columns

### Split Into categorical and numerical

Now that they have been merged on CRASH_RECORD_ID, I'm going to remove that column, and create a new index

In [14]:
df = df.drop('CRASH_RECORD_ID', axis=1)

In [15]:
df = df.reset_index(drop=True)

In [16]:
index = df.index

In [17]:
print(index)

RangeIndex(start=0, stop=211324, step=1)


In [18]:
# split df into categorical and numerical dataframes
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [19]:
# df_cat = categorical_data.reset_index(drop=True)
# df_num = numerical_data.reset_index(drop=True)

#### Numerical Cleaning

Okay so everything that is left over with the numerical columns should be able to be filled with 0.

In [20]:
# check df_num for nulls
df_num.isnull().sum()

POSTED_SPEED_LIMIT                    0
STREET_NO                             0
NUM_UNITS                             0
INJURIES_TOTAL                      163
INJURIES_FATAL                      163
INJURIES_INCAPACITATING             163
INJURIES_NON_INCAPACITATING         163
INJURIES_REPORTED_NOT_EVIDENT       163
INJURIES_NO_INDICATION              163
INJURIES_UNKNOWN                    163
CRASH_HOUR                            0
CRASH_DAY_OF_WEEK                     0
CRASH_MONTH                           0
LATITUDE                           1507
LONGITUDE                          1507
CRASH_UNIT_ID                         0
UNIT_NO                               0
NUM_PASSENGERS                   175133
VEHICLE_YEAR                      29160
OCCUPANT_CNT                       4642
dtype: int64

In [21]:
# fill all missing values in df_num with 0
df_num.fillna(0, inplace=True)

#### categorical cleaning

In [22]:
df_cat.isna().sum()

RD_NO_x                      1251
CRASH_DATE_EST_I           200757
CRASH_DATE_x                    0
TRAFFIC_CONTROL_DEVICE          0
DEVICE_CONDITION                0
WEATHER_CONDITION               0
LIGHTING_CONDITION              0
FIRST_CRASH_TYPE                0
TRAFFICWAY_TYPE                 0
ALIGNMENT                       0
ROADWAY_SURFACE_COND            0
ROAD_DEFECT                     0
REPORT_TYPE                  6804
CRASH_TYPE                      0
INTERSECTION_RELATED_I     149918
NOT_RIGHT_OF_WAY_I         203603
HIT_AND_RUN_I              160607
DAMAGE                          0
DATE_POLICE_NOTIFIED            0
PRIM_CONTRIBUTORY_CAUSE         0
SEC_CONTRIBUTORY_CAUSE          0
STREET_DIRECTION                1
STREET_NAME                     0
MOST_SEVERE_INJURY            165
LOCATION                     1507
RD_NO_y                      1341
CRASH_DATE_y                    0
UNIT_TYPE                     298
CMRC_VEH_I                 206968
MAKE          

In [23]:
# print out the value_counts for every column in df_cat
for col in df_cat.columns:
    print(col)
    print(df_cat[col].value_counts())
    print('')

RD_NO_x
JE120677    6
JC547924    6
JG128293    5
JB424173    5
JE194195    5
           ..
JG133628    1
JF431106    1
JC405180    1
JE258768    1
JB560498    1
Name: RD_NO_x, Length: 180529, dtype: int64

CRASH_DATE_EST_I
Y    8909
N    1658
Name: CRASH_DATE_EST_I, dtype: int64

CRASH_DATE_x
11/10/2017 10:30:00 AM    16
12/29/2020 05:00:00 PM    12
07/13/2018 03:30:00 PM     9
11/06/2017 06:00:00 PM     9
11/10/2017 10:00:00 AM     9
                          ..
12/22/2017 07:35:00 PM     1
11/01/2022 08:30:00 AM     1
02/26/2017 03:00:00 PM     1
03/08/2016 01:35:00 PM     1
12/19/2018 10:30:00 PM     1
Name: CRASH_DATE_x, Length: 160057, dtype: int64

TRAFFIC_CONTROL_DEVICE
NO CONTROLS                 107299
TRAFFIC SIGNAL               70652
STOP SIGN/FLASHER            25732
UNKNOWN                       3978
OTHER                         1499
LANE USE MARKING               471
YIELD                          405
OTHER REG. SIGN                287
PEDESTRIAN CROSSING SIGN       17

In [24]:
df_cat_dropped_cols = df_cat.drop(columns=['RD_NO_x',
                'CRASH_DATE_EST_I', 
                'REPORT_TYPE', 
                'STREET_DIRECTION',
                'STREET_NAME',
                'LOCATION', 
                'RD_NO_y', 
                'CMRC_VEH_I', 
                'LIC_PLATE_STATE', 
                'TRAVEL_DIRECTION'
                 ])

In [25]:
df_cat_dropped_cols.isna().sum()

CRASH_DATE_x                    0
TRAFFIC_CONTROL_DEVICE          0
DEVICE_CONDITION                0
WEATHER_CONDITION               0
LIGHTING_CONDITION              0
FIRST_CRASH_TYPE                0
TRAFFICWAY_TYPE                 0
ALIGNMENT                       0
ROADWAY_SURFACE_COND            0
ROAD_DEFECT                     0
CRASH_TYPE                      0
INTERSECTION_RELATED_I     149918
NOT_RIGHT_OF_WAY_I         203603
HIT_AND_RUN_I              160607
DAMAGE                          0
DATE_POLICE_NOTIFIED            0
PRIM_CONTRIBUTORY_CAUSE         0
SEC_CONTRIBUTORY_CAUSE          0
MOST_SEVERE_INJURY            165
CRASH_DATE_y                    0
UNIT_TYPE                     298
MAKE                         4643
MODEL                        4669
VEHICLE_DEFECT               4642
VEHICLE_TYPE                 4642
VEHICLE_USE                  4642
MANEUVER                     4642
EXCEED_SPEED_LIMIT_I       210799
FIRST_CONTACT_POINT          5029
dtype: int64

## Getting target variable

In [26]:
'''
remove
PRIM_CONTRIBUTORY_CAUSE
and
SEC_CONTRIBUTORY_CAUSE
from df_cat_dropped_cols
assign it to it's own dataframe called df_targets,
and then export df_targets to csv
'''

# Specifically I'm going to make prim cause the target and remove secondary, maybe I'll come back to it if I have time.
df_targets = df_cat_dropped_cols[['PRIM_CONTRIBUTORY_CAUSE']]
df_cat_dropped_cols.drop(columns=['PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE'], inplace=True)
df_targets.to_csv('data/df_targets.csv')

In [27]:
df_targets.isna().sum()

PRIM_CONTRIBUTORY_CAUSE    0
dtype: int64

In [28]:
df_targets.dtypes

PRIM_CONTRIBUTORY_CAUSE    object
dtype: object

In [29]:
df_targets.value_counts()

# give me the value counts for each column in df_targets
for col in df_targets.columns:
    print(col)
    print(df_targets[col].value_counts())
    print('')

PRIM_CONTRIBUTORY_CAUSE
Failures to Follow Traffic Rules    172021
Driver Behavior and Awareness        27045
Environmental Factors                 8725
Vehicle-related Issues                2363
Other Specific Circumstances          1170
Name: PRIM_CONTRIBUTORY_CAUSE, dtype: int64



## splitting into ordinal and nominal

In [30]:
'''
assign to df_cat_ordinal 
INTERSECTION_RELATED_I
NOT_RIGHT_OF_WAY_I
HIT_AND_RUN_I
and EXCEED_SPEED_LIMIT_I
'''
df_cat_ordinal = df_cat_dropped_cols[['INTERSECTION_RELATED_I',
                                        'NOT_RIGHT_OF_WAY_I',
                                        'HIT_AND_RUN_I',
                                        'EXCEED_SPEED_LIMIT_I'
                                        ]]

# assign the rest to nominal
df_cat_nominal = df_cat_dropped_cols.drop(columns=['INTERSECTION_RELATED_I',
                                        'NOT_RIGHT_OF_WAY_I',
                                        'HIT_AND_RUN_I',
                                        'EXCEED_SPEED_LIMIT_I'
                                        ])

### ordinal

In [31]:
# list nulls in ordinal
df_cat_ordinal.isna().sum()

INTERSECTION_RELATED_I    149918
NOT_RIGHT_OF_WAY_I        203603
HIT_AND_RUN_I             160607
EXCEED_SPEED_LIMIT_I      210799
dtype: int64

In [32]:
for col in df_cat_ordinal.columns:
    print(col)
    print(df_cat_ordinal[col].value_counts())
    print('')

INTERSECTION_RELATED_I
Y    58740
N     2666
Name: INTERSECTION_RELATED_I, dtype: int64

NOT_RIGHT_OF_WAY_I
Y    6958
N     763
Name: NOT_RIGHT_OF_WAY_I, dtype: int64

HIT_AND_RUN_I
Y    47982
N     2735
Name: HIT_AND_RUN_I, dtype: int64

EXCEED_SPEED_LIMIT_I
Y    407
N    118
Name: EXCEED_SPEED_LIMIT_I, dtype: int64



In [33]:
# fill all nulls in ordinal with 'N'
df_cat_ordinal_filled = df_cat_ordinal.fillna('N')

In [34]:
df_cat_ordinal_filled.isna().sum()

INTERSECTION_RELATED_I    0
NOT_RIGHT_OF_WAY_I        0
HIT_AND_RUN_I             0
EXCEED_SPEED_LIMIT_I      0
dtype: int64

In [35]:
for col in df_cat_ordinal_filled.columns:
    print(col)
    print(df_cat_ordinal_filled[col].value_counts())
    print('')

INTERSECTION_RELATED_I
N    152584
Y     58740
Name: INTERSECTION_RELATED_I, dtype: int64

NOT_RIGHT_OF_WAY_I
N    204366
Y      6958
Name: NOT_RIGHT_OF_WAY_I, dtype: int64

HIT_AND_RUN_I
N    163342
Y     47982
Name: HIT_AND_RUN_I, dtype: int64

EXCEED_SPEED_LIMIT_I
N    210917
Y       407
Name: EXCEED_SPEED_LIMIT_I, dtype: int64



### nominal

In [36]:
# list nulls in nominal
df_cat_nominal.isna().sum()

CRASH_DATE_x                 0
TRAFFIC_CONTROL_DEVICE       0
DEVICE_CONDITION             0
WEATHER_CONDITION            0
LIGHTING_CONDITION           0
FIRST_CRASH_TYPE             0
TRAFFICWAY_TYPE              0
ALIGNMENT                    0
ROADWAY_SURFACE_COND         0
ROAD_DEFECT                  0
CRASH_TYPE                   0
DAMAGE                       0
DATE_POLICE_NOTIFIED         0
MOST_SEVERE_INJURY         165
CRASH_DATE_y                 0
UNIT_TYPE                  298
MAKE                      4643
MODEL                     4669
VEHICLE_DEFECT            4642
VEHICLE_TYPE              4642
VEHICLE_USE               4642
MANEUVER                  4642
FIRST_CONTACT_POINT       5029
dtype: int64

In [37]:
# assign the columnsn in df_cat_nominal with nulls to df_cat_nominal_null
df_cat_nominal_null = df_cat_nominal[['FIRST_CONTACT_POINT',
                                      'MANEUVER',
                                      'VEHICLE_USE',
                                      'VEHICLE_TYPE',
                                      'VEHICLE_DEFECT',
                                      'MODEL',
                                      'MAKE',
                                      'UNIT_TYPE',
                                      'MOST_SEVERE_INJURY']]

# assign the rest to df_cat_nominal_nonull
df_cat_nominal_nonull = df_cat_nominal.drop(columns=['FIRST_CONTACT_POINT',
                                                    'MANEUVER',
                                                    'VEHICLE_USE',
                                                    'VEHICLE_TYPE',
                                                    'VEHICLE_DEFECT',
                                                    'MODEL',
                                                    'MAKE',
                                                    'UNIT_TYPE',
                                                    'MOST_SEVERE_INJURY'])


In [38]:
# check that the split worked
df_cat_nominal_null.isna().sum()

FIRST_CONTACT_POINT    5029
MANEUVER               4642
VEHICLE_USE            4642
VEHICLE_TYPE           4642
VEHICLE_DEFECT         4642
MODEL                  4669
MAKE                   4643
UNIT_TYPE               298
MOST_SEVERE_INJURY      165
dtype: int64

In [39]:
df_cat_nominal_nonull.isna().sum()

CRASH_DATE_x              0
TRAFFIC_CONTROL_DEVICE    0
DEVICE_CONDITION          0
WEATHER_CONDITION         0
LIGHTING_CONDITION        0
FIRST_CRASH_TYPE          0
TRAFFICWAY_TYPE           0
ALIGNMENT                 0
ROADWAY_SURFACE_COND      0
ROAD_DEFECT               0
CRASH_TYPE                0
DAMAGE                    0
DATE_POLICE_NOTIFIED      0
CRASH_DATE_y              0
dtype: int64

In [40]:
# how many columns are in df_cat_nominal_null?
print(len(df_cat_nominal_null.columns))

# how many columns are in df_cat_nominal_nonull?
print(len(df_cat_nominal_nonull.columns))

9
14


In [41]:
print(9 + 15)

24


In [42]:
# how many columsn were in df_cat_nominal?
print(len(df_cat_nominal.columns))

23


In [43]:
for col in df_cat_nominal_null.columns:
    print(col)
    print(df_cat_nominal_null[col].value_counts())
    print('')

FIRST_CONTACT_POINT
FRONT                 45193
REAR                  29586
SIDE-RIGHT            13504
SIDE-LEFT             13420
FRONT-LEFT            12721
FRONT-LEFT-CORNER     12234
FRONT-RIGHT           12106
FRONT-RIGHT-CORNER    11673
REAR-LEFT             10099
UNKNOWN                9445
OTHER                  6198
REAR-RIGHT             5372
REAR-LEFT-CORNER       4574
TOTAL (ALL AREAS)      4037
REAR-RIGHT-CORNER      3693
SIDE-LEFT-REAR         2798
SIDE-RIGHT-REAR        2209
SIDE-LEFT-FRONT        1819
SIDE-RIGHT-FRONT       1603
ROOF                   1590
NONE                   1535
UNDER CARRIAGE          700
TOP                     186
Name: FIRST_CONTACT_POINT, dtype: int64

MANEUVER
STRAIGHT AHEAD                        104377
PARKED                                 18734
SLOW/STOP IN TRAFFIC                   18695
TURNING LEFT                           14658
BACKING                                 9080
TURNING RIGHT                           7708
PASSING/OVERTAKI

In [44]:
'''Fill the following columns with 'UNKNOWN' in df_cat_nominal_null:
MOST_SEVERE_INJURY
UNIT_TYPE
MAKE
MODEL
VEHICLE_DEFECT
FIRST_CONTACT_POINT
'''

df_cat_nominal_null['MOST_SEVERE_INJURY'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['UNIT_TYPE'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['MAKE'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['MODEL'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['VEHICLE_DEFECT'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['FIRST_CONTACT_POINT'].fillna('UNKNOWN', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_nominal_null['MOST_SEVERE_INJURY'].fillna('UNKNOWN', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_nominal_null['UNIT_TYPE'].fillna('UNKNOWN', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_nominal_null['MAKE'].fillna('UNKNOWN', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

In [45]:
'''Fill the following columns with 'UNKNOWN/NA' in df_cat_nominal_null:
VEHICLE_TYPE
VEHICLE_USE
MANUEVER
'''

df_cat_nominal_null['VEHICLE_TYPE'].fillna('UNKNOWN/NA', inplace=True)
df_cat_nominal_null['VEHICLE_USE'].fillna('UNKNOWN/NA', inplace=True)
df_cat_nominal_null['MANEUVER'].fillna('UNKNOWN/NA', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_nominal_null['VEHICLE_TYPE'].fillna('UNKNOWN/NA', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_nominal_null['VEHICLE_USE'].fillna('UNKNOWN/NA', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_nominal_null['MANEUVER'].fillna('UNKNOWN/NA', inplace=True)


## Combining nominal and ordinal again

modifying this because the ordinal columsn don't seem to contribute anything

In [46]:
# combine df_cat_nominal_null and df_cat_nominal_nonull into df_cat_nominal_filled
df_cat_nominal_filled = pd.concat([df_cat_nominal_null, df_cat_nominal_nonull], axis=1)

# combine df_cat_nominal_filled and df_cat_ordinal_filled into df_cat_filled
df_cat_filled = pd.concat([df_cat_nominal_filled, df_cat_ordinal_filled], axis=1)

## Combining everything again

In [47]:
# Combine df_cat_filled and df_num into df_clean

df_clean = pd.concat([df_num, df_cat_filled], axis=1)

In [48]:
df_clean.head().T

Unnamed: 0,0,1,2,3,4
POSTED_SPEED_LIMIT,25,20,30,30,30
STREET_NO,33,4300,2198,2800,7739
NUM_UNITS,2,2,1,2,2
INJURIES_TOTAL,0.0,0.0,0.0,0.0,0.0
INJURIES_FATAL,0.0,0.0,0.0,0.0,0.0
INJURIES_INCAPACITATING,0.0,0.0,0.0,0.0,0.0
INJURIES_NON_INCAPACITATING,0.0,0.0,0.0,0.0,0.0
INJURIES_REPORTED_NOT_EVIDENT,0.0,0.0,0.0,0.0,0.0
INJURIES_NO_INDICATION,1.0,2.0,1.0,2.0,2.0
INJURIES_UNKNOWN,0.0,0.0,0.0,0.0,0.0


# Exporting the cleaned data

In [49]:
# export df_clean to csv
df_clean.to_csv('data/df_clean.csv')

In [50]:
df_clean.shape

(211324, 47)

In [51]:
df_clean.isna().sum()

POSTED_SPEED_LIMIT               0
STREET_NO                        0
NUM_UNITS                        0
INJURIES_TOTAL                   0
INJURIES_FATAL                   0
INJURIES_INCAPACITATING          0
INJURIES_NON_INCAPACITATING      0
INJURIES_REPORTED_NOT_EVIDENT    0
INJURIES_NO_INDICATION           0
INJURIES_UNKNOWN                 0
CRASH_HOUR                       0
CRASH_DAY_OF_WEEK                0
CRASH_MONTH                      0
LATITUDE                         0
LONGITUDE                        0
CRASH_UNIT_ID                    0
UNIT_NO                          0
NUM_PASSENGERS                   0
VEHICLE_YEAR                     0
OCCUPANT_CNT                     0
FIRST_CONTACT_POINT              0
MANEUVER                         0
VEHICLE_USE                      0
VEHICLE_TYPE                     0
VEHICLE_DEFECT                   0
MODEL                            0
MAKE                             0
UNIT_TYPE                        0
MOST_SEVERE_INJURY  