# Data Cleaning

## Data Sources

[crashes](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if)

[vehicles](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Vehicles/68nd-jvt3)

[people](https://data.cityofchicago.org/Transportation/Traffic-Crashes-People/u6pd-qa9d)

## Libraries

In [564]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [565]:
# imort 'data/merged.csv'
df = pd.read_csv('data/merged.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Initial Drop

In [566]:
'''
drom from df 
CMV_ID
USDOT_NO
CCMC_NO
ILCC_NO
COMMERCIAL_SRC
GVWR
CARRIER_NAME
CARRIER_STATE
CARRIER_CITY
HAZMAT_PLACARDS_I
HAZMAT_NAME
UN_NO
HAZMAT_PRESENT_I
HAZMAT_REPORT_I
HAZMAT_REPORT_NO
MCS_REPORT_I
MCS_REPORT_NO
HAZMAT_VIO_CAUSE_CRASH_I
MCS_VIO_CAUSE_CRASH_I
IDOT_PERMIT_NO
WIDE_LOAD_I
TRAILER1_WIDTH
TRAILER2_WIDTH
TRAILER1_LENGTH
TRAILER2_LENGTH
TOTAL_VEHICLE_LENGTH
AXLE_CNT
VEHICLE_CONFIG
CARGO_BODY_TYPE
LOAD_TYPE
HAZMAT_OUT_OF_SERVICE_I
MCS_OUT_OF_SERVICE_I
HAZMAT_CLASS
PHOTOS_TAKEN_I
STATEMENTS_TAKEN_I
DOORING_I
WORK_ZONE_I
WORK_ZONE_TYPE
WORKERS_PRESENT_I
TOWED_I
FIRE_I
TOWED_BY
TOWED_TO
AREA_00_I
AREA_01_I
AREA_02_I
AREA_03_I
AREA_04_I
AREA_05_I
AREA_06_I
AREA_07_I
AREA_08_I
AREA_09_I
AREA_10_I
AREA_11_I
AREA_12_I
AREA_99_I
'''

df.drop(columns=['CMV_ID',
                 'USDOT_NO',
                 'CCMC_NO',
                 'ILCC_NO',
                 'COMMERCIAL_SRC',
                 'GVWR',
                 'CARRIER_NAME',
                 'CARRIER_STATE',
                 'CARRIER_CITY',
                 'HAZMAT_PLACARDS_I',
                 'HAZMAT_NAME',
                 'UN_NO',
                 'HAZMAT_PRESENT_I',
                 'HAZMAT_REPORT_I',
                 'HAZMAT_REPORT_NO',
                 'MCS_REPORT_I',
                 'MCS_REPORT_NO',
                 'HAZMAT_VIO_CAUSE_CRASH_I',
                 'MCS_VIO_CAUSE_CRASH_I',
                 'IDOT_PERMIT_NO',
                 'WIDE_LOAD_I',
                 'TRAILER1_WIDTH',
                 'TRAILER2_WIDTH',
                 'TRAILER1_LENGTH',
                 'TRAILER2_LENGTH',
                 'TOTAL_VEHICLE_LENGTH',
                 'AXLE_CNT',
                 'VEHICLE_CONFIG',
                 'CARGO_BODY_TYPE',
                 'LOAD_TYPE',
                 'HAZMAT_OUT_OF_SERVICE_I',
                 'MCS_OUT_OF_SERVICE_I',
                 'HAZMAT_CLASS',
                 
                 'PHOTOS_TAKEN_I',
                 'STATEMENTS_TAKEN_I',
                 'DOORING_I',
                 'WORK_ZONE_I',
                 'WORK_ZONE_TYPE',
                 'WORKERS_PRESENT_I',
                 
                 'TOWED_I',
                 'FIRE_I',
                 
                 'TOWED_BY',
                 'TOWED_TO',
                 'AREA_00_I',
                 'AREA_01_I',
                 'AREA_02_I',
                 'AREA_03_I',
                 'AREA_04_I',
                 'AREA_05_I',
                 'AREA_06_I',
                 'AREA_07_I',
                 'AREA_08_I',
                 'AREA_09_I',
                 'AREA_10_I',
                 'AREA_11_I',
                 'AREA_12_I',
                 'AREA_99_I',
                 
                 'BEAT_OF_OCCURRENCE',
                 'VEHICLE_ID',
                 'LANE_CNT',
                 ], inplace=True)

In [567]:
df.shape

(1491794, 60)

In [568]:
# remove all rows where PRIM_CONTRIBUTORY_CAUSE is UNABLE TO DETERMINE and NOT APPLICABLE
df = df[df['PRIM_CONTRIBUTORY_CAUSE'] != 'NOT APPLICABLE']
df = df[df['PRIM_CONTRIBUTORY_CAUSE'] != 'UNABLE TO DETERMINE']



In [569]:
# check
df['PRIM_CONTRIBUTORY_CAUSE'].value_counts()

FAILING TO YIELD RIGHT-OF-WAY                                                       164915
FOLLOWING TOO CLOSELY                                                               151936
IMPROPER OVERTAKING/PASSING                                                          73063
FAILING TO REDUCE SPEED TO AVOID CRASH                                               66695
IMPROPER BACKING                                                                     58478
IMPROPER LANE USAGE                                                                  54856
IMPROPER TURNING/NO SIGNAL                                                           49008
DRIVING SKILLS/KNOWLEDGE/EXPERIENCE                                                  47434
DISREGARDING TRAFFIC SIGNALS                                                         30303
WEATHER                                                                              21949
OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER     20053

In [570]:
df.shape

(845294, 60)

In [571]:
df.head().T

Unnamed: 0,0,1,2,3,6
CRASH_RECORD_ID,79c7a2ce89f446262efd86df3d72d18b04ba487024b7c4...,79c7a2ce89f446262efd86df3d72d18b04ba487024b7c4...,792b539deaaad65ee5b4a9691d927a34d298eb33d42af0...,792b539deaaad65ee5b4a9691d927a34d298eb33d42af0...,05b1982cdba5d8a00e7e76ad1ecdab0e598429f78481d2...
RD_NO_x,JC199149,JC199149,JB422857,JB422857,JF378711
CRASH_DATE_EST_I,,,,,
CRASH_DATE_x,03/25/2019 02:43:00 PM,03/25/2019 02:43:00 PM,09/05/2018 08:40:00 AM,09/05/2018 08:40:00 AM,08/29/2022 11:30:00 AM
POSTED_SPEED_LIMIT,30,30,30,30,30
TRAFFIC_CONTROL_DEVICE,TRAFFIC SIGNAL,TRAFFIC SIGNAL,NO CONTROLS,NO CONTROLS,TRAFFIC SIGNAL
DEVICE_CONDITION,FUNCTIONING PROPERLY,FUNCTIONING PROPERLY,NO CONTROLS,NO CONTROLS,FUNCTIONING PROPERLY
WEATHER_CONDITION,CLEAR,CLEAR,CLEAR,CLEAR,CLEAR
LIGHTING_CONDITION,DAYLIGHT,DAYLIGHT,DAYLIGHT,DAYLIGHT,DAYLIGHT
FIRST_CRASH_TYPE,TURNING,TURNING,ANGLE,ANGLE,REAR END


# fixme I had to move this code to later, and it's redundant, so I just removed the dataframe from being merged back later

In [572]:
# '''Also drop the following columns:
# EXCEED_SPEED_LIMIT_I
# HIT_AND_RUN_I
# NOT_RIGHT_OF_WAY_I
# INTERSECTION_RELATED_I	
# '''

# df.drop(columns=['EXCEED_SPEED_LIMIT_I',
#                  'HIT_AND_RUN_I',
#                  'NOT_RIGHT_OF_WAY_I',
#                  'INTERSECTION_RELATED_I',
#                  ], inplace=True)

# df.shape

## Chopping down the data

There's just too much

Do after the initial clean to get a better sample. 

In [573]:
df.shape

(845294, 60)

In [574]:
df = df.sample(frac=0.01, random_state=42)

In [575]:
df.shape

(8453, 60)

In [576]:
# Define the target groups
target_groups = {
    'Failures to Follow Traffic Rules': [
        'FAILING TO YIELD RIGHT-OF-WAY',
        'FOLLOWING TOO CLOSELY',
        'IMPROPER OVERTAKING/PASSING',
        'FAILING TO REDUCE SPEED TO AVOID CRASH',
        'IMPROPER BACKING',
        'IMPROPER LANE USAGE',
        'IMPROPER TURNING/NO SIGNAL',
        'DISREGARDING TRAFFIC SIGNALS',
        'DISREGARDING STOP SIGN',
        'DISREGARDING OTHER TRAFFIC SIGNS',
        'DISREGARDING YIELD SIGN',
        'EXCEEDING SAFE SPEED FOR CONDITIONS', 
        'DRIVING ON WRONG SIDE/WRONG WAY', 
        'EXCEEDING AUTHORIZED SPEED LIMIT',
        'TURNING RIGHT ON RED', 
        'DISREGARDING ROAD MARKINGS'
    ],
    'Driver Behavior and Awareness': [
        'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE',
        'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER',
        'DISTRACTION - FROM INSIDE VEHICLE',
        'PHYSICAL CONDITION OF DRIVER',
        'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)',
        'DISTRACTION - FROM OUTSIDE VEHICLE',
        'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)',
        'CELL PHONE USE OTHER THAN TEXTING',
        'TEXTING', 
        'DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)'
    ],
    'Environmental Factors': [
        'WEATHER',
        'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)',
        'ROAD CONSTRUCTION/MAINTENANCE',
        'ROAD ENGINEERING/SURFACE/MARKING DEFECTS'
    ],
    'Vehicle-related Issues': [
        'EQUIPMENT - VEHICLE CONDITION'
    ],
    'Other Specific Circumstances': [
        'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST',
        'ANIMAL',
        'RELATED TO BUS STOP',
        'BICYCLE ADVANCING LEGALLY ON RED LIGHT',
        'OBSTRUCTED CROSSWALKS',
        'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT',
        'PASSING STOPPED SCHOOL BUS'
    ]
}

# Group the targets based on the defined groups
for group, targets in target_groups.items():
    df.loc[df['PRIM_CONTRIBUTORY_CAUSE'].isin(targets), 'PRIM_CONTRIBUTORY_CAUSE'] = group

# Cut out the df_targets with the grouped targets
df_targets = df[['PRIM_CONTRIBUTORY_CAUSE']]

# Print the unique grouped targets
print(df_targets['PRIM_CONTRIBUTORY_CAUSE'].unique())


['Driver Behavior and Awareness' 'Failures to Follow Traffic Rules'
 'Vehicle-related Issues' 'Other Specific Circumstances'
 'Environmental Factors']


## Cleaning Leftover Columns

### Split Into categorical and numerical

Now that they have been merged on CRASH_RECORD_ID, I'm going to remove that column, and create a new index

In [577]:
df = df.drop('CRASH_RECORD_ID', axis=1)

In [578]:
df = df.reset_index(drop=True)

In [579]:
index = df.index

In [580]:
print(index)

RangeIndex(start=0, stop=8453, step=1)


In [581]:
# split df into categorical and numerical dataframes
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [582]:
# df_cat = categorical_data.reset_index(drop=True)
# df_num = numerical_data.reset_index(drop=True)

#### Numerical Cleaning

Okay so everything that is left over with the numerical columns should be able to be filled with 0.

In [583]:
# check df_num for nulls
df_num.isnull().sum()

POSTED_SPEED_LIMIT                  0
STREET_NO                           0
NUM_UNITS                           0
INJURIES_TOTAL                      1
INJURIES_FATAL                      1
INJURIES_INCAPACITATING             1
INJURIES_NON_INCAPACITATING         1
INJURIES_REPORTED_NOT_EVIDENT       1
INJURIES_NO_INDICATION              1
INJURIES_UNKNOWN                    1
CRASH_HOUR                          0
CRASH_DAY_OF_WEEK                   0
CRASH_MONTH                         0
LATITUDE                           54
LONGITUDE                          54
CRASH_UNIT_ID                       0
UNIT_NO                             0
NUM_PASSENGERS                   6997
VEHICLE_YEAR                     1142
OCCUPANT_CNT                      180
dtype: int64

In [584]:
# fill all missing values in df_num with 0
df_num.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


#### categorical cleaning

In [585]:
df_cat.isna().sum()

RD_NO_x                      53
CRASH_DATE_EST_I           8029
CRASH_DATE_x                  0
TRAFFIC_CONTROL_DEVICE        0
DEVICE_CONDITION              0
WEATHER_CONDITION             0
LIGHTING_CONDITION            0
FIRST_CRASH_TYPE              0
TRAFFICWAY_TYPE               0
ALIGNMENT                     0
ROADWAY_SURFACE_COND          0
ROAD_DEFECT                   0
REPORT_TYPE                 277
CRASH_TYPE                    0
INTERSECTION_RELATED_I     6016
NOT_RIGHT_OF_WAY_I         8152
HIT_AND_RUN_I              6456
DAMAGE                        0
DATE_POLICE_NOTIFIED          0
PRIM_CONTRIBUTORY_CAUSE       0
SEC_CONTRIBUTORY_CAUSE        0
STREET_DIRECTION              0
STREET_NAME                   0
MOST_SEVERE_INJURY            1
LOCATION                     54
RD_NO_y                      57
CRASH_DATE_y                  0
UNIT_TYPE                    13
CMRC_VEH_I                 8266
MAKE                        180
MODEL                       180
LIC_PLAT

In [586]:
# print out the value_counts for every column in df_cat
for col in df_cat.columns:
    print(col)
    print(df_cat[col].value_counts())
    print('')

RD_NO_x
JA249116    2
JB518075    2
JA288295    2
JA506271    2
JB165417    2
           ..
JA441715    1
JC251655    1
JE375415    1
JF132628    1
JC129554    1
Name: RD_NO_x, Length: 8362, dtype: int64

CRASH_DATE_EST_I
Y    362
N     62
Name: CRASH_DATE_EST_I, dtype: int64

CRASH_DATE_x
03/17/2019 04:15:00 PM    3
11/15/2019 07:30:00 AM    2
05/03/2019 05:10:00 PM    2
06/01/2017 02:00:00 PM    2
04/12/2023 08:00:00 AM    2
                         ..
02/28/2018 03:00:00 PM    1
03/15/2023 08:00:00 AM    1
01/07/2018 05:00:00 PM    1
06/25/2020 01:14:00 PM    1
06/04/2021 05:20:00 PM    1
Name: CRASH_DATE_x, Length: 8353, dtype: int64

TRAFFIC_CONTROL_DEVICE
NO CONTROLS                 4314
TRAFFIC SIGNAL              2842
STOP SIGN/FLASHER           1015
UNKNOWN                      156
OTHER                         55
LANE USE MARKING              15
OTHER REG. SIGN               10
YIELD                         10
RAILROAD CROSSING GATE         8
SCHOOL ZONE                    6


In [587]:
df_cat_dropped_cols = df_cat.drop(columns=['RD_NO_x',
                'CRASH_DATE_EST_I', 
                'REPORT_TYPE', 
                'STREET_DIRECTION',
                'STREET_NAME',
                'LOCATION', 
                'RD_NO_y', 
                'CMRC_VEH_I', 
                'LIC_PLATE_STATE', 
                'TRAVEL_DIRECTION'
                 ])

In [588]:
df_cat_dropped_cols.isna().sum()

CRASH_DATE_x                  0
TRAFFIC_CONTROL_DEVICE        0
DEVICE_CONDITION              0
WEATHER_CONDITION             0
LIGHTING_CONDITION            0
FIRST_CRASH_TYPE              0
TRAFFICWAY_TYPE               0
ALIGNMENT                     0
ROADWAY_SURFACE_COND          0
ROAD_DEFECT                   0
CRASH_TYPE                    0
INTERSECTION_RELATED_I     6016
NOT_RIGHT_OF_WAY_I         8152
HIT_AND_RUN_I              6456
DAMAGE                        0
DATE_POLICE_NOTIFIED          0
PRIM_CONTRIBUTORY_CAUSE       0
SEC_CONTRIBUTORY_CAUSE        0
MOST_SEVERE_INJURY            1
CRASH_DATE_y                  0
UNIT_TYPE                    13
MAKE                        180
MODEL                       180
VEHICLE_DEFECT              180
VEHICLE_TYPE                180
VEHICLE_USE                 180
MANEUVER                    180
EXCEED_SPEED_LIMIT_I       8430
FIRST_CONTACT_POINT         199
dtype: int64

## Getting target variable

In [589]:
'''
remove
PRIM_CONTRIBUTORY_CAUSE
and
SEC_CONTRIBUTORY_CAUSE
from df_cat_dropped_cols
assign it to it's own dataframe called df_targets,
and then export df_targets to csv
'''

# Specifically I'm going to make prim cause the target and remove secondary, maybe I'll come back to it if I have time.
df_targets = df_cat_dropped_cols[['PRIM_CONTRIBUTORY_CAUSE']]
df_cat_dropped_cols.drop(columns=['PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE'], inplace=True)
df_targets.to_csv('data/df_targets.csv')

In [590]:
df_targets.isna().sum()

PRIM_CONTRIBUTORY_CAUSE    0
dtype: int64

In [591]:
df_targets.dtypes

PRIM_CONTRIBUTORY_CAUSE    object
dtype: object

In [592]:
df_targets.value_counts()

# give me the value counts for each column in df_targets
for col in df_targets.columns:
    print(col)
    print(df_targets[col].value_counts())
    print('')

PRIM_CONTRIBUTORY_CAUSE
Failures to Follow Traffic Rules    6920
Driver Behavior and Awareness       1049
Environmental Factors                316
Vehicle-related Issues               108
Other Specific Circumstances          60
Name: PRIM_CONTRIBUTORY_CAUSE, dtype: int64



## splitting into ordinal and nominal

In [593]:
'''
assign to df_cat_ordinal 
INTERSECTION_RELATED_I
NOT_RIGHT_OF_WAY_I
HIT_AND_RUN_I
and EXCEED_SPEED_LIMIT_I
'''
df_cat_ordinal = df_cat_dropped_cols[['INTERSECTION_RELATED_I',
                                        'NOT_RIGHT_OF_WAY_I',
                                        'HIT_AND_RUN_I',
                                        'EXCEED_SPEED_LIMIT_I'
                                        ]]

# assign the rest to nominal
df_cat_nominal = df_cat_dropped_cols.drop(columns=['INTERSECTION_RELATED_I',
                                        'NOT_RIGHT_OF_WAY_I',
                                        'HIT_AND_RUN_I',
                                        'EXCEED_SPEED_LIMIT_I'
                                        ])

### ordinal

In [594]:
# list nulls in ordinal
df_cat_ordinal.isna().sum()

INTERSECTION_RELATED_I    6016
NOT_RIGHT_OF_WAY_I        8152
HIT_AND_RUN_I             6456
EXCEED_SPEED_LIMIT_I      8430
dtype: int64

In [595]:
for col in df_cat_ordinal.columns:
    print(col)
    print(df_cat_ordinal[col].value_counts())
    print('')

INTERSECTION_RELATED_I
Y    2327
N     110
Name: INTERSECTION_RELATED_I, dtype: int64

NOT_RIGHT_OF_WAY_I
Y    274
N     27
Name: NOT_RIGHT_OF_WAY_I, dtype: int64

HIT_AND_RUN_I
Y    1884
N     113
Name: HIT_AND_RUN_I, dtype: int64

EXCEED_SPEED_LIMIT_I
Y    18
N     5
Name: EXCEED_SPEED_LIMIT_I, dtype: int64



In [596]:
# fill all nulls in ordinal with 'N'
df_cat_ordinal_filled = df_cat_ordinal.fillna('N')

In [597]:
df_cat_ordinal_filled.isna().sum()

INTERSECTION_RELATED_I    0
NOT_RIGHT_OF_WAY_I        0
HIT_AND_RUN_I             0
EXCEED_SPEED_LIMIT_I      0
dtype: int64

In [598]:
for col in df_cat_ordinal_filled.columns:
    print(col)
    print(df_cat_ordinal_filled[col].value_counts())
    print('')

INTERSECTION_RELATED_I
N    6126
Y    2327
Name: INTERSECTION_RELATED_I, dtype: int64

NOT_RIGHT_OF_WAY_I
N    8179
Y     274
Name: NOT_RIGHT_OF_WAY_I, dtype: int64

HIT_AND_RUN_I
N    6569
Y    1884
Name: HIT_AND_RUN_I, dtype: int64

EXCEED_SPEED_LIMIT_I
N    8435
Y      18
Name: EXCEED_SPEED_LIMIT_I, dtype: int64



### nominal

In [599]:
# list nulls in nominal
df_cat_nominal.isna().sum()

CRASH_DATE_x                0
TRAFFIC_CONTROL_DEVICE      0
DEVICE_CONDITION            0
WEATHER_CONDITION           0
LIGHTING_CONDITION          0
FIRST_CRASH_TYPE            0
TRAFFICWAY_TYPE             0
ALIGNMENT                   0
ROADWAY_SURFACE_COND        0
ROAD_DEFECT                 0
CRASH_TYPE                  0
DAMAGE                      0
DATE_POLICE_NOTIFIED        0
MOST_SEVERE_INJURY          1
CRASH_DATE_y                0
UNIT_TYPE                  13
MAKE                      180
MODEL                     180
VEHICLE_DEFECT            180
VEHICLE_TYPE              180
VEHICLE_USE               180
MANEUVER                  180
FIRST_CONTACT_POINT       199
dtype: int64

In [600]:
# assign the columnsn in df_cat_nominal with nulls to df_cat_nominal_null
df_cat_nominal_null = df_cat_nominal[['FIRST_CONTACT_POINT',
                                      'MANEUVER',
                                      'VEHICLE_USE',
                                      'VEHICLE_TYPE',
                                      'VEHICLE_DEFECT',
                                      'MODEL',
                                      'MAKE',
                                      'UNIT_TYPE',
                                      'MOST_SEVERE_INJURY']]

# assign the rest to df_cat_nominal_nonull
df_cat_nominal_nonull = df_cat_nominal.drop(columns=['FIRST_CONTACT_POINT',
                                                    'MANEUVER',
                                                    'VEHICLE_USE',
                                                    'VEHICLE_TYPE',
                                                    'VEHICLE_DEFECT',
                                                    'MODEL',
                                                    'MAKE',
                                                    'UNIT_TYPE',
                                                    'MOST_SEVERE_INJURY'])


In [601]:
# check that the split worked
df_cat_nominal_null.isna().sum()

FIRST_CONTACT_POINT    199
MANEUVER               180
VEHICLE_USE            180
VEHICLE_TYPE           180
VEHICLE_DEFECT         180
MODEL                  180
MAKE                   180
UNIT_TYPE               13
MOST_SEVERE_INJURY       1
dtype: int64

In [602]:
df_cat_nominal_nonull.isna().sum()

CRASH_DATE_x              0
TRAFFIC_CONTROL_DEVICE    0
DEVICE_CONDITION          0
WEATHER_CONDITION         0
LIGHTING_CONDITION        0
FIRST_CRASH_TYPE          0
TRAFFICWAY_TYPE           0
ALIGNMENT                 0
ROADWAY_SURFACE_COND      0
ROAD_DEFECT               0
CRASH_TYPE                0
DAMAGE                    0
DATE_POLICE_NOTIFIED      0
CRASH_DATE_y              0
dtype: int64

In [603]:
# how many columns are in df_cat_nominal_null?
print(len(df_cat_nominal_null.columns))

# how many columns are in df_cat_nominal_nonull?
print(len(df_cat_nominal_nonull.columns))

9
14


In [604]:
print(9 + 15)

24


In [605]:
# how many columsn were in df_cat_nominal?
print(len(df_cat_nominal.columns))

23


In [606]:
for col in df_cat_nominal_null.columns:
    print(col)
    print(df_cat_nominal_null[col].value_counts())
    print('')

FIRST_CONTACT_POINT
FRONT                 1816
REAR                  1204
FRONT-LEFT             564
SIDE-LEFT              541
SIDE-RIGHT             535
FRONT-LEFT-CORNER      493
FRONT-RIGHT            484
FRONT-RIGHT-CORNER     462
REAR-LEFT              392
UNKNOWN                357
OTHER                  254
REAR-RIGHT             195
REAR-LEFT-CORNER       177
REAR-RIGHT-CORNER      163
TOTAL (ALL AREAS)      153
SIDE-LEFT-REAR          99
SIDE-RIGHT-REAR         88
SIDE-RIGHT-FRONT        66
ROOF                    64
SIDE-LEFT-FRONT         62
NONE                    51
UNDER CARRIAGE          28
TOP                      6
Name: FIRST_CONTACT_POINT, dtype: int64

MANEUVER
STRAIGHT AHEAD                        4166
PARKED                                 771
SLOW/STOP IN TRAFFIC                   725
TURNING LEFT                           613
BACKING                                360
TURNING RIGHT                          298
PASSING/OVERTAKING                     267
UNKNOWN/

In [607]:
'''Fill the following columns with 'UNKNOWN' in df_cat_nominal_null:
MOST_SEVERE_INJURY
UNIT_TYPE
MAKE
MODEL
VEHICLE_DEFECT
FIRST_CONTACT_POINT
'''

df_cat_nominal_null['MOST_SEVERE_INJURY'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['UNIT_TYPE'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['MAKE'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['MODEL'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['VEHICLE_DEFECT'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['FIRST_CONTACT_POINT'].fillna('UNKNOWN', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [608]:
'''Fill the following columns with 'UNKNOWN/NA' in df_cat_nominal_null:
VEHICLE_TYPE
VEHICLE_USE
MANUEVER
'''

df_cat_nominal_null['VEHICLE_TYPE'].fillna('UNKNOWN/NA', inplace=True)
df_cat_nominal_null['VEHICLE_USE'].fillna('UNKNOWN/NA', inplace=True)
df_cat_nominal_null['MANEUVER'].fillna('UNKNOWN/NA', inplace=True)

## Combining nominal and ordinal again

modifying this because the ordinal columsn don't seem to contribute anything

In [609]:
# combine df_cat_nominal_null and df_cat_nominal_nonull into df_cat_nominal_filled
df_cat_nominal_filled = pd.concat([df_cat_nominal_null, df_cat_nominal_nonull], axis=1)

# combine df_cat_nominal_filled and df_cat_ordinal_filled into df_cat_filled
# df_cat_filled = pd.concat([df_cat_nominal_filled, df_cat_ordinal_filled], axis=1)

## Combining everything again

In [610]:
# Combine df_cat_filled and df_num into df_clean

df_clean = pd.concat([df_num, df_cat_nominal_filled], axis=1)

In [611]:
df_clean.head().T

Unnamed: 0,0,1,2,3,4
POSTED_SPEED_LIMIT,25,20,30,30,30
STREET_NO,33,4300,2198,2800,7739
NUM_UNITS,2,2,1,2,2
INJURIES_TOTAL,0,0,0,0,0
INJURIES_FATAL,0,0,0,0,0
INJURIES_INCAPACITATING,0,0,0,0,0
INJURIES_NON_INCAPACITATING,0,0,0,0,0
INJURIES_REPORTED_NOT_EVIDENT,0,0,0,0,0
INJURIES_NO_INDICATION,1,2,1,2,2
INJURIES_UNKNOWN,0,0,0,0,0


# Exporting the cleaned data

In [612]:
# export df_clean to csv
df_clean.to_csv('data/df_clean.csv')

In [613]:
df_clean.shape

(8453, 43)

In [614]:
df_clean.isna().sum()

POSTED_SPEED_LIMIT               0
STREET_NO                        0
NUM_UNITS                        0
INJURIES_TOTAL                   0
INJURIES_FATAL                   0
INJURIES_INCAPACITATING          0
INJURIES_NON_INCAPACITATING      0
INJURIES_REPORTED_NOT_EVIDENT    0
INJURIES_NO_INDICATION           0
INJURIES_UNKNOWN                 0
CRASH_HOUR                       0
CRASH_DAY_OF_WEEK                0
CRASH_MONTH                      0
LATITUDE                         0
LONGITUDE                        0
CRASH_UNIT_ID                    0
UNIT_NO                          0
NUM_PASSENGERS                   0
VEHICLE_YEAR                     0
OCCUPANT_CNT                     0
FIRST_CONTACT_POINT              0
MANEUVER                         0
VEHICLE_USE                      0
VEHICLE_TYPE                     0
VEHICLE_DEFECT                   0
MODEL                            0
MAKE                             0
UNIT_TYPE                        0
MOST_SEVERE_INJURY  