# Data Cleaning

## Data Sources

[crashes](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if)

[vehicles](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Vehicles/68nd-jvt3)

[people](https://data.cityofchicago.org/Transportation/Traffic-Crashes-People/u6pd-qa9d)

## Libraries

In [201]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer


## Importing The Data

In [202]:
d_crashes = pd.read_csv('data/Traffic_Crashes_-_Crashes.csv')
d_vehicles = pd.read_csv('data/Traffic_Crashes_-_Vehicles.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Look at the dimensions of the matrices

In [203]:
d_crashes.shape

(732329, 49)

In [204]:
d_crashes.head()

Unnamed: 0,CRASH_RECORD_ID,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,79c7a2ce89f446262efd86df3d72d18b04ba487024b7c4...,JC199149,,03/25/2019 02:43:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,0.0,1.0,2.0,0.0,14,2,3,41.884547,-87.641201,POINT (-87.64120093714 41.884547224337)
1,792b539deaaad65ee5b4a9691d927a34d298eb33d42af0...,JB422857,,09/05/2018 08:40:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,...,0.0,0.0,2.0,0.0,8,4,9,41.968562,-87.740659,POINT (-87.740659314632 41.968562453871)
2,0115ade9a755e835255508463f7e9c4a9a0b47e9304238...,JF318029,,07/15/2022 12:45:00 AM,30,UNKNOWN,UNKNOWN,CLEAR,"DARKNESS, LIGHTED ROAD",ANGLE,...,0.0,0.0,2.0,0.0,0,6,7,41.886336,-87.716203,POINT (-87.716203130599 41.886336409761)
3,05b1982cdba5d8a00e7e76ad1ecdab0e598429f78481d2...,JF378711,,08/29/2022 11:30:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,3.0,0.0,11,2,8,41.749348,-87.721097,POINT (-87.721096727406 41.749348170421)
4,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,JF324552,,07/15/2022 06:50:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,2.0,0.0,18,6,7,41.925111,-87.667997,POINT (-87.667997321599 41.925110815832)


In [205]:
d_vehicles.shape

(1491796, 72)

In [206]:
d_vehicles.head()

Unnamed: 0,CRASH_UNIT_ID,CRASH_RECORD_ID,RD_NO,CRASH_DATE,UNIT_NO,UNIT_TYPE,NUM_PASSENGERS,VEHICLE_ID,CMRC_VEH_I,MAKE,...,TRAILER1_LENGTH,TRAILER2_LENGTH,TOTAL_VEHICLE_LENGTH,AXLE_CNT,VEHICLE_CONFIG,CARGO_BODY_TYPE,LOAD_TYPE,HAZMAT_OUT_OF_SERVICE_I,MCS_OUT_OF_SERVICE_I,HAZMAT_CLASS
0,1554880,91a5d08b2b701f2d37cbb52ecdbeb09579bc7f2ebc60b3...,JG223284,04/14/2023 02:05:00 PM,1,DRIVER,,1478881.0,,FORD,...,,,,,,,,,,
1,749947,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,JC451435,09/28/2019 03:30:00 AM,1,DRIVER,,834816.0,,HONDA,...,,,,,,,,,,
2,749949,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,JC451435,09/28/2019 03:30:00 AM,2,PARKED,,834819.0,,TOYOTA,...,,,,,,,,,,
3,749950,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,JC451435,09/28/2019 03:30:00 AM,3,PARKED,,834817.0,,GENERAL MOTORS CORPORATION (GMC),...,,,,,,,,,,
4,1554881,91a5d08b2b701f2d37cbb52ecdbeb09579bc7f2ebc60b3...,JG223284,04/14/2023 02:05:00 PM,2,DRIVER,1.0,1478892.0,,ISUZU,...,,,,,,,,,,


### Merging The Frames

In [207]:
# Merge d_crashes and d_vehicles on CRASH_RECORD_ID
df = pd.merge(d_crashes, d_vehicles, on='CRASH_RECORD_ID')

In [208]:
df.shape

(1491794, 120)

In [209]:
df.head()

Unnamed: 0,CRASH_RECORD_ID,RD_NO_x,CRASH_DATE_EST_I,CRASH_DATE_x,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,...,TRAILER1_LENGTH,TRAILER2_LENGTH,TOTAL_VEHICLE_LENGTH,AXLE_CNT,VEHICLE_CONFIG,CARGO_BODY_TYPE,LOAD_TYPE,HAZMAT_OUT_OF_SERVICE_I,MCS_OUT_OF_SERVICE_I,HAZMAT_CLASS
0,79c7a2ce89f446262efd86df3d72d18b04ba487024b7c4...,JC199149,,03/25/2019 02:43:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,,,,,,,,,,
1,79c7a2ce89f446262efd86df3d72d18b04ba487024b7c4...,JC199149,,03/25/2019 02:43:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,,,,,,,,,,
2,792b539deaaad65ee5b4a9691d927a34d298eb33d42af0...,JB422857,,09/05/2018 08:40:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,...,,,,,,,,,,
3,792b539deaaad65ee5b4a9691d927a34d298eb33d42af0...,JB422857,,09/05/2018 08:40:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,...,,,,,,,,,,
4,0115ade9a755e835255508463f7e9c4a9a0b47e9304238...,JF318029,,07/15/2022 12:45:00 AM,30,UNKNOWN,UNKNOWN,CLEAR,"DARKNESS, LIGHTED ROAD",ANGLE,...,,,,,,,,,,


## Chopping down the data

There's just too much

In [210]:
df = df.sample(frac=0.5, random_state=42)

## Check dimensions

In [211]:
print(d_crashes.shape[1] + d_vehicles.shape[1])

121


## Initial Drop

In [212]:
'''
drom from df 
CMV_ID
USDOT_NO
CCMC_NO
ILCC_NO
COMMERCIAL_SRC
GVWR
CARRIER_NAME
CARRIER_STATE
CARRIER_CITY
HAZMAT_PLACARDS_I
HAZMAT_NAME
UN_NO
HAZMAT_PRESENT_I
HAZMAT_REPORT_I
HAZMAT_REPORT_NO
MCS_REPORT_I
MCS_REPORT_NO
HAZMAT_VIO_CAUSE_CRASH_I
MCS_VIO_CAUSE_CRASH_I
IDOT_PERMIT_NO
WIDE_LOAD_I
TRAILER1_WIDTH
TRAILER2_WIDTH
TRAILER1_LENGTH
TRAILER2_LENGTH
TOTAL_VEHICLE_LENGTH
AXLE_CNT
VEHICLE_CONFIG
CARGO_BODY_TYPE
LOAD_TYPE
HAZMAT_OUT_OF_SERVICE_I
MCS_OUT_OF_SERVICE_I
HAZMAT_CLASS
PHOTOS_TAKEN_I
STATEMENTS_TAKEN_I
DOORING_I
WORK_ZONE_I
WORK_ZONE_TYPE
WORKERS_PRESENT_I
TOWED_I
FIRE_I
TOWED_BY
TOWED_TO
AREA_00_I
AREA_01_I
AREA_02_I
AREA_03_I
AREA_04_I
AREA_05_I
AREA_06_I
AREA_07_I
AREA_08_I
AREA_09_I
AREA_10_I
AREA_11_I
AREA_12_I
AREA_99_I
'''

df.drop(columns=['CMV_ID',
                 'USDOT_NO',
                 'CCMC_NO',
                 'ILCC_NO',
                 'COMMERCIAL_SRC',
                 'GVWR',
                 'CARRIER_NAME',
                 'CARRIER_STATE',
                 'CARRIER_CITY',
                 'HAZMAT_PLACARDS_I',
                 'HAZMAT_NAME',
                 'UN_NO',
                 'HAZMAT_PRESENT_I',
                 'HAZMAT_REPORT_I',
                 'HAZMAT_REPORT_NO',
                 'MCS_REPORT_I',
                 'MCS_REPORT_NO',
                 'HAZMAT_VIO_CAUSE_CRASH_I',
                 'MCS_VIO_CAUSE_CRASH_I',
                 'IDOT_PERMIT_NO',
                 'WIDE_LOAD_I',
                 'TRAILER1_WIDTH',
                 'TRAILER2_WIDTH',
                 'TRAILER1_LENGTH',
                 'TRAILER2_LENGTH',
                 'TOTAL_VEHICLE_LENGTH',
                 'AXLE_CNT',
                 'VEHICLE_CONFIG',
                 'CARGO_BODY_TYPE',
                 'LOAD_TYPE',
                 'HAZMAT_OUT_OF_SERVICE_I',
                 'MCS_OUT_OF_SERVICE_I',
                 'HAZMAT_CLASS',
                 
                 'PHOTOS_TAKEN_I',
                 'STATEMENTS_TAKEN_I',
                 'DOORING_I',
                 'WORK_ZONE_I',
                 'WORK_ZONE_TYPE',
                 'WORKERS_PRESENT_I',
                 
                 'TOWED_I',
                 'FIRE_I',
                 
                 'TOWED_BY',
                 'TOWED_TO',
                 'AREA_00_I',
                 'AREA_01_I',
                 'AREA_02_I',
                 'AREA_03_I',
                 'AREA_04_I',
                 'AREA_05_I',
                 'AREA_06_I',
                 'AREA_07_I',
                 'AREA_08_I',
                 'AREA_09_I',
                 'AREA_10_I',
                 'AREA_11_I',
                 'AREA_12_I',
                 'AREA_99_I',
                 
                 'BEAT_OF_OCCURRENCE',
                 'VEHICLE_ID',
                 'LANE_CNT',
                 ], inplace=True)

In [213]:
df.shape

(745897, 60)

## Cleaning Leftover Columns

### Split Into categorical and numerical

Now that they have been merged on CRASH_RECORD_ID, I'm going to remove that column, and create a new index

In [214]:
df = df.drop('CRASH_RECORD_ID', axis=1)

In [215]:
df = df.reset_index(drop=True)

In [216]:
index = df.index

In [217]:
print(index)

RangeIndex(start=0, stop=745897, step=1)


In [218]:
# split df into categorical and numerical dataframes
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [219]:
# df_cat = categorical_data.reset_index(drop=True)
# df_num = numerical_data.reset_index(drop=True)

#### Numerical Cleaning

Okay so everything that is left over with the numerical columns should be able to be filled with 0.

In [220]:
# check df_num for nulls
df_num.isnull().sum()

POSTED_SPEED_LIMIT                    0
STREET_NO                             0
NUM_UNITS                             0
INJURIES_TOTAL                     1277
INJURIES_FATAL                     1277
INJURIES_INCAPACITATING            1277
INJURIES_NON_INCAPACITATING        1277
INJURIES_REPORTED_NOT_EVIDENT      1277
INJURIES_NO_INDICATION             1277
INJURIES_UNKNOWN                   1277
CRASH_HOUR                            0
CRASH_DAY_OF_WEEK                     0
CRASH_MONTH                           0
LATITUDE                           4695
LONGITUDE                          4695
CRASH_UNIT_ID                         0
UNIT_NO                               0
NUM_PASSENGERS                   635283
VEHICLE_YEAR                     134411
OCCUPANT_CNT                      16823
dtype: int64

In [221]:
# fill all missing values in df_num with 0
df_num.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


#### categorical cleaning

In [222]:
df_cat.isna().sum()

RD_NO_x                      4506
CRASH_DATE_EST_I           689719
CRASH_DATE_x                    0
TRAFFIC_CONTROL_DEVICE          0
DEVICE_CONDITION                0
WEATHER_CONDITION               0
LIGHTING_CONDITION              0
FIRST_CRASH_TYPE                0
TRAFFICWAY_TYPE                 0
ALIGNMENT                       0
ROADWAY_SURFACE_COND            0
ROAD_DEFECT                     0
REPORT_TYPE                 23883
CRASH_TYPE                      0
INTERSECTION_RELATED_I     571980
NOT_RIGHT_OF_WAY_I         715536
HIT_AND_RUN_I              508950
DAMAGE                          0
DATE_POLICE_NOTIFIED            0
PRIM_CONTRIBUTORY_CAUSE         0
SEC_CONTRIBUTORY_CAUSE          0
STREET_DIRECTION                3
STREET_NAME                     2
MOST_SEVERE_INJURY           1290
LOCATION                     4695
RD_NO_y                      4890
CRASH_DATE_y                    0
UNIT_TYPE                    1005
CMRC_VEH_I                 731969
MAKE          

In [223]:
# print out the value_counts for every column in df_cat
for col in df_cat.columns:
    print(col)
    print(df_cat[col].value_counts())
    print('')

RD_NO_x
JB311558    12
JF346755    12
JD248349    11
JA553317     9
JF492364     9
            ..
JE488934     1
JB170761     1
JA429965     1
JB170619     1
JB332398     1
Name: RD_NO_x, Length: 543184, dtype: int64

CRASH_DATE_EST_I
Y    48764
N     7414
Name: CRASH_DATE_EST_I, dtype: int64

CRASH_DATE_x
12/29/2020 05:00:00 PM    34
01/12/2019 02:00:00 PM    30
11/10/2017 10:30:00 AM    29
11/10/2017 10:00:00 AM    22
02/04/2021 01:30:00 PM    20
                          ..
07/26/2021 09:25:00 AM     1
11/02/2020 08:00:00 AM     1
12/16/2022 07:40:00 PM     1
11/01/2022 07:25:00 AM     1
04/12/2018 05:08:00 PM     1
Name: CRASH_DATE_x, Length: 386527, dtype: int64

TRAFFIC_CONTROL_DEVICE
NO CONTROLS                 423859
TRAFFIC SIGNAL              208803
STOP SIGN/FLASHER            75155
UNKNOWN                      27428
OTHER                         4725
LANE USE MARKING              1230
YIELD                         1109
OTHER REG. SIGN                737
RAILROAD CROSSING GA

In [224]:
df_cat_dropped_cols = df_cat.drop(columns=['RD_NO_x',
                'CRASH_DATE_EST_I', 
                'REPORT_TYPE', 
                'STREET_DIRECTION',
                'STREET_NAME',
                'LOCATION', 
                'RD_NO_y', 
                'CMRC_VEH_I', 
                'LIC_PLATE_STATE', 
                'TRAVEL_DIRECTION'
                 ])

In [225]:
df_cat_dropped_cols.isna().sum()

CRASH_DATE_x                    0
TRAFFIC_CONTROL_DEVICE          0
DEVICE_CONDITION                0
WEATHER_CONDITION               0
LIGHTING_CONDITION              0
FIRST_CRASH_TYPE                0
TRAFFICWAY_TYPE                 0
ALIGNMENT                       0
ROADWAY_SURFACE_COND            0
ROAD_DEFECT                     0
CRASH_TYPE                      0
INTERSECTION_RELATED_I     571980
NOT_RIGHT_OF_WAY_I         715536
HIT_AND_RUN_I              508950
DAMAGE                          0
DATE_POLICE_NOTIFIED            0
PRIM_CONTRIBUTORY_CAUSE         0
SEC_CONTRIBUTORY_CAUSE          0
MOST_SEVERE_INJURY           1290
CRASH_DATE_y                    0
UNIT_TYPE                    1005
MAKE                        16824
MODEL                       16900
VEHICLE_DEFECT              16823
VEHICLE_TYPE                16823
VEHICLE_USE                 16823
MANEUVER                    16823
EXCEED_SPEED_LIMIT_I       744715
FIRST_CONTACT_POINT         18389
dtype: int64

## Getting target variable

In [226]:
'''
remove
PRIM_CONTRIBUTORY_CAUSE
and
SEC_CONTRIBUTORY_CAUSE
from df_cat_dropped_cols
assign it to it's own dataframe called df_targets,
and then export df_targets to csv
'''

# Specifically I'm going to make prim cause the target and remove secondary, maybe I'll come back to it if I have time.
df_targets = df_cat_dropped_cols[['PRIM_CONTRIBUTORY_CAUSE']]
df_cat_dropped_cols.drop(columns=['PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE'], inplace=True)
df_targets.to_csv('data/df_targets.csv')

In [227]:
df_targets.isna().sum()

PRIM_CONTRIBUTORY_CAUSE    0
dtype: int64

In [228]:
df_targets.dtypes

PRIM_CONTRIBUTORY_CAUSE    object
dtype: object

## splitting into ordinal and nominal

In [229]:
'''
assign to df_cat_ordinal 
INTERSECTION_RELATED_I
NOT_RIGHT_OF_WAY_I
HIT_AND_RUN_I
and EXCEED_SPEED_LIMIT_I
'''
df_cat_ordinal = df_cat_dropped_cols[['INTERSECTION_RELATED_I',
                                        'NOT_RIGHT_OF_WAY_I',
                                        'HIT_AND_RUN_I',
                                        'EXCEED_SPEED_LIMIT_I'
                                        ]]

# assign the rest to nominal
df_cat_nominal = df_cat_dropped_cols.drop(columns=['INTERSECTION_RELATED_I',
                                        'NOT_RIGHT_OF_WAY_I',
                                        'HIT_AND_RUN_I',
                                        'EXCEED_SPEED_LIMIT_I'
                                        ])

### ordinal

In [230]:
# list nulls in ordinal
df_cat_ordinal.isna().sum()

INTERSECTION_RELATED_I    571980
NOT_RIGHT_OF_WAY_I        715536
HIT_AND_RUN_I             508950
EXCEED_SPEED_LIMIT_I      744715
dtype: int64

In [231]:
for col in df_cat_ordinal.columns:
    print(col)
    print(df_cat_ordinal[col].value_counts())
    print('')

INTERSECTION_RELATED_I
Y    165705
N      8212
Name: INTERSECTION_RELATED_I, dtype: int64

NOT_RIGHT_OF_WAY_I
Y    27469
N     2892
Name: NOT_RIGHT_OF_WAY_I, dtype: int64

HIT_AND_RUN_I
Y    226826
N     10121
Name: HIT_AND_RUN_I, dtype: int64

EXCEED_SPEED_LIMIT_I
Y    883
N    299
Name: EXCEED_SPEED_LIMIT_I, dtype: int64



In [232]:
# fill all nulls in ordinal with 'N'
df_cat_ordinal_filled = df_cat_ordinal.fillna('N')

In [233]:
df_cat_ordinal_filled.isna().sum()

INTERSECTION_RELATED_I    0
NOT_RIGHT_OF_WAY_I        0
HIT_AND_RUN_I             0
EXCEED_SPEED_LIMIT_I      0
dtype: int64

In [234]:
for col in df_cat_ordinal_filled.columns:
    print(col)
    print(df_cat_ordinal_filled[col].value_counts())
    print('')

INTERSECTION_RELATED_I
N    580192
Y    165705
Name: INTERSECTION_RELATED_I, dtype: int64

NOT_RIGHT_OF_WAY_I
N    718428
Y     27469
Name: NOT_RIGHT_OF_WAY_I, dtype: int64

HIT_AND_RUN_I
N    519071
Y    226826
Name: HIT_AND_RUN_I, dtype: int64

EXCEED_SPEED_LIMIT_I
N    745014
Y       883
Name: EXCEED_SPEED_LIMIT_I, dtype: int64



### nominal

In [235]:
# list nulls in nominal
df_cat_nominal.isna().sum()

CRASH_DATE_x                  0
TRAFFIC_CONTROL_DEVICE        0
DEVICE_CONDITION              0
WEATHER_CONDITION             0
LIGHTING_CONDITION            0
FIRST_CRASH_TYPE              0
TRAFFICWAY_TYPE               0
ALIGNMENT                     0
ROADWAY_SURFACE_COND          0
ROAD_DEFECT                   0
CRASH_TYPE                    0
DAMAGE                        0
DATE_POLICE_NOTIFIED          0
MOST_SEVERE_INJURY         1290
CRASH_DATE_y                  0
UNIT_TYPE                  1005
MAKE                      16824
MODEL                     16900
VEHICLE_DEFECT            16823
VEHICLE_TYPE              16823
VEHICLE_USE               16823
MANEUVER                  16823
FIRST_CONTACT_POINT       18389
dtype: int64

In [236]:
# assign the columnsn in df_cat_nominal with nulls to df_cat_nominal_null
df_cat_nominal_null = df_cat_nominal[['FIRST_CONTACT_POINT',
                                      'MANEUVER',
                                      'VEHICLE_USE',
                                      'VEHICLE_TYPE',
                                      'VEHICLE_DEFECT',
                                      'MODEL',
                                      'MAKE',
                                      'UNIT_TYPE',
                                      'MOST_SEVERE_INJURY']]

# assign the rest to df_cat_nominal_nonull
df_cat_nominal_nonull = df_cat_nominal.drop(columns=['FIRST_CONTACT_POINT',
                                                    'MANEUVER',
                                                    'VEHICLE_USE',
                                                    'VEHICLE_TYPE',
                                                    'VEHICLE_DEFECT',
                                                    'MODEL',
                                                    'MAKE',
                                                    'UNIT_TYPE',
                                                    'MOST_SEVERE_INJURY'])


In [237]:
# check that the split worked
df_cat_nominal_null.isna().sum()

FIRST_CONTACT_POINT    18389
MANEUVER               16823
VEHICLE_USE            16823
VEHICLE_TYPE           16823
VEHICLE_DEFECT         16823
MODEL                  16900
MAKE                   16824
UNIT_TYPE               1005
MOST_SEVERE_INJURY      1290
dtype: int64

In [238]:
df_cat_nominal_nonull.isna().sum()

CRASH_DATE_x              0
TRAFFIC_CONTROL_DEVICE    0
DEVICE_CONDITION          0
WEATHER_CONDITION         0
LIGHTING_CONDITION        0
FIRST_CRASH_TYPE          0
TRAFFICWAY_TYPE           0
ALIGNMENT                 0
ROADWAY_SURFACE_COND      0
ROAD_DEFECT               0
CRASH_TYPE                0
DAMAGE                    0
DATE_POLICE_NOTIFIED      0
CRASH_DATE_y              0
dtype: int64

In [239]:
# how many columns are in df_cat_nominal_null?
print(len(df_cat_nominal_null.columns))

# how many columns are in df_cat_nominal_nonull?
print(len(df_cat_nominal_nonull.columns))

9
14


In [240]:
print(9 + 15)

24


In [241]:
# how many columsn were in df_cat_nominal?
print(len(df_cat_nominal.columns))

23


In [242]:
for col in df_cat_nominal_null.columns:
    print(col)
    print(df_cat_nominal_null[col].value_counts())
    print('')

FIRST_CONTACT_POINT
FRONT                 141946
REAR                   96173
UNKNOWN                69102
SIDE-LEFT              49558
SIDE-RIGHT             47191
FRONT-LEFT             40952
FRONT-LEFT-CORNER      40655
FRONT-RIGHT-CORNER     40021
FRONT-RIGHT            38538
REAR-LEFT              34014
OTHER                  19855
REAR-RIGHT             18378
REAR-LEFT-CORNER       18124
REAR-RIGHT-CORNER      13243
TOTAL (ALL AREAS)      13232
SIDE-LEFT-REAR         10579
SIDE-RIGHT-REAR         7892
SIDE-LEFT-FRONT         6788
ROOF                    6040
NONE                    5923
SIDE-RIGHT-FRONT        5818
UNDER CARRIAGE          2687
TOP                      799
Name: FIRST_CONTACT_POINT, dtype: int64

MANEUVER
STRAIGHT AHEAD                        333272
PARKED                                100264
UNKNOWN/NA                             56437
SLOW/STOP IN TRAFFIC                   55501
TURNING LEFT                           43065
BACKING                               

In [243]:
'''Fill the following columns with 'UNKNOWN' in df_cat_nominal_null:
MOST_SEVERE_INJURY
UNIT_TYPE
MAKE
MODEL
VEHICLE_DEFECT
FIRST_CONTACT_POINT
'''

df_cat_nominal_null['MOST_SEVERE_INJURY'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['UNIT_TYPE'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['MAKE'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['MODEL'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['VEHICLE_DEFECT'].fillna('UNKNOWN', inplace=True)
df_cat_nominal_null['FIRST_CONTACT_POINT'].fillna('UNKNOWN', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [244]:
'''Fill the following columns with 'UNKNOWN/NA' in df_cat_nominal_null:
VEHICLE_TYPE
VEHICLE_USE
MANUEVER
'''

df_cat_nominal_null['VEHICLE_TYPE'].fillna('UNKNOWN/NA', inplace=True)
df_cat_nominal_null['VEHICLE_USE'].fillna('UNKNOWN/NA', inplace=True)
df_cat_nominal_null['MANEUVER'].fillna('UNKNOWN/NA', inplace=True)

## Combining nominal and ordinal again

In [245]:
# combine df_cat_nominal_null and df_cat_nominal_nonull into df_cat_nominal_filled
df_cat_nominal_filled = pd.concat([df_cat_nominal_null, df_cat_nominal_nonull], axis=1)

# combine df_cat_nominal_filled and df_cat_ordinal_filled into df_cat_filled
df_cat_filled = pd.concat([df_cat_nominal_filled, df_cat_ordinal_filled], axis=1)

## Combining everything again

In [246]:
# Combine df_cat_filled and df_num into df_clean

df_clean = pd.concat([df_num, df_cat_filled], axis=1)

In [247]:
df_clean.head().T

Unnamed: 0,0,1,2,3,4
POSTED_SPEED_LIMIT,30,30,30,30,30
STREET_NO,2165,5810,4800,11764,4241
NUM_UNITS,4,2,2,4,2
INJURIES_TOTAL,0,0,0,1,0
INJURIES_FATAL,0,0,0,0,0
INJURIES_INCAPACITATING,0,0,0,0,0
INJURIES_NON_INCAPACITATING,0,0,0,0,0
INJURIES_REPORTED_NOT_EVIDENT,0,0,0,1,0
INJURIES_NO_INDICATION,5,2,2,6,2
INJURIES_UNKNOWN,0,0,0,0,0


# Exporting the cleaned data

In [248]:
# export df_clean to csv
df_clean.to_csv('data/df_clean.csv')

In [249]:
df_clean.shape

(745897, 47)

In [250]:
df_clean.isna().sum()

POSTED_SPEED_LIMIT               0
STREET_NO                        0
NUM_UNITS                        0
INJURIES_TOTAL                   0
INJURIES_FATAL                   0
INJURIES_INCAPACITATING          0
INJURIES_NON_INCAPACITATING      0
INJURIES_REPORTED_NOT_EVIDENT    0
INJURIES_NO_INDICATION           0
INJURIES_UNKNOWN                 0
CRASH_HOUR                       0
CRASH_DAY_OF_WEEK                0
CRASH_MONTH                      0
LATITUDE                         0
LONGITUDE                        0
CRASH_UNIT_ID                    0
UNIT_NO                          0
NUM_PASSENGERS                   0
VEHICLE_YEAR                     0
OCCUPANT_CNT                     0
FIRST_CONTACT_POINT              0
MANEUVER                         0
VEHICLE_USE                      0
VEHICLE_TYPE                     0
VEHICLE_DEFECT                   0
MODEL                            0
MAKE                             0
UNIT_TYPE                        0
MOST_SEVERE_INJURY  