In [1]:
import pandas as pd
import numpy as np 
import csv

from pltfunctions import hist_kde_plots
from math import sqrt

import matplotlib.pyplot as plt
import seaborn as sns

from edafunctions import df_remove_columns_threshold as rmcol
from edafunctions import df_merge_dataframes_left as merle

## Basic Data Import and Cleaning

In [2]:
# Read the raw vehicle-level crash export. low_memory=False makes pandas parse
# the file in a single pass so column dtypes are inferred consistently.
vehicles_csv = r"data/TrafficCrashes-Vehicle.csv"
dfvehicles = pd.read_csv(vehicles_csv, low_memory=False)

In [12]:
# Prune columns with the project helper df_remove_columns_threshold (imported as rmcol).
# NOTE(review): the helper is called with its defaults, which are not visible
# here — presumably it drops sparsely populated columns; confirm in edafunctions.
dfvehicles = rmcol(dfvehicles)

In [13]:
# Show the remaining vehicle columns with their non-null counts and dtypes.
dfvehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925738 entries, 0 to 925737
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CRASH_UNIT_ID        925738 non-null  int64  
 1   CRASH_RECORD_ID      925738 non-null  object 
 2   RD_NO                918507 non-null  object 
 3   CRASH_DATE           925738 non-null  object 
 4   UNIT_NO              925738 non-null  int64  
 5   UNIT_TYPE            924349 non-null  object 
 6   VEHICLE_ID           904074 non-null  float64
 7   MAKE                 904069 non-null  object 
 8   MODEL                903927 non-null  object 
 9   VEHICLE_DEFECT       904074 non-null  object 
 10  VEHICLE_TYPE         904074 non-null  object 
 11  VEHICLE_USE          904074 non-null  object 
 12  TRAVEL_DIRECTION     904074 non-null  object 
 13  MANEUVER             904074 non-null  object 
 14  OCCUPANT_CNT         904074 non-null  float64
 15  FIRST_CONTACT_POI

In [14]:
# Read the raw crash-level export. low_memory=False makes pandas parse the
# file in a single pass so column dtypes are inferred consistently.
crashes_csv = r"data/TrafficCrashes-Crashes.csv"
dfcrash = pd.read_csv(crashes_csv, low_memory=False)

In [15]:
# Prune columns with the project helper df_remove_columns_threshold (imported as rmcol).
# NOTE(review): the helper is called with its defaults, which are not visible
# here — presumably it drops sparsely populated columns; confirm in edafunctions.
dfcrash = rmcol(dfcrash)

In [16]:
# Show the remaining crash columns with their non-null counts and dtypes.
dfcrash.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453873 entries, 0 to 453872
Data columns (total 38 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                453873 non-null  object 
 1   RD_NO                          450376 non-null  object 
 2   CRASH_DATE                     453873 non-null  object 
 3   POSTED_SPEED_LIMIT             453873 non-null  int64  
 4   TRAFFIC_CONTROL_DEVICE         453873 non-null  object 
 5   DEVICE_CONDITION               453873 non-null  object 
 6   WEATHER_CONDITION              453873 non-null  object 
 7   LIGHTING_CONDITION             453873 non-null  object 
 8   FIRST_CRASH_TYPE               453873 non-null  object 
 9   TRAFFICWAY_TYPE                453873 non-null  object 
 10  ALIGNMENT                      453873 non-null  object 
 11  ROADWAY_SURFACE_COND           453873 non-null  object 
 12  ROAD_DEFECT                   

In [17]:
# Read the raw person-level export. low_memory=False makes pandas parse the
# file in a single pass so column dtypes are inferred consistently.
people_csv = r"data/TrafficCrashes-People.csv"
dfpeople = pd.read_csv(people_csv, low_memory=False)

In [18]:
# Prune columns with the project helper df_remove_columns_threshold (imported as rmcol).
# NOTE(review): the helper is called with its defaults, which are not visible
# here — presumably it drops sparsely populated columns; confirm in edafunctions.
dfpeople = rmcol(dfpeople)

In [19]:
# Show the remaining people columns with their non-null counts and dtypes.
dfpeople.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006093 entries, 0 to 1006092
Data columns (total 11 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   PERSON_ID              1006093 non-null  object 
 1   PERSON_TYPE            1006093 non-null  object 
 2   CRASH_RECORD_ID        1006093 non-null  object 
 3   RD_NO                  998607 non-null   object 
 4   VEHICLE_ID             985919 non-null   float64
 5   CRASH_DATE             1006093 non-null  object 
 6   SEX                    991169 non-null   object 
 7   SAFETY_EQUIPMENT       1003090 non-null  object 
 8   AIRBAG_DEPLOYED        986732 non-null   object 
 9   EJECTION               993588 non-null   object 
 10  INJURY_CLASSIFICATION  1005547 non-null  object 
dtypes: float64(1), object(10)
memory usage: 84.4+ MB


## Create a merged data table on CRASH_RECORD_ID

In [20]:
# Join key: the crash identifier column present in the vehicle, crash and people tables.
merge = 'CRASH_RECORD_ID'

In [21]:
# Attach crash-level attributes to every vehicle record (left join on the crash id).
# validate='many_to_one' makes pandas raise MergeError if a CRASH_RECORD_ID
# occurs more than once in dfcrash, so a duplicated crash row can never
# silently multiply the vehicle rows.
dfmerge = pd.merge(dfvehicles, dfcrash, how='left', on=merge, validate='many_to_one')

In [22]:
# Inspect the vehicle+crash table: overlapping column names now carry _x/_y suffixes.
dfmerge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925738 entries, 0 to 925737
Data columns (total 53 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_UNIT_ID                  925738 non-null  int64  
 1   CRASH_RECORD_ID                925738 non-null  object 
 2   RD_NO_x                        918507 non-null  object 
 3   CRASH_DATE_x                   925738 non-null  object 
 4   UNIT_NO                        925738 non-null  int64  
 5   UNIT_TYPE                      924349 non-null  object 
 6   VEHICLE_ID                     904074 non-null  float64
 7   MAKE                           904069 non-null  object 
 8   MODEL                          903927 non-null  object 
 9   VEHICLE_DEFECT                 904074 non-null  object 
 10  VEHICLE_TYPE                   904074 non-null  object 
 11  VEHICLE_USE                    904074 non-null  object 
 12  TRAVEL_DIRECTION              

In [23]:
# Attach person-level records to the vehicle+crash table (left join on the crash id).
# NOTE(review): the key is the crash id only, so every vehicle row is paired
# with every person recorded for the same crash. Consider whether VEHICLE_ID
# should be part of the key — confirm before relying on per-row counts.
dfmerged = dfmerge.merge(dfpeople, how='left', on=merge)

In [24]:
# Summary statistics for the numeric columns of the fully merged table.
dfmerged.describe()

Unnamed: 0,CRASH_UNIT_ID,UNIT_NO,VEHICLE_ID_x,OCCUPANT_CNT,POSTED_SPEED_LIMIT,STREET_NO,BEAT_OF_OCCURRENCE,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,VEHICLE_ID_y
count,2115954.0,2115954.0,2065023.0,2065023.0,2115954.0,2115954.0,2115930.0,2115954.0,2114418.0,2114418.0,...,2114418.0,2114418.0,2114418.0,2114418.0,2115954.0,2115954.0,2115954.0,2104365.0,2104365.0,2069724.0
mean,497805.8,3.374134,474122.0,1.388109,28.83798,3578.908,1234.728,2.187497,0.2935285,0.001633073,...,0.1623671,0.09884469,2.675287,0.0,13.4663,4.145678,6.770168,41.85315,-87.66732,474186.1
std,286045.8,2597.244,269955.3,1.404195,5.998828,2883.237,705.1333,0.6244938,0.8312822,0.04724396,...,0.6184463,0.4581869,2.067176,0.0,5.381389,1.98384,3.372745,0.4754937,0.9818589,269896.7
min,2.0,0.0,2.0,0.0,0.0,0.0,111.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-87.93401,2.0
25%,249678.0,1.0,241934.0,1.0,30.0,1159.0,712.0,2.0,0.0,0.0,...,0.0,0.0,2.0,0.0,10.0,2.0,4.0,41.78577,-87.7214,242058.0
50%,498479.0,2.0,475513.0,1.0,30.0,3101.0,1211.0,2.0,0.0,0.0,...,0.0,0.0,2.0,0.0,14.0,4.0,7.0,41.87612,-87.67328,475520.0
75%,746641.0,2.0,708355.5,2.0,30.0,5500.0,1821.0,2.0,0.0,0.0,...,0.0,0.0,3.0,0.0,17.0,6.0,10.0,41.92352,-87.63263,708376.0
max,990691.0,3778035.0,938835.0,99.0,99.0,451100.0,6100.0,18.0,21.0,4.0,...,21.0,15.0,61.0,0.0,23.0,7.0,12.0,42.02278,0.0,938827.0


In [25]:
# Because of the amount of data, every row that contains at least one NaN
# is removed rather than imputed.
dfmerged = dfmerged.dropna(axis=0, how='any')


In [26]:
# Check the row count and columns that remain after dropping rows with NaN.
dfmerged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897113 entries, 0 to 2115933
Data columns (total 63 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH_UNIT_ID                  int64  
 1   CRASH_RECORD_ID                object 
 2   RD_NO_x                        object 
 3   CRASH_DATE_x                   object 
 4   UNIT_NO                        int64  
 5   UNIT_TYPE                      object 
 6   VEHICLE_ID_x                   float64
 7   MAKE                           object 
 8   MODEL                          object 
 9   VEHICLE_DEFECT                 object 
 10  VEHICLE_TYPE                   object 
 11  VEHICLE_USE                    object 
 12  TRAVEL_DIRECTION               object 
 13  MANEUVER                       object 
 14  OCCUPANT_CNT                   float64
 15  FIRST_CONTACT_POINT            object 
 16  RD_NO_y                        object 
 17  CRASH_DATE_y                   object 
 18  PO

In [27]:
# Columns that can be dropped right away: record identifiers, report numbers,
# street-address details and a few vehicle bookkeeping fields.
# Each label appears exactly once.
drop_columns = [
    'CRASH_RECORD_ID', 'CRASH_UNIT_ID', 'UNIT_NO', 'VEHICLE_ID_x',
    'RD_NO_x', 'RD_NO_y', 'DATE_POLICE_NOTIFIED',
    'TRAVEL_DIRECTION', 'FIRST_CONTACT_POINT',
    'STREET_NO', 'STREET_DIRECTION', 'STREET_NAME', 'LOCATION',
]
dfmerged = dfmerged.drop(columns=drop_columns)

In [28]:
# Remove the report number and vehicle id that came in from the people table
# (VEHICLE_ID_y is the people-side copy created by the merge suffixing).
dfmerged = dfmerged.drop(['RD_NO', 'VEHICLE_ID_y'], axis=1)

In [29]:
# Confirm which columns remain after the two column drops.
dfmerged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897113 entries, 0 to 2115933
Data columns (total 48 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH_DATE_x                   object 
 1   UNIT_TYPE                      object 
 2   MAKE                           object 
 3   MODEL                          object 
 4   VEHICLE_DEFECT                 object 
 5   VEHICLE_TYPE                   object 
 6   VEHICLE_USE                    object 
 7   MANEUVER                       object 
 8   OCCUPANT_CNT                   float64
 9   CRASH_DATE_y                   object 
 10  POSTED_SPEED_LIMIT             int64  
 11  TRAFFIC_CONTROL_DEVICE         object 
 12  DEVICE_CONDITION               object 
 13  WEATHER_CONDITION              object 
 14  LIGHTING_CONDITION             object 
 15  FIRST_CRASH_TYPE               object 
 16  TRAFFICWAY_TYPE                object 
 17  ALIGNMENT                      object 
 18  RO

In [30]:
# Inspect the CRASH_TYPE categories: this column is suspected to be collinear
# with the MOST_SEVERE_INJURY column.
dfmerged['CRASH_TYPE'].unique()

array(['NO INJURY / DRIVE AWAY', 'INJURY AND / OR TOW DUE TO CRASH'],
      dtype=object)

In [31]:
# MOST_SEVERE_INJURY is the target classification column.
# INJURIES_FATAL holds the total number of fatalities in a given accident.
# TODO: some of the rows are duplicates — decide how to remove them.
dfmerged['MOST_SEVERE_INJURY'].unique()

array(['NO INDICATION OF INJURY', 'NONINCAPACITATING INJURY',
       'REPORTED, NOT EVIDENT', 'INCAPACITATING INJURY', 'FATAL'],
      dtype=object)

In [32]:
# Distinct values of the occupant count column.
dfmerged['OCCUPANT_CNT'].unique()

array([ 1.,  0.,  2.,  3.,  5.,  4., 37.,  6.,  8.,  9., 13.,  7., 35.,
       26., 20., 16., 15., 14., 12., 44., 18., 22., 36., 11., 10., 19.,
       30., 33., 24., 43., 60., 34., 17., 39., 25., 27., 21., 29., 41.,
       28., 47., 38., 99.])

In [33]:
# Distinct person roles that remain in the merged table.
dfmerged['PERSON_TYPE'].unique()

array(['DRIVER', 'PASSENGER', 'NON-CONTACT VEHICLE'], dtype=object)

In [34]:
# Distinct values of the per-accident fatality count.
dfmerged['INJURIES_FATAL'].unique()

array([0., 1., 2., 3.])

In [35]:
# INJURY_CLASSIFICATION gives the injury on a per-individual basis.
dfmerged['INJURY_CLASSIFICATION'].unique()

array(['NO INDICATION OF INJURY', 'NONINCAPACITATING INJURY',
       'REPORTED, NOT EVIDENT', 'INCAPACITATING INJURY', 'FATAL'],
      dtype=object)

In [36]:
# Distinct MANEUVER categories (note the 'UNKNOWN/NA' and 'OTHER' placeholders in the output).
dfmerged['MANEUVER'].unique()

array(['TURNING LEFT', 'STRAIGHT AHEAD', 'SLOW/STOP IN TRAFFIC',
       'UNKNOWN/NA', 'CHANGING LANES', 'PARKED', 'PASSING/OVERTAKING',
       'MERGING', 'BACKING', 'STARTING IN TRAFFIC', 'OTHER',
       'AVOIDING VEHICLES/OBJECTS', 'SLOW/STOP - LOAD/UNLOAD',
       'SKIDDING/CONTROL LOSS', 'NEGOTIATING A CURVE', 'TURNING RIGHT',
       'ENTER FROM DRIVE/ALLEY', 'U-TURN', 'PARKED IN TRAFFIC LANE',
       'LEAVING TRAFFIC LANE TO PARK', 'SLOW/STOP - LEFT TURN',
       'ENTERING TRAFFIC LANE FROM PARKING', 'DRIVERLESS',
       'SLOW/STOP - RIGHT TURN', 'DIVERGING', 'TURNING ON RED',
       'DRIVING WRONG WAY', 'DISABLED'], dtype=object)

In [37]:
# Distinct categories of the primary contributory cause.
dfmerged['PRIM_CONTRIBUTORY_CAUSE'].unique()

array(['UNABLE TO DETERMINE', 'FOLLOWING TOO CLOSELY',
       'FAILING TO YIELD RIGHT-OF-WAY', 'IMPROPER LANE USAGE',
       'IMPROPER OVERTAKING/PASSING', 'NOT APPLICABLE',
       'IMPROPER BACKING', 'FAILING TO REDUCE SPEED TO AVOID CRASH',
       'DISTRACTION - FROM INSIDE VEHICLE', 'WEATHER',
       'DISREGARDING STOP SIGN', 'PHYSICAL CONDITION OF DRIVER',
       'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)',
       'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE',
       'IMPROPER TURNING/NO SIGNAL',
       'EXCEEDING SAFE SPEED FOR CONDITIONS',
       'EQUIPMENT - VEHICLE CONDITION', 'DRIVING ON WRONG SIDE/WRONG WAY',
       'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER',
       'EXCEEDING AUTHORIZED SPEED LIMIT', 'DISREGARDING TRAFFIC SIGNALS',
       'DISREGARDING ROAD MARKINGS',
       'ROAD ENGINEERING/SURFACE/MARKING DEFECTS',
       'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST', 'TEXTING',
       'UNDER THE INFLUENCE OF ALCOHOL/DRUGS 

In [38]:
# Distinct categories of the secondary contributory cause.
dfmerged['SEC_CONTRIBUTORY_CAUSE'].unique()

array(['UNABLE TO DETERMINE', 'NOT APPLICABLE',
       'FAILING TO REDUCE SPEED TO AVOID CRASH',
       'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE', 'IMPROPER LANE USAGE',
       'FOLLOWING TOO CLOSELY',
       'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)',
       'IMPROPER OVERTAKING/PASSING', 'FAILING TO YIELD RIGHT-OF-WAY',
       'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER',
       'DRIVING ON WRONG SIDE/WRONG WAY', 'WEATHER',
       'EXCEEDING SAFE SPEED FOR CONDITIONS',
       'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT',
       'IMPROPER TURNING/NO SIGNAL', 'EQUIPMENT - VEHICLE CONDITION',
       'DISREGARDING OTHER TRAFFIC SIGNS',
       'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)',
       'ROAD ENGINEERING/SURFACE/MARKING DEFECTS',
       'DISREGARDING TRAFFIC SIGNALS', 'EXCEEDING AUTHORIZED SPEED LIMIT',
       'CELL PHONE USE OTHER THAN TEXTING', 'IMPROPER BACKING',
       'PHYSICAL CONDITION OF DRIVER', 'TEXTING',
       'DISTRAC

In [39]:
# A few more cleaning steps remain:
# 1) Reduce the amount of data: keep only crash data from the year 2018 onwards
# 2) Remove rows that have 'Unknown' or 'Other' in the VEHICLE_DEFECT column
# 3) Remove Unknown/NA from VEHICLE_TYPE, VEHICLE_USE and MANEUVER
# 4) Remove any rows with POSTED_SPEED_LIMIT less than 15 mph
# 5) Remove Unknown from TRAFFIC_CONTROL_DEVICE
# 6) Remove Unknown from DEVICE_CONDITION
# 7) Remove longitude/latitude coordinates outside of the Chicago area

In [40]:
# Replace the raw crash timestamp string in CRASH_DATE_x with its calendar year.
# The dtype guard keeps this cell idempotent: once the column holds integer
# years, passing it to pd.to_datetime again would read the values as
# nanosecond offsets from the epoch and turn every year into 1970.
if not pd.api.types.is_integer_dtype(dfmerged['CRASH_DATE_x']):
    dfmerged['CRASH_DATE_x'] = pd.to_datetime(dfmerged['CRASH_DATE_x']).dt.year

In [41]:
# Preview the first rows; CRASH_DATE_x should now hold the crash year.
dfmerged.head()

Unnamed: 0,CRASH_DATE_x,UNIT_TYPE,MAKE,MODEL,VEHICLE_DEFECT,VEHICLE_TYPE,VEHICLE_USE,MANEUVER,OCCUPANT_CNT,CRASH_DATE_y,...,LATITUDE,LONGITUDE,PERSON_ID,PERSON_TYPE,CRASH_DATE,SEX,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,EJECTION,INJURY_CLASSIFICATION
0,2015,DRIVER,FORD,Focus,NONE,PASSENGER,PERSONAL,TURNING LEFT,1.0,08/04/2015 12:40:00 PM,...,41.903416,-87.656037,O10,DRIVER,08/04/2015 12:40:00 PM,M,USAGE UNKNOWN,NOT APPLICABLE,NONE,NO INDICATION OF INJURY
1,2015,DRIVER,FORD,Focus,NONE,PASSENGER,PERSONAL,TURNING LEFT,1.0,08/04/2015 12:40:00 PM,...,41.903416,-87.656037,O11,DRIVER,08/04/2015 12:40:00 PM,M,SAFETY BELT USED,DID NOT DEPLOY,NONE,NO INDICATION OF INJURY
2,2015,DRIVER,NISSAN,Pathfinder,NONE,SPORT UTILITY VEHICLE (SUV),PERSONAL,STRAIGHT AHEAD,1.0,07/31/2015 05:50:00 PM,...,41.848588,-87.618689,O100,DRIVER,07/31/2015 05:50:00 PM,M,SAFETY BELT USED,DID NOT DEPLOY,NONE,NO INDICATION OF INJURY
3,2015,DRIVER,NISSAN,Pathfinder,NONE,SPORT UTILITY VEHICLE (SUV),PERSONAL,STRAIGHT AHEAD,1.0,07/31/2015 05:50:00 PM,...,41.848588,-87.618689,O101,DRIVER,07/31/2015 05:50:00 PM,M,SAFETY BELT USED,DID NOT DEPLOY,NONE,NO INDICATION OF INJURY
4,2015,DRIVER,FORD,F150,UNKNOWN,VAN/MINI-VAN,UNKNOWN/NA,STRAIGHT AHEAD,1.0,09/02/2015 11:45:00 AM,...,41.904034,-87.629923,O1000,DRIVER,09/02/2015 11:45:00 AM,M,USAGE UNKNOWN,DID NOT DEPLOY,NONE,NO INDICATION OF INJURY


In [42]:
# Keep only crashes whose year is 2015 or later.
# NOTE(review): the cleaning plan above says "2018 onwards", but the cutoff
# applied here is 2015 — confirm which one is intended.
df_recent = dfmerged.loc[dfmerged['CRASH_DATE_x'] >= 2015]

In [43]:
# Check how many rows survive the year filter.
df_recent.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1897085 entries, 0 to 2115933
Data columns (total 48 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH_DATE_x                   int64  
 1   UNIT_TYPE                      object 
 2   MAKE                           object 
 3   MODEL                          object 
 4   VEHICLE_DEFECT                 object 
 5   VEHICLE_TYPE                   object 
 6   VEHICLE_USE                    object 
 7   MANEUVER                       object 
 8   OCCUPANT_CNT                   float64
 9   CRASH_DATE_y                   object 
 10  POSTED_SPEED_LIMIT             int64  
 11  TRAFFIC_CONTROL_DEVICE         object 
 12  DEVICE_CONDITION               object 
 13  WEATHER_CONDITION              object 
 14  LIGHTING_CONDITION             object 
 15  FIRST_CRASH_TYPE               object 
 16  TRAFFICWAY_TYPE                object 
 17  ALIGNMENT                      object 
 18  RO

In [44]:
# Summary statistics of the numeric columns after the year filter
# (useful for spotting out-of-range values such as 0.0 coordinates).
df_recent.describe()

Unnamed: 0,CRASH_DATE_x,OCCUPANT_CNT,POSTED_SPEED_LIMIT,BEAT_OF_OCCURRENCE,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE
count,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0,1897085.0
mean,2018.135,1.368861,28.82813,1235.037,2.170257,0.2547023,0.0008270584,0.02334424,0.1379696,0.09256148,2.669634,0.0,13.45307,4.143755,6.731673,41.85544,-87.67227
std,1.293782,1.296735,6.01805,705.8126,0.5806255,0.783813,0.03290224,0.1970752,0.585617,0.4391018,1.93412,0.0,5.346856,1.981053,3.382254,0.3605597,0.7364289
min,2015.0,0.0,0.0,111.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-87.93401
25%,2017.0,1.0,30.0,713.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,10.0,2.0,4.0,41.78567,-87.72159
50%,2018.0,1.0,30.0,1211.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,14.0,4.0,7.0,41.87595,-87.67357
75%,2019.0,1.0,30.0,1822.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,17.0,6.0,10.0,41.9237,-87.63263
max,2020.0,99.0,99.0,2535.0,18.0,21.0,3.0,6.0,21.0,15.0,61.0,0.0,23.0,7.0,12.0,42.02278,0.0


In [45]:
# Discard rows whose categorical fields carry a placeholder value:
# VEHICLE_DEFECT of 'UNKNOWN'/'OTHER', VEHICLE_TYPE of 'UNKNOWN/NA', and
# 'UNKNOWN' in TRAFFIC_CONTROL_DEVICE or DEVICE_CONDITION.
# NOTE(review): the cleaning plan also lists VEHICLE_USE, MANEUVER and a
# POSTED_SPEED_LIMIT floor of 15 mph; none of those are filtered here —
# confirm whether that is intentional.
placeholder_rows = (
    df_recent['VEHICLE_DEFECT'].isin(['UNKNOWN', 'OTHER'])
    | df_recent['VEHICLE_TYPE'].eq('UNKNOWN/NA')
    | df_recent['TRAFFIC_CONTROL_DEVICE'].eq('UNKNOWN')
    | df_recent['DEVICE_CONDITION'].eq('UNKNOWN')
)
df1 = df_recent[~placeholder_rows]

In [46]:
# Verify that 'UNKNOWN' and 'OTHER' no longer appear among the vehicle defects.
df1['VEHICLE_DEFECT'].unique()

array(['NONE', 'BRAKES', 'TIRES', 'ENGINE/MOTOR', 'FUEL SYSTEM', 'WHEELS',
       'STEERING', 'LIGHTS', 'WINDOWS', 'RESTRAINT SYSTEM', 'CARGO',
       'SUSPENSION', 'SIGNALS', 'EXHAUST', 'TRAILER COUPLING'],
      dtype=object)

In [47]:
# Drop rows whose longitude or latitude is exactly 0; describe() above shows
# 0.0 as an extreme value in both columns, far away from the rest of the data.
# NOTE(review): the cleaning plan asks for all coordinates outside the Chicago
# area to be removed; only exact zeros are removed here — confirm.
has_coordinates = df1['LONGITUDE'].ne(0) & df1['LATITUDE'].ne(0)
df1 = df1[has_coordinates]

In [48]:
# Row count and dtypes of the cleaned table that the samples are drawn from.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1134909 entries, 0 to 2115933
Data columns (total 48 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   CRASH_DATE_x                   1134909 non-null  int64  
 1   UNIT_TYPE                      1134909 non-null  object 
 2   MAKE                           1134909 non-null  object 
 3   MODEL                          1134909 non-null  object 
 4   VEHICLE_DEFECT                 1134909 non-null  object 
 5   VEHICLE_TYPE                   1134909 non-null  object 
 6   VEHICLE_USE                    1134909 non-null  object 
 7   MANEUVER                       1134909 non-null  object 
 8   OCCUPANT_CNT                   1134909 non-null  float64
 9   CRASH_DATE_y                   1134909 non-null  object 
 10  POSTED_SPEED_LIMIT             1134909 non-null  int64  
 11  TRAFFIC_CONTROL_DEVICE         1134909 non-null  object 
 12  DEVICE_CONDITI

In [49]:
# Draw random subsets of the cleaned table and export them as CSV files.
# Fixed seeds make the exported files identical after Restart & Run All.
# Each draw gets its own seed so the three 20% samples differ from each other.
# They are drawn independently from df1, so they may still share rows.
SEED = 42

# 50% sample of the cleaned data.
df2 = df1.sample(frac=0.5, random_state=SEED)
df2.to_csv('ChicagoCrashes.csv')

# Three independent 20% samples.
Sample1 = df1.sample(frac=0.2, random_state=SEED + 1)
Sample2 = df1.sample(frac=0.2, random_state=SEED + 2)
Sample3 = df1.sample(frac=0.2, random_state=SEED + 3)

Sample1.to_csv('Sample1.csv')
Sample2.to_csv('Sample2.csv')
Sample3.to_csv('Sample3.csv')

In [50]:
# Check the size and columns of the first 20% sample.
Sample1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226982 entries, 1927557 to 1066360
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_DATE_x                   226982 non-null  int64  
 1   UNIT_TYPE                      226982 non-null  object 
 2   MAKE                           226982 non-null  object 
 3   MODEL                          226982 non-null  object 
 4   VEHICLE_DEFECT                 226982 non-null  object 
 5   VEHICLE_TYPE                   226982 non-null  object 
 6   VEHICLE_USE                    226982 non-null  object 
 7   MANEUVER                       226982 non-null  object 
 8   OCCUPANT_CNT                   226982 non-null  float64
 9   CRASH_DATE_y                   226982 non-null  object 
 10  POSTED_SPEED_LIMIT             226982 non-null  int64  
 11  TRAFFIC_CONTROL_DEVICE         226982 non-null  object 
 12  DEVICE_CONDITION       