In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Location of the raw accident CSV, relative to the notebook's working directory
accident_path = Path("..", "Datasets", "ACCIDENT.csv")

In [3]:
# Load the raw accident records from disk.
# low_memory=False is passed through to pandas unchanged.
accident_data = pd.read_csv(
    filepath_or_buffer=accident_path,
    low_memory=False,
)

In [4]:
# Wrap the loaded records in a DataFrame under the name the later cells work on,
# then preview the first five rows.
accident_data_df = pd.DataFrame(data=accident_data)
accident_data_df.head(n=5)

Unnamed: 0,ACCIDENT_NO,ACCIDENTDATE,ACCIDENTTIME,ACCIDENT_TYPE,Accident Type Desc,DAY_OF_WEEK,Day Week Description,DCA_CODE,DCA Description,DIRECTORY,...,NO_PERSONS,NO_PERSONS_INJ_2,NO_PERSONS_INJ_3,NO_PERSONS_KILLED,NO_PERSONS_NOT_INJ,POLICE_ATTEND,ROAD_GEOMETRY,Road Geometry Desc,SEVERITY,SPEED_ZONE
0,T20060000010,13/01/2006,12:42:00,1,Collision with vehicle,6,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),MEL,...,6,0,1,0,5,1,1,Cross intersection,3,60
1,T20060000018,13/01/2006,19:10:00,1,Collision with vehicle,6,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),MEL,...,4,0,1,0,3,1,2,T intersection,3,70
2,T20060000022,14/01/2006,12:10:00,7,Fall from or in moving vehicle,7,Saturday,190,FELL IN/FROM VEHICLE,MEL,...,2,1,0,0,1,1,5,Not at intersection,2,100
3,T20060000023,14/01/2006,11:49:00,1,Collision with vehicle,7,Saturday,130,REAR END(VEHICLES IN SAME LANE),MEL,...,2,1,0,0,1,1,2,T intersection,2,80
4,T20060000026,14/01/2006,10:45:00,1,Collision with vehicle,7,Saturday,121,RIGHT THROUGH,MEL,...,3,0,3,0,0,1,5,Not at intersection,3,50


In [5]:
# Count how many distinct ACCIDENT_NO values the loaded accident table contains
accident_numbers = accident_data["ACCIDENT_NO"]
unique_accident_data = accident_numbers.unique().size
unique_accident_data

203708

In [6]:
# Inspect the dtype pandas assigned to each column of the freshly loaded frame
# (the bare last expression makes the notebook display the resulting Series)
column_data_types = accident_data_df.dtypes
column_data_types

ACCIDENT_NO             object
ACCIDENTDATE            object
ACCIDENTTIME            object
ACCIDENT_TYPE            int64
Accident Type Desc      object
DAY_OF_WEEK              int64
Day Week Description    object
DCA_CODE                 int64
DCA Description         object
DIRECTORY               object
EDITION                 object
PAGE                    object
GRID_REFERENCE_X        object
GRID_REFERENCE_Y        object
LIGHT_CONDITION          int64
Light Condition Desc    object
NODE_ID                  int64
NO_OF_VEHICLES           int64
NO_PERSONS               int64
NO_PERSONS_INJ_2         int64
NO_PERSONS_INJ_3         int64
NO_PERSONS_KILLED        int64
NO_PERSONS_NOT_INJ       int64
POLICE_ATTEND            int64
ROAD_GEOMETRY            int64
Road Geometry Desc      object
SEVERITY                 int64
SPEED_ZONE               int64
dtype: object

In [7]:
# Build a per-column boolean flag: True where the column holds at least one null.
# The result has one entry for every column, so it is also usable as a mask.
columns_with_missing_values = (
    accident_data_df
    .isnull()
    .any(axis=0)
)

# Print the whole flag Series (every column with its True/False value)
print(columns_with_missing_values)

ACCIDENT_NO             False
ACCIDENTDATE            False
ACCIDENTTIME            False
ACCIDENT_TYPE           False
Accident Type Desc      False
DAY_OF_WEEK             False
Day Week Description    False
DCA_CODE                False
DCA Description         False
DIRECTORY                True
EDITION                  True
PAGE                     True
GRID_REFERENCE_X         True
GRID_REFERENCE_Y         True
LIGHT_CONDITION         False
Light Condition Desc    False
NODE_ID                 False
NO_OF_VEHICLES          False
NO_PERSONS              False
NO_PERSONS_INJ_2        False
NO_PERSONS_INJ_3        False
NO_PERSONS_KILLED       False
NO_PERSONS_NOT_INJ      False
POLICE_ATTEND           False
ROAD_GEOMETRY           False
Road Geometry Desc      False
SEVERITY                False
SPEED_ZONE              False
dtype: bool


In [8]:
# Drop every column that the missing-value mask flagged as True.
#
# DataFrame.drop(..., inplace=True) returns None, so assigning its result left
# df_dropped bound to None and the cell displayed nothing. Drop out-of-place
# and rebind instead, so the result is a real DataFrame.
columns_missing = columns_with_missing_values[columns_with_missing_values].index

# errors="ignore" keeps this cell re-runnable: on a second run the flagged
# columns are already gone and a plain drop would raise KeyError.
accident_data_df = accident_data_df.drop(columns=columns_missing, errors="ignore")

# Later cells keep working on accident_data_df; df_dropped refers to the same
# cleaned frame so either name can be used.
df_dropped = accident_data_df
df_dropped.head()

In [9]:
# Re-check the column dtypes after the columns with missing values were dropped,
# to confirm which columns remain in accident_data_df
column_data_types = accident_data_df.dtypes
column_data_types

ACCIDENT_NO             object
ACCIDENTDATE            object
ACCIDENTTIME            object
ACCIDENT_TYPE            int64
Accident Type Desc      object
DAY_OF_WEEK              int64
Day Week Description    object
DCA_CODE                 int64
DCA Description         object
LIGHT_CONDITION          int64
Light Condition Desc    object
NODE_ID                  int64
NO_OF_VEHICLES           int64
NO_PERSONS               int64
NO_PERSONS_INJ_2         int64
NO_PERSONS_INJ_3         int64
NO_PERSONS_KILLED        int64
NO_PERSONS_NOT_INJ       int64
POLICE_ATTEND            int64
ROAD_GEOMETRY            int64
Road Geometry Desc      object
SEVERITY                 int64
SPEED_ZONE               int64
dtype: object

In [10]:
# NODE_ID is removed here because this table is merged with the Node table later
columns_to_drop = ["NODE_ID"]
accident_data_df = accident_data_df.drop(labels=columns_to_drop, axis="columns")

# Confirm the remaining columns and their dtypes
column_data_types = accident_data_df.dtypes
column_data_types

ACCIDENT_NO             object
ACCIDENTDATE            object
ACCIDENTTIME            object
ACCIDENT_TYPE            int64
Accident Type Desc      object
DAY_OF_WEEK              int64
Day Week Description    object
DCA_CODE                 int64
DCA Description         object
LIGHT_CONDITION          int64
Light Condition Desc    object
NO_OF_VEHICLES           int64
NO_PERSONS               int64
NO_PERSONS_INJ_2         int64
NO_PERSONS_INJ_3         int64
NO_PERSONS_KILLED        int64
NO_PERSONS_NOT_INJ       int64
POLICE_ATTEND            int64
ROAD_GEOMETRY            int64
Road Geometry Desc      object
SEVERITY                 int64
SPEED_ZONE               int64
dtype: object

In [11]:
# Write the cleaned accident table to the Cleaned_Datasets folder
cleaned_data_output_path = Path("../Cleaned_Datasets/accident_cleaned.csv")

# Create the output folder first so the export does not fail when it is missing
cleaned_data_output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False is the explicit boolean form of the original index=None:
# the row index is not written to the file, the header row is.
accident_data_df.to_csv(cleaned_data_output_path, index=False, header=True)