In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Path of the raw vehicle CSV to load. It is a relative path, so it is resolved
# against the kernel's current working directory when the file is read.
vehicle_path = Path("../Datasets/VEHICLE.csv")

In [3]:
# Load the raw vehicle table from disk.
# low_memory=False makes pandas parse the file in one pass instead of in
# internal chunks, which avoids mixed-type inference within a column.
vehicle_data = pd.read_csv(
    filepath_or_buffer=vehicle_path,
    low_memory=False,
)

In [4]:
# `read_csv` already returns a DataFrame, so no conversion is needed here.
# Work on an explicit copy instead: later cells modify `vehicle_data_df` in
# place (column insert / rename), and an independent copy guarantees the raw
# `vehicle_data` frame loaded above is never affected by those changes.
vehicle_data_df = vehicle_data.copy()

# Preview the first five rows
vehicle_data_df.head()

Unnamed: 0,ACCIDENT_NO,VEHICLE_ID,VEHICLE_YEAR_MANUF,VEHICLE_DCA_CODE,INITIAL_DIRECTION,ROAD_SURFACE_TYPE,Road Surface Type Desc,REG_STATE,VEHICLE_BODY_STYLE,VEHICLE_MAKE,...,VEHICLE_COLOUR_1,VEHICLE_COLOUR_2,CAUGHT_FIRE,INITIAL_IMPACT,LAMPS,LEVEL_OF_DAMAGE,OWNER_POSTCODE,TOWED_AWAY_FLAG,TRAFFIC_CONTROL,Traffic Control Desc
0,T20060000010,A,1996.0,2,SW,1,Paved,V,SEDAN,MITSUB,...,MRN,ZZ,2,F,2,3,3130.0,1,1,Stop-go lights
1,T20060000010,B,2003.0,1,NW,1,Paved,V,COUPE,UNKN,...,BLU,ZZ,2,2,2,3,3977.0,1,1,Stop-go lights
2,T20060000010,C,2001.0,8,NW,1,Paved,V,SEDAN,FORD,...,YLW,ZZ,2,F,2,2,3804.0,2,1,Stop-go lights
3,T20060000018,A,1998.0,2,S,1,Paved,V,DC UTE,TOYOTA,...,GRY,ZZ,2,9,0,3,3175.0,1,0,No control
4,T20060000018,B,1991.0,1,N,1,Paved,V,SEDAN,SUBARU,...,BLU,ZZ,2,F,0,5,3805.0,1,11,Giveway sign


In [5]:
# Count the distinct accident numbers present in the vehicle table.
# dropna=False keeps a missing value (if any) in the count, matching the
# length of the array returned by `.unique()`.
unique_vehicle_data = vehicle_data["ACCIDENT_NO"].nunique(dropna=False)
unique_vehicle_data

203708

In [6]:
# Inspect the dtype pandas inferred for every column. `.dtypes` returns a
# Series indexed by column name; it is kept in a variable and then displayed
# as the cell's output.
column_data_types = vehicle_data_df.dtypes
column_data_types

ACCIDENT_NO                object
VEHICLE_ID                 object
VEHICLE_YEAR_MANUF        float64
VEHICLE_DCA_CODE           object
INITIAL_DIRECTION          object
ROAD_SURFACE_TYPE           int64
Road Surface Type Desc     object
REG_STATE                  object
VEHICLE_BODY_STYLE         object
VEHICLE_MAKE               object
VEHICLE_MODEL              object
VEHICLE_POWER             float64
VEHICLE_TYPE                int64
Vehicle Type Desc          object
VEHICLE_WEIGHT            float64
CONSTRUCTION_TYPE          object
FUEL_TYPE                  object
NO_OF_WHEELS              float64
NO_OF_CYLINDERS           float64
SEATING_CAPACITY          float64
TARE_WEIGHT               float64
TOTAL_NO_OCCUPANTS        float64
CARRY_CAPACITY            float64
CUBIC_CAPACITY             object
FINAL_DIRECTION            object
DRIVER_INTENT              object
VEHICLE_MOVEMENT           object
TRAILER_TYPE               object
VEHICLE_COLOUR_1           object
VEHICLE_COLOUR

In [7]:
# Flag, for every column, whether it contains at least one missing value.
# The result is a boolean Series indexed by column name (True = has nulls);
# it lists all columns, not only the ones with missing data.
columns_with_missing_values = vehicle_data_df.isnull().any(axis=0)

# Show the per-column missing-value flags
columns_with_missing_values

ACCIDENT_NO               False
VEHICLE_ID                False
VEHICLE_YEAR_MANUF         True
VEHICLE_DCA_CODE          False
INITIAL_DIRECTION         False
ROAD_SURFACE_TYPE         False
Road Surface Type Desc    False
REG_STATE                 False
VEHICLE_BODY_STYLE        False
VEHICLE_MAKE              False
VEHICLE_MODEL             False
VEHICLE_POWER              True
VEHICLE_TYPE              False
Vehicle Type Desc         False
VEHICLE_WEIGHT             True
CONSTRUCTION_TYPE         False
FUEL_TYPE                 False
NO_OF_WHEELS               True
NO_OF_CYLINDERS            True
SEATING_CAPACITY           True
TARE_WEIGHT                True
TOTAL_NO_OCCUPANTS         True
CARRY_CAPACITY             True
CUBIC_CAPACITY             True
FINAL_DIRECTION           False
DRIVER_INTENT             False
VEHICLE_MOVEMENT          False
TRAILER_TYPE              False
VEHICLE_COLOUR_1          False
VEHICLE_COLOUR_2          False
CAUGHT_FIRE               False
INITIAL_

In [8]:
# Build a per-vehicle key, 'ACCIDENT_VEHICLE_ID', by concatenating 'ACCIDENT_NO'
# and 'VEHICLE_ID' (e.g. 'T20060000010' + 'A' -> 'T20060000010A').
# The element-wise `+` on the two columns replaces a row-by-row `iterrows()`
# loop: it yields the same values in one vectorised step instead of iterating
# over every row in Python.
# Note: a missing value in either column would produce NaN here; the
# missing-value check above reports none for these two columns.
new_column_data = vehicle_data_df['ACCIDENT_NO'] + vehicle_data_df['VEHICLE_ID']

# Place the key at positional index 2 (the third column, right after 'VEHICLE_ID').
# `DataFrame.insert` raises ValueError when the column already exists, so when
# this cell is re-run the existing column is overwritten in place instead.
if 'ACCIDENT_VEHICLE_ID' in vehicle_data_df.columns:
    vehicle_data_df['ACCIDENT_VEHICLE_ID'] = new_column_data
else:
    vehicle_data_df.insert(2, 'ACCIDENT_VEHICLE_ID', new_column_data)

# Display the resulting DataFrame
vehicle_data_df.head(10)

Unnamed: 0,ACCIDENT_NO,VEHICLE_ID,ACCIDENT_VEHICLE_ID,VEHICLE_YEAR_MANUF,VEHICLE_DCA_CODE,INITIAL_DIRECTION,ROAD_SURFACE_TYPE,Road Surface Type Desc,REG_STATE,VEHICLE_BODY_STYLE,...,VEHICLE_COLOUR_1,VEHICLE_COLOUR_2,CAUGHT_FIRE,INITIAL_IMPACT,LAMPS,LEVEL_OF_DAMAGE,OWNER_POSTCODE,TOWED_AWAY_FLAG,TRAFFIC_CONTROL,Traffic Control Desc
0,T20060000010,A,T20060000010A,1996.0,2,SW,1,Paved,V,SEDAN,...,MRN,ZZ,2,F,2,3,3130.0,1,1,Stop-go lights
1,T20060000010,B,T20060000010B,2003.0,1,NW,1,Paved,V,COUPE,...,BLU,ZZ,2,2,2,3,3977.0,1,1,Stop-go lights
2,T20060000010,C,T20060000010C,2001.0,8,NW,1,Paved,V,SEDAN,...,YLW,ZZ,2,F,2,2,3804.0,2,1,Stop-go lights
3,T20060000018,A,T20060000018A,1998.0,2,S,1,Paved,V,DC UTE,...,GRY,ZZ,2,9,0,3,3175.0,1,0,No control
4,T20060000018,B,T20060000018B,1991.0,1,N,1,Paved,V,SEDAN,...,BLU,ZZ,2,F,0,5,3805.0,1,11,Giveway sign
5,T20060000022,A,T20060000022A,1995.0,1,E,1,Paved,V,SOLO,...,BLK,ZZ,2,9,2,6,3806.0,2,0,No control
6,T20060000023,A,T20060000023A,1997.0,1,S,1,Paved,V,SEDAN,...,BLU,ZZ,2,F,0,5,3016.0,1,1,Stop-go lights
7,T20060000023,B,T20060000023B,2004.0,2,S,1,Paved,V,T TRK,...,YLW,ZZ,2,R,0,1,3198.0,1,1,Stop-go lights
8,T20060000026,A,T20060000026A,1986.0,1,S,1,Paved,V,VAN,...,WHI,ZZ,2,9,2,3,3940.0,1,0,No control
9,T20060000026,B,T20060000026B,1996.0,2,N,1,Paved,V,S WAG,...,RED,ZZ,2,9,2,3,3011.0,1,0,No control


In [9]:
# Mapping of old -> new labels: the description columns whose names contain
# spaces are switched to underscore-separated names.
# NOTE(review): 'Vehicle Type Desc' also appears in the dtypes listing with
# spaces in its name but is not part of this mapping - confirm whether that
# is intentional before relying on a uniform naming scheme.
new_column_names = {
    'Road Surface Type Desc': 'Road_Surface_Type_Desc',
    'Traffic Control Desc': 'Traffic_Control_Desc',
}

# Apply the mapping to the column axis, modifying the DataFrame in place
vehicle_data_df.rename(new_column_names, axis="columns", inplace=True)

# Show the DataFrame with its updated column labels
vehicle_data_df

Unnamed: 0,ACCIDENT_NO,VEHICLE_ID,ACCIDENT_VEHICLE_ID,VEHICLE_YEAR_MANUF,VEHICLE_DCA_CODE,INITIAL_DIRECTION,ROAD_SURFACE_TYPE,Road_Surface_Type_Desc,REG_STATE,VEHICLE_BODY_STYLE,...,VEHICLE_COLOUR_1,VEHICLE_COLOUR_2,CAUGHT_FIRE,INITIAL_IMPACT,LAMPS,LEVEL_OF_DAMAGE,OWNER_POSTCODE,TOWED_AWAY_FLAG,TRAFFIC_CONTROL,Traffic_Control_Desc
0,T20060000010,A,T20060000010A,1996.0,2,SW,1,Paved,V,SEDAN,...,MRN,ZZ,2,F,2,3,3130.0,1,01,Stop-go lights
1,T20060000010,B,T20060000010B,2003.0,1,NW,1,Paved,V,COUPE,...,BLU,ZZ,2,2,2,3,3977.0,1,01,Stop-go lights
2,T20060000010,C,T20060000010C,2001.0,8,NW,1,Paved,V,SEDAN,...,YLW,ZZ,2,F,2,2,3804.0,2,01,Stop-go lights
3,T20060000018,A,T20060000018A,1998.0,2,S,1,Paved,V,DC UTE,...,GRY,ZZ,2,9,0,3,3175.0,1,00,No control
4,T20060000018,B,T20060000018B,1991.0,1,N,1,Paved,V,SEDAN,...,BLU,ZZ,2,F,0,5,3805.0,1,11,Giveway sign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365237,T20200019247,A,T20200019247A,2006.0,1,S,3,Gravel,V,SEDAN,...,BLU,ZZ,2,F,2,5,3224.0,1,00,No control
365238,T20200019250,A,T20200019250A,2007.0,1,NE,1,Paved,V,SEDAN,...,BLU,ZZ,2,1,2,1,3072.0,2,11,Giveway sign
365239,T20200019250,B,T20200019250B,2007.0,2,NW,1,Paved,V,SOLO,...,BLK,ZZ,2,5,2,1,3082.0,2,00,No control
365240,T20200019253,A,T20200019253A,2007.0,1,SW,1,Paved,V,SOLO,...,BLK,ZZ,2,5,2,1,3444.0,2,00,No control


In [10]:
# Write the cleaned vehicle table to the Cleaned_Datasets folder
cleaned_data_output_path = Path("../Cleaned_Datasets/vehicle_cleaned.csv")

# `to_csv` does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
cleaned_data_output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False (the documented boolean form) leaves the row index out of the
# file; header=True writes the column names as the first line.
vehicle_data_df.to_csv(cleaned_data_output_path, index=False, header=True)