In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Relative location of the raw road surface condition CSV
surface_path = Path("../Datasets/ROAD_SURFACE_COND.csv")

In [3]:
# Load the raw CSV; low_memory=False makes pandas parse the file in one pass
# rather than in chunks, which avoids mixed-type inference within a column
surface_data = pd.read_csv(
    surface_path,
    low_memory=False,
)

In [4]:
# pd.read_csv already returns a DataFrame, so bind the working name to it
# directly instead of re-wrapping it with pd.DataFrame(...).
# NOTE: both names now refer to the same object; the cells in this notebook
# only read from it.
surface_data_df = surface_data

# Preview the first rows
surface_data_df.head()

Unnamed: 0,ACCIDENT_NO,SURFACE_COND,Surface Cond Desc,SURFACE_COND_SEQ
0,T20060000010,1,Dry,1
1,T20060000018,1,Dry,1
2,T20060000022,1,Dry,1
3,T20060000023,1,Dry,1
4,T20060000026,1,Dry,1


In [5]:
# Count the distinct accident numbers in the road surface condition table.
# dropna=False keeps the result identical to len(.unique()), which would also
# count a missing value as one distinct entry.
unique_surface_data = surface_data_df["ACCIDENT_NO"].nunique(dropna=False)
unique_surface_data

203708

In [6]:
# Inspect the dtype pandas assigned to each column
column_data_types = surface_data_df.dtypes
column_data_types

ACCIDENT_NO          object
SURFACE_COND          int64
Surface Cond Desc    object
SURFACE_COND_SEQ      int64
dtype: object

In [7]:
# Per-column flag: True when the column holds at least one missing value
columns_with_missing_values = surface_data_df.isna().any(axis=0)

# Show the flag for every column
columns_with_missing_values

ACCIDENT_NO          False
SURFACE_COND         False
Surface Cond Desc    False
SURFACE_COND_SEQ     False
dtype: bool

In [8]:
# Collapse to one row per ACCIDENT_NO: all 'Surface Cond Desc' values recorded
# for an accident are joined, in row order, into one comma-separated string.
# The rename is part of the same chain (no inplace=True), so the frame is
# built in a single expression and never mutated after creation.
grouped_surface_data = (
    surface_data_df
    .groupby('ACCIDENT_NO')['Surface Cond Desc']
    .agg(', '.join)
    .reset_index()
    .rename(columns={'Surface Cond Desc': 'Surface_Cond_Desc'})
)

# Display the resulting DataFrame
grouped_surface_data

Unnamed: 0,ACCIDENT_NO,Surface_Cond_Desc
0,T20060000010,Dry
1,T20060000018,Dry
2,T20060000022,Dry
3,T20060000023,Dry
4,T20060000026,Dry
...,...,...
203703,T20200019239,Dry
203704,T20200019247,Dry
203705,T20200019250,Dry
203706,T20200019253,Dry


In [9]:
# Export the grouped data to the Cleaned_Datasets folder
cleaned_data_output_path = Path("../Cleaned_Datasets/road_surface_cond_cleaned.csv")

# to_csv does not create missing directories, so make sure the target exists
cleaned_data_output_path.parent.mkdir(parents=True, exist_ok=True)

# index expects a bool: False leaves the positional row index out of the file
grouped_surface_data.to_csv(cleaned_data_output_path, index=False, header=True)