In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Location of the raw person-level CSV, relative to this notebook
person_path: Path = Path("../Datasets/PERSON.csv")

In [3]:
# Load the raw CSV in a single pass (low_memory=False) so each column's
# dtype is inferred from the whole file rather than chunk by chunk.
person_data = pd.read_csv(
    filepath_or_buffer=person_path,
    low_memory=False,
)

In [4]:
# read_csv already returned a DataFrame, so no re-wrapping is needed.
# Take an explicit copy as the working frame: the cells below insert and
# rename columns in place, and pd.DataFrame(df) does not copy by default,
# so the raw `person_data` would otherwise share data with the frame being
# edited. With a copy, re-running from this cell always starts clean.
person_data_df = person_data.copy()
person_data_df.head(50)

Unnamed: 0,ACCIDENT_NO,PERSON_ID,VEHICLE_ID,SEX,AGE,Age Group,INJ_LEVEL,Inj Level Desc,SEATING_POSITION,HELMET_BELT_WORN,ROAD_USER_TYPE,Road User Type Desc,LICENCE_STATE,PEDEST_MOVEMENT,POSTCODE,TAKEN_HOSPITAL,EJECTED_CODE
0,T20060000010,01,A,F,,unknown,4,Not injured,LF,1,3,Passengers,,0,3130.0,,0
1,T20060000010,02,C,M,43.0,40-49,4,Not injured,LF,1,3,Passengers,,0,7310.0,,0
2,T20060000010,03,C,M,22.0,22-25,4,Not injured,LR,1,3,Passengers,,0,7310.0,,0
3,T20060000010,A,A,M,72.0,70+,4,Not injured,D,1,2,Drivers,V,0,3130.0,,0
4,T20060000010,B,B,F,62.0,60-64,3,Other injury,D,1,2,Drivers,V,0,,N,0
5,T20060000010,C,C,M,39.0,30-39,4,Not injured,D,1,2,Drivers,V,0,3804.0,,0
6,T20060000018,01,B,F,18.0,17-21,4,Not injured,CR,9,3,Passengers,V,0,3175.0,,0
7,T20060000018,02,B,M,18.0,17-21,4,Not injured,RR,1,3,Passengers,,0,,,0
8,T20060000018,A,A,M,30.0,30-39,4,Not injured,D,1,2,Drivers,V,0,3782.0,,0
9,T20060000018,B,B,F,20.0,17-21,3,Other injury,D,1,2,Drivers,V,0,3805.0,N,0


In [5]:
# Inspect the dtype pandas inferred for every column on load
# (the result is displayed as this cell's output).
person_data_df.dtypes

ACCIDENT_NO             object
PERSON_ID               object
VEHICLE_ID              object
SEX                     object
AGE                    float64
Age Group               object
INJ_LEVEL               object
Inj Level Desc          object
SEATING_POSITION        object
HELMET_BELT_WORN        object
ROAD_USER_TYPE          object
Road User Type Desc     object
LICENCE_STATE           object
PEDEST_MOVEMENT         object
POSTCODE               float64
TAKEN_HOSPITAL          object
EJECTED_CODE            object
dtype: object

In [6]:
# Cast 'Age Group' to pandas' dedicated string dtype so the .str accessor
# used afterwards operates on a proper string column.
person_data_df['Age Group'] = person_data_df['Age Group'].astype(pd.StringDtype())

In [7]:
# Repair the mangled age-band label in the 'Age Group' column:
# '5-Dec' -> '5-12' (presumably a spreadsheet date auto-conversion of
# "5-12" -- TODO confirm against the source extract).
# The replacement is written WITHOUT padding so it matches the format of
# every other band ('0-4', '13-15', ...); a leading space would make
# equality filters and joins on '5-12' silently miss these rows.
# regex=False keeps the match literal regardless of the pandas version's
# default for `regex`.
person_data_df['Age Group'] = person_data_df['Age Group'].str.replace(
    '5-Dec', '5-12', regex=False
)

In [8]:
# Distinct age-band labels, in order of first appearance
unique_values = person_data_df['Age Group'].unique()

# Emit one label per line (nothing at all when there are no labels)
if len(unique_values) > 0:
    print(*unique_values, sep='\n')

unknown
40-49
22-25
70+
60-64
30-39
17-21
50-59
26-29
16-17
13-15
5-12
0-4
64-69


In [9]:
# Build 'ACCIDENT_VEHICLE_ID' by concatenating 'ACCIDENT_NO' and 'VEHICLE_ID'.
# This is a single vectorized column operation instead of a Python-level
# iterrows() loop over every row. Rows missing either part come out as
# missing values rather than raising a TypeError mid-loop.
new_column_data = person_data_df['ACCIDENT_NO'] + person_data_df['VEHICLE_ID']

# Place the key at position 3 (0-based), i.e. immediately after 'VEHICLE_ID'.
# DataFrame.insert raises ValueError if the column already exists, so on a
# re-run of this cell just refresh the existing column in place.
if 'ACCIDENT_VEHICLE_ID' in person_data_df.columns:
    person_data_df['ACCIDENT_VEHICLE_ID'] = new_column_data
else:
    person_data_df.insert(3, 'ACCIDENT_VEHICLE_ID', new_column_data)

# Display the resulting DataFrame
person_data_df.head(10)

Unnamed: 0,ACCIDENT_NO,PERSON_ID,VEHICLE_ID,ACCIDENT_VEHICLE_ID,SEX,AGE,Age Group,INJ_LEVEL,Inj Level Desc,SEATING_POSITION,HELMET_BELT_WORN,ROAD_USER_TYPE,Road User Type Desc,LICENCE_STATE,PEDEST_MOVEMENT,POSTCODE,TAKEN_HOSPITAL,EJECTED_CODE
0,T20060000010,01,A,T20060000010A,F,,unknown,4,Not injured,LF,1,3,Passengers,,0,3130.0,,0
1,T20060000010,02,C,T20060000010C,M,43.0,40-49,4,Not injured,LF,1,3,Passengers,,0,7310.0,,0
2,T20060000010,03,C,T20060000010C,M,22.0,22-25,4,Not injured,LR,1,3,Passengers,,0,7310.0,,0
3,T20060000010,A,A,T20060000010A,M,72.0,70+,4,Not injured,D,1,2,Drivers,V,0,3130.0,,0
4,T20060000010,B,B,T20060000010B,F,62.0,60-64,3,Other injury,D,1,2,Drivers,V,0,,N,0
5,T20060000010,C,C,T20060000010C,M,39.0,30-39,4,Not injured,D,1,2,Drivers,V,0,3804.0,,0
6,T20060000018,01,B,T20060000018B,F,18.0,17-21,4,Not injured,CR,9,3,Passengers,V,0,3175.0,,0
7,T20060000018,02,B,T20060000018B,M,18.0,17-21,4,Not injured,RR,1,3,Passengers,,0,,,0
8,T20060000018,A,A,T20060000018A,M,30.0,30-39,4,Not injured,D,1,2,Drivers,V,0,3782.0,,0
9,T20060000018,B,B,T20060000018B,F,20.0,17-21,3,Other injury,D,1,2,Drivers,V,0,3805.0,N,0


In [10]:
# Old -> new header mapping: swap the spaces in the descriptive column
# names for underscores.
new_column_names = dict([
    ('Age Group', 'Age_Group'),
    ('Inj Level Desc', 'Inj_Level_Desc'),
    ('Road User Type Desc', 'Road_User_Type_Desc'),
])

# Apply the mapping to the column axis of the working frame, in place
person_data_df.rename(mapper=new_column_names, axis='columns', inplace=True)

# Show the frame with its renamed headers
person_data_df

Unnamed: 0,ACCIDENT_NO,PERSON_ID,VEHICLE_ID,ACCIDENT_VEHICLE_ID,SEX,AGE,Age_Group,INJ_LEVEL,Inj_Level_Desc,SEATING_POSITION,HELMET_BELT_WORN,ROAD_USER_TYPE,Road_User_Type_Desc,LICENCE_STATE,PEDEST_MOVEMENT,POSTCODE,TAKEN_HOSPITAL,EJECTED_CODE
0,T20060000010,01,A,T20060000010A,F,,unknown,4,Not injured,LF,1,3,Passengers,,0,3130.0,,0
1,T20060000010,02,C,T20060000010C,M,43.0,40-49,4,Not injured,LF,1,3,Passengers,,0,7310.0,,0
2,T20060000010,03,C,T20060000010C,M,22.0,22-25,4,Not injured,LR,1,3,Passengers,,0,7310.0,,0
3,T20060000010,A,A,T20060000010A,M,72.0,70+,4,Not injured,D,1,2,Drivers,V,0,3130.0,,0
4,T20060000010,B,B,T20060000010B,F,62.0,60-64,3,Other injury,D,1,2,Drivers,V,0,,N,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490943,T20200019247,A,A,T20200019247A,M,21.0,17-21,2,Serious injury,D,1,2,Drivers,V,0,3224.0,Y,0
490944,T20200019250,A,A,T20200019250A,M,45.0,40-49,4,Not injured,D,1,2,Drivers,V,0,3072.0,,0
490945,T20200019250,B,B,T20200019250B,M,56.0,50-59,2,Serious injury,D,6,4,Motorcyclists,V,0,3082.0,Y,0
490946,T20200019253,A,A,T20200019253A,M,65.0,64-69,2,Serious injury,D,6,4,Motorcyclists,V,0,3444.0,Y,1


In [11]:
# Write the cleaned frame to the Cleaned_Datasets folder.
cleaned_data_output_path = Path("../Cleaned_Datasets/person_cleaned.csv")

# Create the output folder if needed so a fresh checkout does not fail
# with an OSError on the write below.
cleaned_data_output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the positional row index carries no information, so it is
# not written; the header row is kept.
person_data_df.to_csv(cleaned_data_output_path, index=False, header=True)