In [1]:
# Import required libraries
import pandas as pd
import numpy as np

In [3]:
# Step 1: Load the Dataset
# The CSV is looked up relative to the notebook's working directory; change
# the file name below if your copy of the data is stored somewhere else.
df = pd.read_csv(filepath_or_buffer='Addis Abbaba City Data set.csv')

In [4]:
# Step 2: Identify Missing Values
# Cells holding the placeholder strings 'na', 'Unknown' or '' are rewritten
# to np.nan so that the null-aware pandas calls below count them as missing.
df = df.replace(to_replace=['na', 'Unknown', ''], value=np.nan)

In [5]:
# Per-column count of null entries, taken before any cleaning step runs
null_counts_before = df.isna().sum()
print("Missing Values Before Handling:\n", null_counts_before)

Missing Values Before Handling:
 Time                             0
Day_of_week                      0
Age_band_of_driver             143
Drivers_gender                   0
Educational_level               14
Vehicle_driver_relation          6
Driving_experience              21
Type_of_vehicle                  0
Owner_of_vehicle                 0
Service_year_of_vehicle        352
Defect_of_vehicle              348
Area_accident_occured           14
Lanes_or_Medians                27
Road_allignment                  6
Types_of_Junction                0
Road_surface_type                0
Road_surface_conditions          0
Light_conditions                 0
Weather_conditions              17
Type_of_collision               14
Number_of_vehicles_involved      0
Number_of_casualties             0
Vehicle_movement                18
Casualty_class                 384
Casualty_gender                384
Age_band_of_casualty           384
Work_of_casuality                4
Fitness_of_casuality  

In [6]:
# Share of null entries in each column, expressed as a percentage of all rows
row_count = len(df)
missing_percent = df.isna().sum().div(row_count).mul(100)
print("\nPercentage of Missing Values:\n", missing_percent)


Percentage of Missing Values:
 Time                            0.000000
Day_of_week                     0.000000
Age_band_of_driver             13.684211
Drivers_gender                  0.000000
Educational_level               1.339713
Vehicle_driver_relation         0.574163
Driving_experience              2.009569
Type_of_vehicle                 0.000000
Owner_of_vehicle                0.000000
Service_year_of_vehicle        33.684211
Defect_of_vehicle              33.301435
Area_accident_occured           1.339713
Lanes_or_Medians                2.583732
Road_allignment                 0.574163
Types_of_Junction               0.000000
Road_surface_type               0.000000
Road_surface_conditions         0.000000
Light_conditions                0.000000
Weather_conditions              1.626794
Type_of_collision               1.339713
Number_of_vehicles_involved     0.000000
Number_of_casualties            0.000000
Vehicle_movement                1.722488
Casualty_class           

In [7]:
# Step 3: Drop Columns with Excessive Missing Values (e.g., >50%)
# dropna(thresh=k, axis=1) keeps a column only when it holds at least k
# non-null entries, so k = 50% of the row count removes every column that is
# more than half empty.
threshold = 0.5 * len(df)  # 50% threshold
# Snapshot the labels BEFORE the drop: once df is reassigned, comparing
# df.columns with itself can only ever yield an empty list.
columns_before = df.columns
df = df.dropna(thresh=threshold, axis=1)  # Drop columns with >50% missing
dropped_cols = [col for col in columns_before if col not in df.columns]
print("\nColumns Dropped Due to >50% Missing Values:", dropped_cols)


Columns Dropped Due to >50% Missing Values: []


In [8]:
# Step 4: Handle Remaining Missing Values
# Partition the column labels by dtype: int64/float64 columns are handled as
# numerical, object-dtype columns as categorical.
numeric_dtypes = ['int64', 'float64']
num_cols = df.select_dtypes(include=numeric_dtypes).columns
cat_cols = df.select_dtypes(include='object').columns

In [9]:
# Impute numerical columns with median (robust to outliers)
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy='median')
# Guard the empty selection: fit_transform rejects a frame with zero columns.
if len(num_cols) > 0:
    # SimpleImputer hands back a float64 array, which would silently turn
    # integer columns into floats. Remember the original dtypes and restore
    # them afterwards; an int64 column cannot contain NaN, so its values are
    # untouched by the imputation and the cast back is lossless.
    num_dtypes = df[num_cols].dtypes.to_dict()
    df[num_cols] = num_imputer.fit_transform(df[num_cols])
    df = df.astype(num_dtypes)

In [10]:
# Impute categorical columns with mode (most frequent value)
# The target column is left out on purpose: filling a missing label with the
# most frequent class would fabricate labels, and it would also leave nothing
# for the later "drop rows with a missing Accident_severity" step to remove.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_feature_cols = [col for col in cat_cols if col != 'Accident_severity']
if cat_feature_cols:  # nothing to fit when no categorical features exist
    df[cat_feature_cols] = cat_imputer.fit_transform(df[cat_feature_cols])

In [11]:
# Optionally, handle missing target variable (Accident_severity) by dropping rows
# Only do this if missingness is minimal and target is critical.
# This only removes rows when the target still contains nulls at this point,
# i.e. when no earlier imputation step has already filled them in.
target_col = 'Accident_severity'
if target_col in df.columns:
    has_target = df[target_col].notna()
    df = df.loc[has_target]

In [12]:
# Verify No Missing Values Remain
# Every per-column count printed here is expected to be zero.
null_counts_after = df.isna().sum()
print("\nMissing Values After Handling:\n", null_counts_after)


Missing Values After Handling:
 Time                           0
Day_of_week                    0
Age_band_of_driver             0
Drivers_gender                 0
Educational_level              0
Vehicle_driver_relation        0
Driving_experience             0
Type_of_vehicle                0
Owner_of_vehicle               0
Service_year_of_vehicle        0
Defect_of_vehicle              0
Area_accident_occured          0
Lanes_or_Medians               0
Road_allignment                0
Types_of_Junction              0
Road_surface_type              0
Road_surface_conditions        0
Light_conditions               0
Weather_conditions             0
Type_of_collision              0
Number_of_vehicles_involved    0
Number_of_casualties           0
Vehicle_movement               0
Casualty_class                 0
Casualty_gender                0
Age_band_of_casualty           0
Work_of_casuality              0
Fitness_of_casuality           0
Pedestrian_movement            0
Cause_of_a

In [13]:
# Inspect Data After Handling
# A preview of the first rows plus descriptive statistics of the cleaned frame.
preview_rows = df.head()
summary_stats = df.describe()
print("\nFirst 5 Rows After Handling Missing Data:\n", preview_rows)
print("\nSummary Stats After Handling:\n", summary_stats)


First 5 Rows After Handling Missing Data:
        Time Day_of_week Age_band_of_driver Drivers_gender   Educational_level  \
0  17:20:00      Friday              18-30           Male  Junior high school   
1  17:45:00    Thursday              18-30           Male  Junior high school   
2  17:45:00    Thursday              31-50           Male  Junior high school   
3  18:36:00   Wednesday              18-30           Male  Junior high school   
4  14:35:00      Friday              18-30           Male   Above high school   

  Vehicle_driver_relation Driving_experience      Type_of_vehicle  \
0                Employee         Above 10yr      Lorry (41?100Q)   
1                Employee              1-2yr  Public (> 45 seats)   
2                Employee             5-10yr      Lorry (41?100Q)   
3                Employee              2-5yr                 Taxi   
4                Employee              2-5yr           Automobile   

  Owner_of_vehicle Service_year_of_vehicle  ... Number

In [None]:
--------------Member 02 - Encoding Categorical Variables-----------------