In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

df = pd.read_csv('Applied Machine Learning DS - Cleaned DS.csv')

In [2]:
#Some rows in the ds are; ,,,,,,,,,,, -With no data, remove them
empty_rows = df[df.isna().all(axis=1)]
print(empty_rows)

df = df.dropna(how='all')
#df.iloc[150:171]
df.head()

     id source_name title author_name publish_date accident_date content tags  \
156 NaN         NaN   NaN         NaN          NaN           NaN     NaN  NaN   
158 NaN         NaN   NaN         NaN          NaN           NaN     NaN  NaN   

    day_of_the_week time  ...  senior_driver_involved  child_involved  \
156             NaN  NaN  ...                     NaN             NaN   
158             NaN  NaN  ...                     NaN             NaN   

    senior_involved visibility wind_speed temperature  severity  \
156             NaN        NaN        NaN         NaN       NaN   
158             NaN        NaN        NaN         NaN       NaN   

    ambulance_called  additional_source_name  additional_url  
156              NaN                     NaN             NaN  
158              NaN                     NaN             NaN  

[2 rows x 35 columns]


Unnamed: 0,id,source_name,title,author_name,publish_date,accident_date,content,tags,day_of_the_week,time,...,senior_driver_involved,child_involved,senior_involved,visibility,wind_speed,temperature,severity,ambulance_called,additional_source_name,additional_url
0,1.0,police,Collision between a car and a motorbike in Żur...,,09/10/2025,09/10/2025,"Today, at around 0930hrs, the Police were info...",{},Thursday,morning,...,1.0,0.0,1.0,≥ 10.00,3.7,23.0,high,1.0,Weatherspark,https://weatherspark.com
1,2.0,police,Car-motorcycle traffic accident,,19/06/2025,18/06/2025,"Yesterday, at around 1830hrs, the Police were ...",{},Wednesday,evening,...,1.0,0.0,1.0,10,11.1,26.0,high,1.0,,
2,3.0,police,Car-motorcycle collision in Ħal Qormi,,12/05/2025,12/05/2025,"Today, at around 0800hrs, the Police were info...",{},Monday,morning,...,0.0,0.0,0.0,10,27.8,20.0,high,1.0,,
3,4.0,police,Collision between motorcycle and car in Għaxaq,,30/07/2025,29/07/2025,"Yesterday, at around 1800hrs, the Police were ...",{},Tuesday,evening,...,0.0,0.0,0.0,10,38.9,26.0,high,1.0,,
4,5.0,police,Car-motorcycle collision,,07/04/2025,06/04/2025,"Yesterday, at around quarter to nine in the ev...",{},Sunday,evening,...,0.0,0.0,0.0,10,18.5,15.0,high,1.0,,


In [3]:
#Fill in where rcords where some data is empty with the mean mode and median of the ds - Imputation

#Day of the Week
df['day_of_the_week'] = df['day_of_the_week'].fillna(df['day_of_the_week'].mode()[0])

#df[df['day_of_the_week'].isna()]['day_of_the_week'] #Show when fields are empty

#Time
df['time'] = df['time'].fillna(df['time'].mode()[0])
df['time'] = df['time'].str.lower()
#df[df['time'].isna()]['time']


#Time in Hours
df['time_in_hours'] = (
    (df['time_in_hours']/10)
    .astype(str)
    .str.replace(r'[^0-9]', '', regex=True)
    .str.zfill(4)
    .str[:4]
)
df['time_in_hours'].replace('', np.nan, inplace=True)
df['time_in_hours'] = pd.to_numeric(df['time_in_hours'], errors='coerce')
df['time_in_hours'] = df['time_in_hours'].fillna(df['time_in_hours'].median())
#df[df['time_in_hours'].isna()]['time_in_hours'] 

#Rush Hour
#Check the time in hours for updated records and set the rush hour to be 1 if the median set is between 0600-0800 or 1600-1600
df["rush_hour"] = (
    ((df["time_in_hours"] >= 600) & (df["time_in_hours"] <= 800)) |
    ((df["time_in_hours"] >= 1600) & (df["time_in_hours"] <= 1800))
).astype(int)
#df[df['rush_hour'].isna()]['rush_hour'] 

#Weekend Weekday
weekday_days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
weekend_days = ["Saturday", "Sunday"]

df["weekend_weekday"] = df["weekend_weekday"].replace("", pd.NA)

# Fill missing values based on day_of_the_week
df.loc[df["weekend_weekday"].isna(), "weekend_weekday"] = df["day_of_the_week"].apply(
    lambda x: "weekday" if x in ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
    else "weekend"
)
df[df['weekend_weekday'].isna()]['weekend_weekday'] 

#visibility
df["visibility"] = (
    df["visibility"]
    .astype(str)
    .str.replace("≥", "", regex=False)
    .str.strip()
    .replace("", pd.NA)
    .astype(float)
)

df["visibility"] = df["visibility"].astype(float)
median_visibility = df["visibility"].median()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_in_hours'].replace('', np.nan, inplace=True)


In [4]:
# Normalising road types
df['road_type'].value_counts()

road_type
bypass                100
main road              99
residential street     34
residential            15
roundabout             13
junction                4
motorway                1
Name: count, dtype: int64

In [5]:
df['road_type'] = df['road_type'].apply(lambda x: x if x != 'residential street' else 'residential')

In [6]:
df['time'].unique()

array(['morning', 'evening', 'afternoon', 'night'], dtype=object)

In [7]:
df['locality'].value_counts()

locality
Naxxar           20
Marsa            15
St Paul's Bay    10
Msida             9
Rabat             8
                 ..
Gharb             1
Bugibba           1
Xemxija           1
Marfa             1
Għajnsielem       1
Name: count, Length: 92, dtype: int64

In [8]:
# Cleaning any Maltese characters from locality and road_name fields

# Maltese → ASCII map
MALTESE_MAP = str.maketrans({
    "ċ": "c", "Ċ": "C",
    "ġ": "g", "Ġ": "G",
    "ħ": "h", "Ħ": "H",
    "ż": "z", "Ż": "Z",
    "à": "a", "’": "'"
})

def clean_maltese_text(text: str) -> str:
    if isinstance(text, str):
      cleaned = text.translate(MALTESE_MAP)
      return cleaned.strip()
    return str(text).strip()

df['road_name'] = df['road_name'].apply(clean_maltese_text)
df['locality'] = df['locality'].apply(clean_maltese_text)

In [9]:
road_names_count = df['locality'].value_counts()

list(road_names_count.index)

['Naxxar',
 'Marsa',
 "St Paul's Bay",
 'Msida',
 'Zejtun',
 'Fgura',
 'Zebbug',
 'Rabat',
 'Floriana',
 'Paola',
 'Birkirkara',
 'Mellieha',
 'Luqa',
 'Attard',
 'Sliema',
 'Gzira',
 'Gudja',
 'Siggiewi',
 'nan',
 'Mosta',
 'Birzebbuga',
 'Xewkija',
 'Zabbar',
 'St Julians',
 'Marsascala',
 'Mgarr',
 'Hamrun',
 'Victoria',
 'Pembroke',
 'Ghajnsielem',
 'San Gwann',
 'Marsaxlokk',
 'Zurrieq',
 'Qormi',
 'Tarxien',
 'Mtarfa',
 'Pieta',
 'Cospicua',
 'Santa Venera',
 'St Pauls',
 "St Julian's",
 'Iklin',
 'Lija',
 'Bugibba',
 'St Pauls Bay',
 'Gharb',
 'Hal Ghaxaq',
 'Xaghra',
 'Bahar ic-Caghaq',
 'Gwardamanga',
 'Zebbug (Gozo)',
 'Catania, Sicily',
 'Qala, Gozo',
 'Victoria, Gozo',
 'Bidnija',
 'Zebbug, Gozo',
 'Rabat, Gozo',
 'Mriehel',
 'Il-Gudja',
 'Qala',
 'Poala',
 'Hal-Far',
 'Senglea',
 'Marfa',
 'Xemxija',
 'Xbiex',
 'Rahal Gdid',
 'Imqabba',
 'mellieha',
 'Balzan',
 'Nadur',
 'Il-Marsa',
 'Kirkop',
 'Bunica, Croatia']

In [10]:
# Treat empty strings as missing values
df = df.replace("", pd.NA)

# Column groups
categorical_cols = [
    "day_of_the_week",
    "locality",
    "road_name",
    "road_type",
    "vehicle_involved",
    "severity"
]

binary_cols = [
    "young_driver_involved",
    "adult_driver_involved",
    "senior_driver_involved",
    "child_involved",
    "senior_involved",
    "ambulance_called"
]

numeric_cols_mean = [
    "wind_speed",
    "temperature"
]

numeric_cols_median = [
    "vehicle_count",
    "male_count",
    "female_count",
    "total_driver_count",
    "min_age",
    "max_age"
]

# Impute categorical values
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Impute binary columns with mode
for col in binary_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Impute numeric (mean)
for col in numeric_cols_mean:
    df[col] = df[col].astype(float)
    df[col] = df[col].fillna(df[col].mean())

# Impute numeric (median)
for col in numeric_cols_median:
    df[col] = df[col].astype(float)
    df[col] = df[col].fillna(df[col].median())

In [11]:
#Checking final sum of empty records
df.isna().sum()

id                          0
source_name                 0
title                       0
author_name               114
publish_date                0
accident_date               6
content                     0
tags                        1
day_of_the_week             0
time                        0
time_in_hours               0
rush_hour                   0
weekend_weekday             0
locality                    0
road_name                   0
road_type                   0
vehicle_count               0
vehicle_involved            0
male_count                  0
female_count                0
total_driver_count          0
min_age                     0
max_age                     0
young_driver_involved       0
adult_driver_involved       0
senior_driver_involved      0
child_involved              0
senior_involved             0
visibility                 17
wind_speed                  0
temperature                 0
severity                    0
ambulance_called            0
additional

In [12]:
df.head()

Unnamed: 0,id,source_name,title,author_name,publish_date,accident_date,content,tags,day_of_the_week,time,...,senior_driver_involved,child_involved,senior_involved,visibility,wind_speed,temperature,severity,ambulance_called,additional_source_name,additional_url
0,1.0,police,Collision between a car and a motorbike in Żur...,,09/10/2025,09/10/2025,"Today, at around 0930hrs, the Police were info...",{},Thursday,morning,...,1.0,0.0,1.0,10.0,3.7,23.0,high,1.0,Weatherspark,https://weatherspark.com
1,2.0,police,Car-motorcycle traffic accident,,19/06/2025,18/06/2025,"Yesterday, at around 1830hrs, the Police were ...",{},Wednesday,evening,...,1.0,0.0,1.0,10.0,11.1,26.0,high,1.0,,
2,3.0,police,Car-motorcycle collision in Ħal Qormi,,12/05/2025,12/05/2025,"Today, at around 0800hrs, the Police were info...",{},Monday,morning,...,0.0,0.0,0.0,10.0,27.8,20.0,high,1.0,,
3,4.0,police,Collision between motorcycle and car in Għaxaq,,30/07/2025,29/07/2025,"Yesterday, at around 1800hrs, the Police were ...",{},Tuesday,evening,...,0.0,0.0,0.0,10.0,38.9,26.0,high,1.0,,
4,5.0,police,Car-motorcycle collision,,07/04/2025,06/04/2025,"Yesterday, at around quarter to nine in the ev...",{},Sunday,evening,...,0.0,0.0,0.0,10.0,18.5,15.0,high,1.0,,


In [13]:
numeric_cols = [
    "vehicle_count",
    "male_count",
    "female_count",
    "total_driver_count",
    "min_age",
    "max_age",
    "wind_speed",
    "temperature"
]

updated_counts = {}

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
    before = df[(df[col] < lower) | (df[col] > upper)][col].count()
    
    # Apply the capping
    df[col] = np.where(df[col] < lower, lower,
                np.where(df[col] > upper, upper, df[col]))
    
    updated_counts[col] = before

updated_counts

{'vehicle_count': 5,
 'male_count': 8,
 'female_count': 9,
 'total_driver_count': 2,
 'min_age': 7,
 'max_age': 0,
 'wind_speed': 22,
 'temperature': 0}

In [14]:
output_file = "cleaned_dataset.csv"

# If file exists → remove it
if os.path.exists(output_file):
    os.remove(output_file)
df.to_csv("cleaned_dataset.csv", index=False)