# Import Dependencies

In [52]:
import pandas as pd

# Import Data

In [53]:
# Load raw data.
# Forward slashes keep the relative path valid on Windows, macOS and Linux alike
# (a backslash is only treated as a path separator on Windows).
path = "../dataset/WHO Mortality Database - Overview of the distribution of causes of total deaths grouped by category - Malaysia - 30th May 2025 16_35.csv"
# Skip the 8 leading lines of header metadata and keep the first 10 columns only
df = pd.read_csv(path, skiprows=8, usecols=range(10))
# Check data columns
df.columns

Index(['Indicator Code', 'Indicator Name', 'Year', 'Sex', 'Age group code',
       'Age Group', 'Number',
       'Percentage of cause-specific deaths out of total deaths',
       'Age-standardized death rate per 100 000 standard population',
       'Death rate per 100 000 population'],
      dtype='object')

In [54]:
# Rename the verbose columns to shorter labels.
# rename() returns a new frame; rebinding `df` replaces the inplace=True mutation
# so the step reads as an explicit "old frame -> new frame" transformation.
df = df.rename(columns={
    "Percentage of cause-specific deaths out of total deaths": "Percent of All Causes",
    "Age-standardized death rate per 100 000 standard population": "Age-standardized Death Rate",
    "Death rate per 100 000 population": "Death Rate"
})

# Check renamed columns
df.columns

Index(['Indicator Code', 'Indicator Name', 'Year', 'Sex', 'Age group code',
       'Age Group', 'Number', 'Percent of All Causes',
       'Age-standardized Death Rate', 'Death Rate'],
      dtype='object')

# Filter Redundant Columns

In [55]:
# Remove redundant columns.
# The labels go through the explicit `columns=` keyword (no positional list + axis=1),
# and the result is rebound instead of mutating the frame in place. No throwaway
# module-level `columns` variable is left behind for later cells to trip over.
df = df.drop(columns=['Indicator Code', 'Age group code', 'Age-standardized Death Rate'])
# Preview the frame after the removal
df.head()

Unnamed: 0,Indicator Name,Year,Sex,Age Group,Number,Percent of All Causes,Death Rate
0,All Causes,2020,All,[All],109155.0,100.0,337.252065
1,All Causes,2020,All,[0],2367.0,100.0,444.18506
2,All Causes,2020,All,[1-4],410.0,100.0,19.503136
3,All Causes,2020,All,[5-9],294.0,100.0,11.722119
4,All Causes,2020,All,[10-14],402.0,100.0,16.434815


# Check for Missing Data

In [56]:
# Count the missing entries in every column
df.isna().sum()

Indicator Name               0
Year                         0
Sex                          0
Age Group                    0
Number                       0
Percent of All Causes     7725
Death Rate               11700
dtype: int64

In [57]:
# Show every row that has at least one missing value
pd.set_option("display.max_colwidth", None)  # do not truncate long text cells when displaying
rows_with_nan = df.isna().any(axis=1)
df.loc[rows_with_nan]

Unnamed: 0,Indicator Name,Year,Sex,Age Group,Number,Percent of All Causes,Death Rate
20,All Causes,2020,All,[Unknown],0.000000,100.000000,
41,"Communicable, maternal, perinatal and nutritional conditions",2020,All,[Unknown],0.000000,,
62,Infectious and parasitic diseases,2020,All,[Unknown],0.000000,,
83,Tuberculosis,2020,All,[Unknown],0.000000,,
104,STDs excluding HIV,2020,All,[Unknown],0.000000,,
...,...,...,...,...,...,...,...
245615,Ill-defined injuries/accidents,2000,Female,[Unknown],2.000000,2.985075,
245636,COVID-19,2000,Female,[Unknown],0.000000,0.000000,
245637,Usability,2014,All,[All],39.656700,,
245638,Percentage of ill-defined or non-specific causes to total deaths,2014,All,[All],21.371421,,


In [58]:
# Indicator names of the footer metadata rows
footer_indicators = [
    "Usability",
    "Percentage of ill-defined or non-specific causes to total deaths",
    "Completeness"
]

# Remove unknown age group rows (the ones with missing data) and the footer metadata rows
is_unknown_age = df["Age Group"] == "[Unknown]"
is_footer_row = df["Indicator Name"].isin(footer_indicators)
df = df[~is_unknown_age & ~is_footer_row]

In [59]:
# Confirm whether any missing value is left anywhere in the frame
df.isna().values.any()

np.False_

# Check for Duplicate Data

In [60]:
# Count fully duplicated rows (every repeat after the first occurrence is flagged)
df.duplicated(keep="first").sum()

np.int64(0)

# Check Data Consistency

In [61]:
columns = ["Indicator Name", "Year", "Sex", "Age Group"]

# Print the distinct values of each listed column so inconsistent labels stand out
for name in columns:
    distinct = df[name].unique()
    print(
        f"\nColumn: {name}",
        f"Number of unique values: {len(distinct)}",
        "Unique values:",
        distinct,
        sep="\n",
    )


Column: Indicator Name
Number of unique values: 190
Unique values:
['All Causes'
 'Communicable, maternal, perinatal and nutritional conditions'
 'Infectious and parasitic diseases' 'Tuberculosis' 'STDs excluding HIV'
 'Syphilis' 'Chlamydia' 'Gonorrhoea' 'Other STDs' 'HIV/AIDS'
 'Diarrhoeal diseases' 'Hookworm disease' 'Intestinal nematode infections'
 'Trachoma' 'Japanese encephalitis' 'Dengue' 'Leprosy' 'Onchocerciasis'
 'lymphatic filariasis' 'Leishmaniasis' 'Schistosomiasis' 'Chagas disease'
 'Trypanosomiasis' 'Tropical-cluster diseases' 'Malaria' 'Hepatitis C'
 'Hepatitis B' 'Meningitis' 'Tetanus' 'Measles' 'Diphtheria'
 'Poliomyelitis' 'Pertussis' 'Childhood-cluster diseases'
 'Other intestinal infections' 'Trichuriasis' 'Ascariasis'
 'Other infectious diseases' 'Respiratory infections'
 'Lower respiratory infections' 'Upper respiratory infections'
 'Otitis media' 'Maternal conditions' 'Maternal haemorrhage'
 'Maternal sepsis' 'Hypertensive disorders' 'Obstructed labour'
 'Pregn

# Feature Engineering

In [62]:
"""
Age group to category mapping:
- Infant: [0]
- Toddler: [1-4]
- Child: [5-14]
- Teenager: [15-19]
- Young Adult: [20-29]
- Adult: [30-44]
- Middle Age: [45-59]
- Senior: [60-79]
- Elderly: [80+]
- All Ages: [All]
"""

# Each age category paired with the age-group labels it covers
bands_by_category = {
    'Infant': ['[0]'],
    'Toddler': ['[1-4]'],
    'Child': ['[5-9]', '[10-14]'],
    'Teenager': ['[15-19]'],
    'Young Adult': ['[20-24]', '[25-29]'],
    'Adult': ['[30-34]', '[35-39]', '[40-44]'],
    'Middle Age': ['[45-49]', '[50-54]', '[55-59]'],
    'Senior': ['[60-64]', '[65-69]', '[70-74]', '[75-79]'],
    'Elderly': ['[80-84]', '[85+]'],
    'All Ages': ['[All]'],
}

# Flatten into the lookup used for mapping: age-group label -> age category
age_group_map = {
    band: category
    for category, bands in bands_by_category.items()
    for band in bands
}

In [63]:
# Create the Age Category column from the Age Group labels
age_category = df["Age Group"].map(age_group_map)

# map() yields NaN for any label that is absent from age_group_map. Fail loudly here:
# this step runs after the missing-value check, so a gap would otherwise be exported silently.
unmapped = df.loc[age_category.isna(), "Age Group"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Age groups without a category mapping: {list(unmapped)}")

# assign() builds a new frame instead of writing into the previously filtered one,
# which avoids pandas' SettingWithCopyWarning.
df = df.assign(**{"Age Category": age_category})

# Reorder the columns so Age Category sits right after Age Group
cols = [c for c in df.columns if c != "Age Category"]
cols.insert(cols.index("Age Group") + 1, "Age Category")
df = df[cols]
df

Unnamed: 0,Indicator Name,Year,Sex,Age Group,Age Category,Number,Percent of All Causes,Death Rate
0,All Causes,2020,All,[All],All Ages,109155.0,100.0,337.252065
1,All Causes,2020,All,[0],Infant,2367.0,100.0,444.185060
2,All Causes,2020,All,[1-4],Toddler,410.0,100.0,19.503136
3,All Causes,2020,All,[5-9],Child,294.0,100.0,11.722119
4,All Causes,2020,All,[10-14],Child,402.0,100.0,16.434815
...,...,...,...,...,...,...,...,...
245631,COVID-19,2000,Female,[65-69],Senior,0.0,0.0,0.000000
245632,COVID-19,2000,Female,[70-74],Senior,0.0,0.0,0.000000
245633,COVID-19,2000,Female,[75-79],Senior,0.0,0.0,0.000000
245634,COVID-19,2000,Female,[80-84],Elderly,0.0,0.0,0.000000


# Export Processed Data

In [65]:
# Save the dataframe to a CSV file, leaving out the index column.
# Forward slashes keep the relative path valid on Windows, macOS and Linux alike
# (a backslash is only treated as a path separator on Windows).
save_path = "../dataset/clean_data.csv"
df.to_csv(save_path, index=False)