# Data Cleaning for `heart_dataset_4.csv`

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw heart dataset from the project's raw-data folder
raw_csv_path = '../data/raw/heart_dataset_4.csv'
df = pd.read_csv(raw_csv_path)

# Quick health check: first rows, number of fully duplicated rows,
# and the count of missing values in each column
display(df.head(), df.duplicated().sum(), df.isna().sum())

Unnamed: 0,id,death,los,age,gender,cancer,cabg,crt,defib,dementia,...,pacemaker,pneumonia,prior_appts_attended,prior_dnas,pci,stroke,senile,quintile,ethnicgroup,fu_time
0,1,0,2,90,2,0,0,0,0,0,...,0,0,4,0,0,0,0,2.0,,416
1,2,0,10,74,1,0,0,0,0,0,...,0,1,9,1,0,0,0,4.0,1.0,648
2,3,0,3,83,2,0,0,0,0,0,...,0,0,1,0,0,0,0,3.0,1.0,466
3,4,0,1,79,1,0,0,0,0,0,...,1,0,9,2,1,1,0,5.0,1.0,441
4,5,0,17,94,2,0,0,0,0,0,...,0,0,3,0,0,0,0,2.0,1.0,371


0

id                       0
death                    0
los                      0
age                      0
gender                   0
cancer                   0
cabg                     0
crt                      0
defib                    0
dementia                 0
diabetes                 0
hypertension             0
ihd                      0
mental_health            0
arrhythmias              0
copd                     0
obesity                  0
pvd                      0
renal_disease            0
valvular_disease         0
metastatic_cancer        0
pacemaker                0
pneumonia                0
prior_appts_attended     0
prior_dnas               0
pci                      0
stroke                   0
senile                   0
quintile                 6
ethnicgroup             43
fu_time                  0
dtype: int64

## Start by renaming columns

In [3]:
# Explicit raw-name -> descriptive-name mapping. Renaming by name (instead of
# assigning a positional list to df.columns) keeps every column correctly
# labelled even if the column order in the CSV changes, and makes each
# old/new pairing visible to the reader.
# NOTE(review): 'prior_dnas' presumably counts missed ("did not attend")
# appointments rather than diagnoses, 'death' looks like a mortality flag,
# and 'mental_health' may be broader than depression -- confirm the target
# names against the dataset documentation. The names (including the existing
# spellings 'arterty' and 'arrythmias') are kept unchanged here so that
# anything reading the processed file keeps working.
column_renames = {
    'id': 'id',
    'death': 'HeartDisease',
    'los': 'length_of_stay',
    'age': 'age',
    'gender': 'sex',
    'cancer': 'cancer',
    'cabg': 'coronary_arterty_bypass_grafting',
    'crt': 'cardiac_resynchronization_therapy',
    'defib': 'implantable_defibrillator',
    'dementia': 'dementia',
    'diabetes': 'diabetes',
    'hypertension': 'hypertension',
    'ihd': 'ischemic_heart_disease',
    'mental_health': 'depression',
    'arrhythmias': 'arrythmias',
    'copd': 'chronic_obstructive_pulmonary_disease',
    'obesity': 'obesity',
    'pvd': 'peripheral_vascular_disease',
    'renal_disease': 'renal_disease',
    'valvular_disease': 'valvular_disease',
    'metastatic_cancer': 'metastatic_cancer',
    'pacemaker': 'pacemaker',
    'pneumonia': 'pneumonia',
    'prior_appts_attended': 'prior_appts_attended',
    'prior_dnas': 'prior_diagnosis',
    'pci': 'percutaneous_coronary_intervention',
    'stroke': 'stroke',
    'senile': 'senile',
    'quintile': 'quintile',
    'ethnicgroup': 'ethnic_group',
    'fu_time': 'follow_up_time',
}

# Kept for backward compatibility: the new names in their original order
new_column_names = list(column_renames.values())

# rename() ignores keys that are absent, so re-running this cell on an
# already-renamed frame is a harmless no-op
df = df.rename(columns=column_renames)

# Fail loudly if the file did not contain every column the rest of the
# notebook relies on
missing_columns = [name for name in new_column_names if name not in df.columns]
if missing_columns:
    raise KeyError(f"Expected columns missing after renaming: {missing_columns}")

df.head()

Unnamed: 0,id,HeartDisease,length_of_stay,age,sex,cancer,coronary_arterty_bypass_grafting,cardiac_resynchronization_therapy,implantable_defibrillator,dementia,...,pacemaker,pneumonia,prior_appts_attended,prior_diagnosis,percutaneous_coronary_intervention,stroke,senile,quintile,ethnic_group,follow_up_time
0,1,0,2,90,2,0,0,0,0,0,...,0,0,4,0,0,0,0,2.0,,416
1,2,0,10,74,1,0,0,0,0,0,...,0,1,9,1,0,0,0,4.0,1.0,648
2,3,0,3,83,2,0,0,0,0,0,...,0,0,1,0,0,0,0,3.0,1.0,466
3,4,0,1,79,1,0,0,0,0,0,...,1,0,9,2,1,1,0,5.0,1.0,441
4,5,0,17,94,2,0,0,0,0,0,...,0,0,3,0,0,0,0,2.0,1.0,371


**Here we have null values in the "quintile" and "ethnic_group" features.**

In [4]:
# Count the missing values per column and show only the columns that have any
null_counts = df.isna().sum()
null_counts.loc[null_counts.gt(0)]

quintile         6
ethnic_group    43
dtype: int64

**First, we want to deal with the null values in the quintile feature.** \
We handle this with mode imputation, which replaces the null values with the most common value in the quintile column.

In [5]:
# Take the first mode reported for the quintile column
quintile_modes = df['quintile'].mode()
mode_quintile = quintile_modes[0]
print(f"Mode of 'quintile': {mode_quintile}")

# Mode imputation: every missing quintile gets the most frequent value
df['quintile'] = df['quintile'].fillna(value=mode_quintile)

Mode of 'quintile': 4.0


**Now deal with the null values in the ethnic group feature.** \
Based on the dataset documentation for the ethnic group feature, we fill the missing values with 8, the code meaning that the ethnic group is not known (see the dataset documentation on Kaggle). \
[Dataset Documentation](https://www.kaggle.com/datasets/jackleenrasmybareh/heart-failure)

In [6]:
# Code 8 marks an unknown ethnic group, so it is used for the missing entries
UNKNOWN_ETHNIC_GROUP = 8
df['ethnic_group'] = df['ethnic_group'].fillna(value=UNKNOWN_ETHNIC_GROUP)

In [7]:
# Re-count missing values after imputation; an empty result means none remain
null_counts = df.isna().sum()
null_counts[null_counts != 0]

Series([], dtype: int64)

In [None]:
# Check for categorical (non-numeric) features by inspecting each column's dtype
df.dtypes

id                                         int64
HeartDisease                               int64
length_of_stay                             int64
age                                        int64
sex                                        int64
cancer                                     int64
coronary_arterty_bypass_grafting           int64
cardiac_resynchronization_therapy          int64
implantable_defibrillator                  int64
dementia                                   int64
diabetes                                   int64
hypertension                               int64
ischemic_heart_disease                     int64
depression                                 int64
arrythmias                                 int64
chronic_obstructive_pulmonary_disease      int64
obesity                                    int64
peripheral_vascular_disease                int64
renal_disease                              int64
valvular_disease                           int64
metastatic_cancer   

**Since there are no categorical variables, there is no need to encode.**

**Since the sex column is coded as 1s and 2s rather than 0s and 1s, let's recode it to binary (0/1).**

In [8]:
# Recode the sex feature from the raw 1/2 coding to binary 0/1.
# NOTE(review): which raw code stands for which sex is not stated in this
# notebook -- confirm against the dataset documentation.
sex_map = {1: 0, 2: 1}
recoded_sex = df['sex'].map(sex_map)

# Series.map turns every value that is not a key of sex_map into NaN. Without
# this check an unexpected code -- or simply re-running this cell, after which
# 0 is no longer a key -- would silently write NaN/flipped values into the
# processed file. Stop instead, and leave df['sex'] untouched.
unmapped_rows = recoded_sex.isna()
if unmapped_rows.any():
    unexpected_codes = df.loc[unmapped_rows, 'sex'].unique().tolist()
    raise ValueError(
        f"'sex' contains values outside {list(sex_map)}: {unexpected_codes}. "
        "If this cell was already run, reload the data before recoding again."
    )

df['sex'] = recoded_sex

## Now, standardize features
We want to standardize `length_of_stay` (hospital length of stay), `age`, `follow_up_time`, `prior_appts_attended`, and `prior_diagnosis`.

In [9]:
# Count/duration-style features to be standardized
columns_to_scale = [
    'length_of_stay',
    'age',
    'follow_up_time',
    'prior_appts_attended',
    'prior_diagnosis',
]

# NOTE(review): the scaler is fit on every row of df, so any train/test split
# made later will have used the held-out rows to compute the scaling
# statistics -- confirm this is acceptable for the downstream models.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])
df[columns_to_scale] = scaled_values

In [10]:
# Preview the first rows to confirm the recoded and standardized columns
df.head()

Unnamed: 0,id,HeartDisease,length_of_stay,age,sex,cancer,coronary_arterty_bypass_grafting,cardiac_resynchronization_therapy,implantable_defibrillator,dementia,...,pacemaker,pneumonia,prior_appts_attended,prior_diagnosis,percutaneous_coronary_intervention,stroke,senile,quintile,ethnic_group,follow_up_time
0,1,0,-0.697729,1.012523,1,0,0,0,0,0,...,0,0,-0.225025,-0.452104,0,0,0,2.0,8.0,-0.147429
1,2,0,-0.06155,-0.425082,0,0,0,0,0,0,...,0,1,0.506525,0.4503,0,0,0,4.0,1.0,0.537297
2,3,0,-0.618206,0.383571,1,0,0,0,0,0,...,0,0,-0.663955,-0.452104,0,0,0,3.0,1.0,0.000142
3,4,0,-0.777251,0.02417,0,0,0,0,0,0,...,1,0,0.506525,1.352704,1,1,0,5.0,1.0,-0.073643
4,5,0,0.495106,1.371924,1,0,0,0,0,0,...,0,0,-0.371335,-0.452104,0,0,0,2.0,1.0,-0.280242


## Now move cleaned data to processed folder

In [12]:
# Write the cleaned frame to the processed-data folder, without the row index
processed_csv_path = '../data/processed/heart_dataset_4_processed.csv'
df.to_csv(processed_csv_path, index=False)