# Data Cleaning for `heart_dataset_2.csv`

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [3]:
# load dataset
df = pd.read_csv('../data/raw/heart_dataset_2.csv')

display(df.head(), df.info(), df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


None

0

**We can see that there are no null values and no duplicated values so no need to clean those.**

## Start by renaming columns

In [4]:
new_column_names = ['age','amaemia','creatinine_phosphokinase','diabetes','ejection_fraction','hypertension','platelet_count','serum_creatinine',\
                            'serum_sodium','sex','smoking','time','HeartDisease']
df.columns = new_column_names
df.head()

Unnamed: 0,age,amaemia,creatinine_phosphokinase,diabetes,ejection_fraction,hypertension,platelet_count,serum_creatinine,serum_sodium,sex,smoking,time,HeartDisease
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## Now standardize features
### Age, RestingBP, Cholesterol, MaxHR, oldpeak

In [5]:
scaler = StandardScaler()

columns_so_scale = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelet_count', 'serum_creatinine','serum_sodium','time']
df[columns_so_scale] = scaler.fit_transform(df[columns_so_scale])

In [6]:
df.head()

Unnamed: 0,age,amaemia,creatinine_phosphokinase,diabetes,ejection_fraction,hypertension,platelet_count,serum_creatinine,serum_sodium,sex,smoking,time,HeartDisease
0,1.192945,0,0.000166,0,-1.53056,1,0.01681648,0.490057,-1.504036,1,0,-1.629502,1
1,-0.491279,0,7.51464,0,-0.007077,0,7.53566e-09,-0.284552,-0.141976,1,0,-1.603691,1
2,0.350833,0,-0.449939,0,-1.53056,0,-1.038073,-0.0909,-1.731046,1,1,-1.590785,1
3,-0.912335,1,-0.486071,0,-1.53056,0,-0.5464741,0.490057,0.085034,1,0,-1.590785,1
4,0.350833,1,-0.435486,1,-1.53056,0,0.6517986,1.264666,-4.682176,0,0,-1.577879,1


## Now cleaned data to processed folder

In [7]:
df.to_csv('../data/processed/heart_dataset_2_processed.csv', index=False)