# Data Cleaning for `heart_dataset_1.csv`

In [44]:
# Import necessary libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [52]:
# Load the raw dataset
df = pd.read_csv('../data/raw/heart_dataset_1.csv')

# df.info() prints its summary directly and returns None, so it is called on its
# own instead of being passed to display() (which would render a stray "None").
df.info()

# Preview the first rows and show the number of fully duplicated rows.
display(df.head(), df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


None

0

**There are no null values and no duplicated rows, so neither needs cleaning.**

## Start by renaming columns

In [53]:
# New column labels, listed in the same order as the columns of the loaded frame:
# the assignment below is positional, so the order here must not change.
# The target column keeps its original name ('HeartDisease').
new_column_names = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_bp',
    'cholesterol',
    'fasting_blood_sugar',
    'resting_ecg',
    'max_hr',
    'exercise_angina',
    'oldpeak',
    'st_slope',
    'HeartDisease',
]
df.columns = new_column_names
df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_bp,cholesterol,fasting_blood_sugar,resting_ecg,max_hr,exercise_angina,oldpeak,st_slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Then encode categorical variables

In [54]:
# Encode 'sex' column as binary (1 = M, 0 = F)

sex_mapping = {'M': 1, 'F': 0}

# Series.map() turns every value that is missing from the mapping into NaN.
# Validate the result BEFORE overwriting the column, so that an unexpected label
# (or an accidental re-run of this cell on the already-encoded column) fails
# loudly instead of silently replacing the data with NaN.
sex_encoded = df['sex'].map(sex_mapping)
unmapped_sex = df.loc[sex_encoded.isna(), 'sex'].unique()
assert len(unmapped_sex) == 0, (
    f"Unmapped 'sex' values (re-run from the load cell if already encoded): {unmapped_sex}"
)

df['sex'] = sex_encoded

df[['sex']]

Unnamed: 0,sex
0,1
1,0
2,1
3,0
4,1
...,...
913,1
914,1
915,1
916,0


In [56]:
# Encode 'chest_pain_type' column

# 0 = ATA (atypical angina)
# 1 = NAP (non-anginal pain)
# 2 = ASY (asymptomatic)
# 3 = TA (typical angina)
# NOTE(review): these integer codes are labels chosen here, not a measured ordering;
# confirm that downstream models are fine with treating them as numeric.

chest_pain_mapping = {'ATA': 0, 'NAP': 1, 'ASY': 2, 'TA': 3}

# Series.map() turns every value that is missing from the mapping into NaN.
# Validate the result BEFORE overwriting the column, so that an unexpected label
# (or an accidental re-run of this cell on the already-encoded column) fails
# loudly instead of silently replacing the data with NaN.
chest_pain_encoded = df['chest_pain_type'].map(chest_pain_mapping)
unmapped_chest_pain = df.loc[chest_pain_encoded.isna(), 'chest_pain_type'].unique()
assert len(unmapped_chest_pain) == 0, (
    "Unmapped 'chest_pain_type' values (re-run from the load cell if already encoded): "
    f"{unmapped_chest_pain}"
)

df['chest_pain_type'] = chest_pain_encoded
df[['chest_pain_type']]

Unnamed: 0,chest_pain_type
0,0
1,1
2,0
3,2
4,1
...,...
913,3
914,2
915,2
916,0


In [57]:
# Encode 'resting_ecg' column

# 0 = Normal
# 1 = ST
# 2 = LVH

resting_ecg_mapping = {'Normal': 0, 'ST': 1, 'LVH': 2}

# Series.map() turns every value that is missing from the mapping into NaN.
# Validate the result BEFORE overwriting the column, so that an unexpected label
# (or an accidental re-run of this cell on the already-encoded column) fails
# loudly instead of silently replacing the data with NaN.
resting_ecg_encoded = df['resting_ecg'].map(resting_ecg_mapping)
unmapped_resting_ecg = df.loc[resting_ecg_encoded.isna(), 'resting_ecg'].unique()
assert len(unmapped_resting_ecg) == 0, (
    "Unmapped 'resting_ecg' values (re-run from the load cell if already encoded): "
    f"{unmapped_resting_ecg}"
)

df['resting_ecg'] = resting_ecg_encoded
df[['resting_ecg']]

Unnamed: 0,resting_ecg
0,0
1,0
2,1
3,0
4,0
...,...
913,0
914,0
915,0
916,2


In [58]:
# Encode 'exercise_angina' column

# 0 = N
# 1 = Y

exercise_angina_mapping = {'N': 0, 'Y': 1}

# Series.map() turns every value that is missing from the mapping into NaN.
# Validate the result BEFORE overwriting the column, so that an unexpected label
# (or an accidental re-run of this cell on the already-encoded column) fails
# loudly instead of silently replacing the data with NaN.
exercise_angina_encoded = df['exercise_angina'].map(exercise_angina_mapping)
unmapped_exercise_angina = df.loc[exercise_angina_encoded.isna(), 'exercise_angina'].unique()
assert len(unmapped_exercise_angina) == 0, (
    "Unmapped 'exercise_angina' values (re-run from the load cell if already encoded): "
    f"{unmapped_exercise_angina}"
)

df['exercise_angina'] = exercise_angina_encoded
df[['exercise_angina']]

Unnamed: 0,exercise_angina
0,0
1,0
2,0
3,1
4,0
...,...
913,0
914,0
915,1
916,0


In [59]:
# Encode 'st_slope' column

# 0 = Up
# 1 = Flat
# 2 = Down

st_slope_mapping = {'Up': 0, 'Flat': 1, 'Down': 2}

# Series.map() turns every value that is missing from the mapping into NaN.
# Validate the result BEFORE overwriting the column, so that an unexpected label
# (or an accidental re-run of this cell on the already-encoded column) fails
# loudly instead of silently replacing the data with NaN.
st_slope_encoded = df['st_slope'].map(st_slope_mapping)
unmapped_st_slope = df.loc[st_slope_encoded.isna(), 'st_slope'].unique()
assert len(unmapped_st_slope) == 0, (
    "Unmapped 'st_slope' values (re-run from the load cell if already encoded): "
    f"{unmapped_st_slope}"
)

df['st_slope'] = st_slope_encoded
df[['st_slope']]

Unnamed: 0,st_slope
0,0
1,1
2,0
3,1
4,0
...,...
913,1
914,1
915,1
916,1


In [60]:
# Preview the first five rows now that the categorical columns are encoded.
df.head(5)

Unnamed: 0,age,sex,chest_pain_type,resting_bp,cholesterol,fasting_blood_sugar,resting_ecg,max_hr,exercise_angina,oldpeak,st_slope,HeartDisease
0,40,1,0,140,289,0,0,172,0,0.0,0,0
1,49,0,1,160,180,0,0,156,0,1.0,1,1
2,37,1,0,130,283,0,1,98,0,0.0,0,0
3,48,0,2,138,214,0,0,108,1,1.5,1,1
4,54,1,1,150,195,0,0,122,0,0.0,0,0


## Now standardize features
### Age, RestingBP, Cholesterol, MaxHR, Oldpeak

# Continuous features to standardize; the binary and encoded categorical columns
# are left untouched.
columns_so_scale = ['age', 'resting_bp', 'cholesterol', 'max_hr', 'oldpeak']

# Fit the scaler on the selected columns, then write the scaled values back
# into the same columns of the frame.
scaler = StandardScaler()
scaler.fit(df[columns_so_scale])
df[columns_so_scale] = scaler.transform(df[columns_so_scale])

In [63]:
# Preview the first five rows after standardizing the continuous columns.
df.head(5)

Unnamed: 0,age,sex,chest_pain_type,resting_bp,cholesterol,fasting_blood_sugar,resting_ecg,max_hr,exercise_angina,oldpeak,st_slope,HeartDisease
0,-1.43314,1,0,0.410909,0.82507,0,0,1.382928,0,-0.832432,0,0
1,-0.478484,0,1,1.491752,-0.171961,0,0,0.754157,0,0.105664,1,1
2,-1.751359,1,0,-0.129513,0.770188,0,1,-1.525138,0,-0.832432,0,0
3,-0.584556,0,2,0.302825,0.13904,0,0,-1.132156,1,0.574711,1,1
4,0.051881,1,1,0.951331,-0.034755,0,0,-0.581981,0,-0.832432,0,0


## Now save cleaned data to processed folder

In [65]:
# Write the cleaned frame to the processed-data folder (without the index column).
# The folder is created first so the write does not fail when '../data/processed'
# does not exist yet (e.g. on a fresh checkout).
processed_path = Path('../data/processed/heart_dataset_1_processed.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)