# Data Cleaning for `heart_dataset_3.csv`

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw dataset
df = pd.read_csv('../data/raw/heart_dataset_3.csv')

# DataFrame.info() prints its report and returns None, so it must not be passed
# to display() (that renders a stray "None"). Call it on its own instead.
df.info()

# Preview the first rows and show how many fully duplicated rows exist
display(df.head(), df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 60 columns):
 #   Column                                                                                    Non-Null Count  Dtype  
---  ------                                                                                    --------------  -----  
 0   Age                                                                                       368 non-null    int64  
 1   Age.Group                                                                                 368 non-null    object 
 2   Gender                                                                                    368 non-null    object 
 3   Locality                                                                                  368 non-null    object 
 4   Marital status                                                                            368 non-null    object 
 5   Life.Style                                               

Unnamed: 0,Age,Age.Group,Gender,Locality,Marital status,Life.Style,Sleep,Category,Depression,Hyperlipi,...,oldpeak,slope,ca,thal,num,SK,SK.React,Reaction,Mortality,Follow.Up
0,45,41-50,Female,RURAL,MARRIED,NO,NO,FREE,YES,YES,...,3.0,2,0,7,2,1,NO,0,0,60
1,51,51-60,Female,URBAN,MARRIED,NO,NO,FREE,YES,YES,...,1.2,2,0,7,2,1,NO,0,0,15
2,55,51-60,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,...,3.4,2,0,3,2,1,NO,0,0,6
3,55,51-60,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,...,2.0,2,1,7,3,1,NO,0,0,52
4,56,51-60,Female,RURAL,MARRIED,YES,NO,FREE,YES,YES,...,4.0,3,2,7,3,1,NO,0,0,34


None

214

In [3]:
# Discard repeated rows, keeping the first occurrence of each
# (keep='first' is the drop_duplicates default, spelled out here).
df = df.drop_duplicates(keep='first')

# Sanity check: number of duplicated rows still present
remaining_duplicates = df.duplicated().sum()
print(remaining_duplicates)

0


**We can see that there are no null values, and the duplicated rows have been removed.**

## Start by renaming columns

In [4]:
# Replacement names for the 60 raw columns, listed in the same order as they
# appear in the CSV (the assignment below is purely positional).
# NOTE(review): by position, raw 'num' becomes 'heart_disease_severity' and raw
# 'Mortality' becomes 'HeartDisease' -- confirm that using mortality as the
# HeartDisease column is intended.
new_column_names = [
    'age', 'age_group', 'sex', 'locality', 'marital_status', 'life_style',
    'sleep', 'category', 'depression', 'hyperlipi', 'smoking', 'family_history',
    'f_history', 'diabetes', 'hypertension', 'allergies', 'blood_pressure',
    'thrombolysis', 'blood_glucose_random', 'blood_urea', 'serum_creatinine',
    'serum_sodium', 'serum_potassium', 'serum_chloride',
    'creatinine_phosphokinase', 'creatinine_kinase', 'sedimentation_rate',
    'white_blood_cell_count', 'red_blood_cell_count', 'hemoglobin',
    'packed_cell_volume', 'red_cell_volume', 'red_cell_hemoglobin',
    'red_cell_hemoglobin_concentration', 'platelet_count', 'neutrophil',
    'lympho', 'monocyte', 'eosino', 'others', 'CO', 'diagnosis',
    'hypersensitivity', 'chest_pain_type', 'resting_bp', 'cholestorl',
    'fasting_blood_sugar', 'resting_ecg', 'max_hr', 'exercise_angina',
    'oldpeak', 'st_slope', 'ca', 'thalassemia', 'heart_disease_severity',
    'streptokinase_therapy', 'streptokinase_reaction', 'reaction',
    'HeartDisease', 'follow_up',
]

# Positional rename: pandas raises if the list length differs from the column count
df.columns = new_column_names
df.head()

Unnamed: 0,age,age_group,sex,locality,marital_status,life_style,sleep,category,depression,hyperlipi,...,oldpeak,st_slope,ca,thalassemia,heart_disease_severity,streptokinase_therapy,streptokinase_reaction,reaction,HeartDisease,follow_up
0,45,41-50,Female,RURAL,MARRIED,NO,NO,FREE,YES,YES,...,3.0,2,0,7,2,1,NO,0,0,60
1,51,51-60,Female,URBAN,MARRIED,NO,NO,FREE,YES,YES,...,1.2,2,0,7,2,1,NO,0,0,15
2,55,51-60,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,...,3.4,2,0,3,2,1,NO,0,0,6
3,55,51-60,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,...,2.0,2,1,7,3,1,NO,0,0,52
4,56,51-60,Female,RURAL,MARRIED,YES,NO,FREE,YES,YES,...,4.0,3,2,7,3,1,NO,0,0,34


## Then by encoding categorical variables

In [5]:
# List the renamed column labels (DataFrame.keys() is an alias for .columns)
df.columns

Index(['age', 'age_group', 'sex', 'locality', 'marital_status', 'life_style',
       'sleep', 'category', 'depression', 'hyperlipi', 'smoking',
       'family_history', 'f_history', 'diabetes', 'hypertension', 'allergies',
       'blood_pressure', 'thrombolysis', 'blood_glucose_random', 'blood_urea',
       'serum_creatinine', 'serum_sodium', 'serum_potassium', 'serum_chloride',
       'creatinine_phosphokinase', 'creatinine_kinase', 'sedimentation_rate',
       'white_blood_cell_count', 'red_blood_cell_count', 'hemoglobin',
       'packed_cell_volume', 'red_cell_volume', 'red_cell_hemoglobin',
       'red_cell_hemoglobin_concentration', 'platelet_count', 'neutrophil',
       'lympho', 'monocyte', 'eosino', 'others', 'CO', 'diagnosis',
       'hypersensitivity', 'chest_pain_type', 'resting_bp', 'cholestorl',
       'fasting_blood_sugar', 'resting_ecg', 'max_hr', 'exercise_angina',
       'oldpeak', 'st_slope', 'ca', 'thalassemia', 'heart_disease_severity',
       'streptokinase_therapy

In [6]:
# Encode 'sex' as a binary indicator (1 = Male, 0 = Female).
sex_codes = {'Male': 1, 'Female': 0}
sex_encoded = df['sex'].map(sex_codes)

# Series.map() silently turns any label missing from the mapping into NaN;
# the same happens if this cell is run twice, because the column then holds
# 0/1 instead of the original strings. Validate before assigning so a bad run
# cannot silently corrupt the column.
if sex_encoded.isna().any():
    unmapped = df.loc[sex_encoded.isna(), 'sex'].unique()
    raise ValueError(f"Unexpected values in 'sex': {unmapped!r}")

df['sex'] = sex_encoded
print(df['sex'].unique())
df[['sex']]

[0 1]


Unnamed: 0,sex
0,0
1,0
2,0
3,0
4,0
...,...
149,1
150,0
151,1
152,1


In [7]:
# Remove the 'age_group' column.
# NOTE(review): presumably redundant with the numeric 'age' column -- confirm.
df = df.drop(columns='age_group')

In [8]:
# Inspect the distinct labels in 'locality' before encoding
pd.unique(df['locality'])

array(['RURAL', 'URBAN'], dtype=object)

In [9]:
# Encode 'locality' as a binary indicator (1 = RURAL, 0 = URBAN).
locality_codes = {'RURAL': 1, 'URBAN': 0}
locality_encoded = df['locality'].map(locality_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if locality_encoded.isna().any():
    unmapped = df.loc[locality_encoded.isna(), 'locality'].unique()
    raise ValueError(f"Unexpected values in 'locality': {unmapped!r}")

df['locality'] = locality_encoded
print(df['locality'].unique())
df[['locality']]

[1 0]


Unnamed: 0,locality
0,1
1,0
2,1
3,1
4,1
...,...
149,1
150,1
151,0
152,0


In [10]:
# Inspect the distinct labels in 'marital_status' before encoding
pd.unique(df['marital_status'])

array(['MARRIED', 'SINGLE'], dtype=object)

In [11]:
# Encode 'marital_status' as a binary indicator (1 = MARRIED, 0 = SINGLE).
marital_status_codes = {'MARRIED': 1, 'SINGLE': 0}
marital_status_encoded = df['marital_status'].map(marital_status_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if marital_status_encoded.isna().any():
    unmapped = df.loc[marital_status_encoded.isna(), 'marital_status'].unique()
    raise ValueError(f"Unexpected values in 'marital_status': {unmapped!r}")

df['marital_status'] = marital_status_encoded
print(df['marital_status'].unique())
df[['marital_status']]

[1 0]


Unnamed: 0,marital_status
0,1
1,1
2,1
3,1
4,1
...,...
149,1
150,1
151,1
152,1


In [12]:
# Inspect the distinct labels in 'life_style'
pd.unique(df['life_style'])

array(['NO', 'YES'], dtype=object)

In [13]:
# Remove the 'life_style' column.
# NOTE(review): this is a binary YES/NO column that is dropped rather than
# encoded like the other YES/NO columns -- confirm this is intended.
df = df.drop(columns='life_style')

In [14]:
# Inspect the distinct labels in 'sleep' before encoding
pd.unique(df['sleep'])

array(['NO', 'YES'], dtype=object)

In [15]:
# Encode 'sleep' as a binary indicator (1 = YES, 0 = NO).
sleep_codes = {'YES': 1, 'NO': 0}
sleep_encoded = df['sleep'].map(sleep_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if sleep_encoded.isna().any():
    unmapped = df.loc[sleep_encoded.isna(), 'sleep'].unique()
    raise ValueError(f"Unexpected values in 'sleep': {unmapped!r}")

df['sleep'] = sleep_encoded
print(df['sleep'].unique())
df[['sleep']]

[0 1]


Unnamed: 0,sleep
0,0
1,0
2,1
3,1
4,0
...,...
149,0
150,0
151,0
152,0


In [16]:
# Inspect the distinct labels in 'category'
pd.unique(df['category'])

array(['FREE', 'PAID'], dtype=object)

In [17]:
# Remove the 'category' column (values FREE / PAID).
# NOTE(review): dropped without encoding -- confirm it is not needed downstream.
df = df.drop(columns='category')

In [18]:
# Inspect the distinct labels in 'depression' before encoding
pd.unique(df['depression'])

array(['YES', 'NO'], dtype=object)

In [19]:
# Encode 'depression' as a binary indicator (1 = YES, 0 = NO).
depression_codes = {'YES': 1, 'NO': 0}
depression_encoded = df['depression'].map(depression_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if depression_encoded.isna().any():
    unmapped = df.loc[depression_encoded.isna(), 'depression'].unique()
    raise ValueError(f"Unexpected values in 'depression': {unmapped!r}")

df['depression'] = depression_encoded
print(df['depression'].unique())
df[['depression']]

[1 0]


Unnamed: 0,depression
0,1
1,1
2,1
3,1
4,1
...,...
149,1
150,1
151,1
152,0


In [20]:
# Inspect the distinct labels in 'hyperlipi' before encoding
pd.unique(df['hyperlipi'])

array(['YES', 'NO'], dtype=object)

In [21]:
# Encode 'hyperlipi' as a binary indicator (1 = YES, 0 = NO).
hyperlipi_codes = {'YES': 1, 'NO': 0}
hyperlipi_encoded = df['hyperlipi'].map(hyperlipi_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if hyperlipi_encoded.isna().any():
    unmapped = df.loc[hyperlipi_encoded.isna(), 'hyperlipi'].unique()
    raise ValueError(f"Unexpected values in 'hyperlipi': {unmapped!r}")

df['hyperlipi'] = hyperlipi_encoded
print(df['hyperlipi'].unique())
df[['hyperlipi']]

[1 0]


Unnamed: 0,hyperlipi
0,1
1,1
2,1
3,1
4,1
...,...
149,1
150,1
151,1
152,0


In [22]:
# Inspect the distinct labels in 'smoking' before encoding
pd.unique(df['smoking'])

array(['NO', 'YES'], dtype=object)

In [23]:
# Encode 'smoking' as a binary indicator (1 = YES, 0 = NO).
smoking_codes = {'YES': 1, 'NO': 0}
smoking_encoded = df['smoking'].map(smoking_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if smoking_encoded.isna().any():
    unmapped = df.loc[smoking_encoded.isna(), 'smoking'].unique()
    raise ValueError(f"Unexpected values in 'smoking': {unmapped!r}")

df['smoking'] = smoking_encoded
print(df['smoking'].unique())
df[['smoking']]

[0 1]


Unnamed: 0,smoking
0,0
1,0
2,0
3,0
4,0
...,...
149,1
150,0
151,0
152,0


In [24]:
# Inspect the distinct labels in 'family_history' before encoding
pd.unique(df['family_history'])

array(['NO', 'YES'], dtype=object)

In [25]:
# Encode 'family_history' as a binary indicator (1 = YES, 0 = NO).
family_history_codes = {'YES': 1, 'NO': 0}
family_history_encoded = df['family_history'].map(family_history_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if family_history_encoded.isna().any():
    unmapped = df.loc[family_history_encoded.isna(), 'family_history'].unique()
    raise ValueError(f"Unexpected values in 'family_history': {unmapped!r}")

df['family_history'] = family_history_encoded
print(df['family_history'].unique())
df[['family_history']]

[0 1]


Unnamed: 0,family_history
0,0
1,0
2,0
3,0
4,0
...,...
149,0
150,0
151,0
152,0


In [26]:
# Inspect the distinct labels in 'hypertension' before encoding
pd.unique(df['hypertension'])

array(['NO', 'YES'], dtype=object)

In [27]:
# Encode 'hypertension' as a binary indicator (1 = YES, 0 = NO).
hypertension_codes = {'YES': 1, 'NO': 0}
hypertension_encoded = df['hypertension'].map(hypertension_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if hypertension_encoded.isna().any():
    unmapped = df.loc[hypertension_encoded.isna(), 'hypertension'].unique()
    raise ValueError(f"Unexpected values in 'hypertension': {unmapped!r}")

df['hypertension'] = hypertension_encoded
print(df['hypertension'].unique())
df[['hypertension']]

[0 1]


Unnamed: 0,hypertension
0,0
1,0
2,1
3,1
4,1
...,...
149,1
150,1
151,1
152,0


In [28]:
# Inspect the distinct labels in 'allergies' before encoding
pd.unique(df['allergies'])

array(['NO', 'YES'], dtype=object)

In [29]:
# Encode 'allergies' as a binary indicator (1 = YES, 0 = NO).
allergies_codes = {'YES': 1, 'NO': 0}
allergies_encoded = df['allergies'].map(allergies_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if allergies_encoded.isna().any():
    unmapped = df.loc[allergies_encoded.isna(), 'allergies'].unique()
    raise ValueError(f"Unexpected values in 'allergies': {unmapped!r}")

df['allergies'] = allergies_encoded
print(df['allergies'].unique())
df[['allergies']]

[0 1]


Unnamed: 0,allergies
0,0
1,0
2,0
3,0
4,0
...,...
149,0
150,1
151,0
152,0


In [30]:
# Inspect the distinct entries in the 'CO' column
pd.unique(df['CO'])

array(['Chest pain,', 'Central Chest pain,',
       'Chest pain,SOB, Cold sweating', 'CENTRAL Chest pain,',
       'SOB FROM 1 DAY ', 'Chest pain, SWEATING', 'Chest pain,SOB',
       'Chest pain 1 hr', 'Chest pain, VOMITING',
       'Chest pain,heart sinking, vomiting',
       'Chest pain,nausea,vertigo,sweating,headache', 'Chest pain, 2 HR',
       'Chest pain,NAUSEA, SWELLOWING',
       'Chest pain,COLD SWEATING, VOMITING', 'Chest pain,Sweating, ',
       'Chest pain,vomiting, sweating',
       'SOB, DIZZINESS, CHEST PAIN,NAUSEA,DIAPHORESIS',
       'Chest pain, 4 HR', 'Chest pain,SOB, HEART SINKING , PALPITATION',
       'Chest pain4 HR, SWEATING', 'Chest pain from 3 hr',
       'Central Chest pain,SOB, Sweating', 'Chest pain,sweating,vomiting',
       'Chest pain,VOMITING.,SOB', 'Chest pain, Sweating, SOB',
       'Chest pain,COLD SWEATING, ', 'Chest pain,COUGH',
       'Chest pain for 2 to 3 hr', 'Chest pain,SWEATING, NAUSEA',
       'Chest pain, VERTIGO,SWEATING', 'Chest pain, TI

### Note: the free-text 'CO' column is too complicated to parse at this moment, so it is dropped for now

### Revisit Later

In [31]:
# Remove the free-text 'CO' column for now (flagged above to revisit later)
df = df.drop(columns='CO')

In [32]:
# Remove the 'diagnosis' column.
# NOTE(review): dropped without inspecting its values -- confirm it is not needed.
df = df.drop(columns='diagnosis')

In [33]:
# Inspect the distinct labels in 'hypersensitivity' before encoding
pd.unique(df['hypersensitivity'])

array(['NO', 'YES'], dtype=object)

In [34]:
# Encode 'hypersensitivity' as a binary indicator (1 = YES, 0 = NO).
hypersensitivity_codes = {'YES': 1, 'NO': 0}
hypersensitivity_encoded = df['hypersensitivity'].map(hypersensitivity_codes)

# Series.map() silently yields NaN for labels missing from the mapping (and
# for every row if this cell is re-run on the already encoded column), so
# validate before overwriting the column.
if hypersensitivity_encoded.isna().any():
    unmapped = df.loc[hypersensitivity_encoded.isna(), 'hypersensitivity'].unique()
    raise ValueError(f"Unexpected values in 'hypersensitivity': {unmapped!r}")

df['hypersensitivity'] = hypersensitivity_encoded
print(df['hypersensitivity'].unique())
df[['hypersensitivity']]

[0 1]


Unnamed: 0,hypersensitivity
0,0
1,0
2,0
3,0
4,0
...,...
149,1
150,1
151,1
152,0


In [35]:
# Remove the 'streptokinase_reaction' column.
# NOTE(review): no reason is recorded for this drop -- confirm it is intended.
df = df.drop(columns='streptokinase_reaction')

In [36]:
# Remove the 'others' column.
# NOTE(review): no reason is recorded for this drop -- confirm it is intended.
df = df.drop(columns='others')

In [37]:
# Remove the 'follow_up' column.
# NOTE(review): no reason is recorded for this drop -- confirm it is intended.
df = df.drop(columns='follow_up')

In [38]:
# Print the remaining columns with their non-null counts and dtypes, to check
# the result of the encoding and column drops performed so far.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 154 entries, 0 to 153
Data columns (total 52 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   age                                154 non-null    int64  
 1   sex                                154 non-null    int64  
 2   locality                           154 non-null    int64  
 3   marital_status                     154 non-null    int64  
 4   sleep                              154 non-null    int64  
 5   depression                         154 non-null    int64  
 6   hyperlipi                          154 non-null    int64  
 7   smoking                            154 non-null    int64  
 8   family_history                     154 non-null    int64  
 9   f_history                          154 non-null    int64  
 10  diabetes                           154 non-null    int64  
 11  hypertension                       154 non-null    int64  
 12 

In [39]:
# Print every column name together with its distinct values
for column_name, column_values in df.items():
    print(column_name, ": ", column_values.unique())

age :  [45 51 55 56 57 58 59 60 61 62 63 65 66 24 25 35 38 40 44 46 47 50 52 53
 54 64 67 68 69 70 77]
sex :  [0 1]
locality :  [1 0]
marital_status :  [1 0]
sleep :  [0 1]
depression :  [1 0]
hyperlipi :  [1 0]
smoking :  [0 1]
family_history :  [0 1]
f_history :  [0 1]
diabetes :  [1 0]
hypertension :  [0 1]
allergies :  [0 1]
blood_pressure :  [100.6   90.6  100.7  160.1  140.7  120.8  130.8  160.9  140.9  110.7
 190.11 160.7  150.9   80.5  120.9  130.9  120.7 ]
thrombolysis :  [0 1]
blood_glucose_random :  [ 84 135 146  85 166  96 129 117 164 213 320 152 115 280 210  60 251 363
 260 125 208 127 323 110 291 426  89 131 348 563 392  86 102 137 153 463
 231 512]
blood_urea :  [ 28.   17.   37.   78.  104.   42.   55.   30.   25.   36.   31.   35.
  40.  394.   33.    2.3  39.   41.   54.   46.   24.   26.   43.   47.
 162.   61.   38.   27. ]
serum_creatinine :  [ 0.9   0.7   1.    1.2   4.    1.1  22.9   0.8   0.6   1.08  2.7   1.6
  1.3 ]
serum_sodium :  [138 144 137 139 130 146 136

## Now standardize features
### Age, RestingBP, Cholesterol, MaxHR, oldpeak, etc

In [40]:
# Columns to standardise; every column not listed here is left unscaled.
columns_so_scale = [
    'age', 'blood_pressure', 'blood_glucose_random', 'blood_urea',
    'serum_creatinine', 'serum_sodium', 'serum_potassium', 'serum_chloride',
    'creatinine_phosphokinase', 'creatinine_kinase', 'sedimentation_rate',
    'white_blood_cell_count', 'red_blood_cell_count', 'hemoglobin',
    'packed_cell_volume', 'red_cell_volume', 'red_cell_hemoglobin',
    'red_cell_hemoglobin_concentration', 'platelet_count', 'neutrophil',
    'lympho', 'monocyte', 'eosino', 'resting_bp', 'cholestorl', 'max_hr',
    'oldpeak',
]

# Fit the scaler on the selected columns and replace them with the scaled values.
# NOTE(review): the scaler is fit on the whole dataset; if a train/test split
# happens downstream, its statistics will include the test rows -- confirm
# this is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[columns_so_scale])
df[columns_so_scale] = scaled_values

In [41]:
# Preview the first five rows after scaling
df.head(n=5)

Unnamed: 0,age,sex,locality,marital_status,sleep,depression,hyperlipi,smoking,family_history,f_history,...,max_hr,exercise_angina,oldpeak,st_slope,ca,thalassemia,heart_disease_severity,streptokinase_therapy,reaction,HeartDisease
0,-1.195961,0,1,1,0,1,1,0,0,0,...,-0.177879,1,1.133205,2,0,7,2,1,0,0
1,-0.529856,0,0,1,0,1,1,0,0,0,...,0.089373,1,-0.255497,2,0,7,2,1,0,0
2,-0.085786,0,1,1,1,1,1,0,0,0,...,-1.024179,1,1.441806,2,0,3,2,1,0,0
3,-0.085786,0,1,1,1,1,1,0,0,0,...,-0.445132,1,0.361704,2,1,7,3,1,0,0
4,0.025231,0,1,1,0,1,1,0,0,0,...,-0.311506,1,1.904706,3,2,7,3,1,0,0


In [42]:
# Print every column name together with its distinct values after scaling
for column_name, column_values in df.items():
    print(column_name, ": ", column_values.unique())

age :  [-1.19596124 -0.52985625 -0.08578625  0.02523125  0.13624875  0.24726625
  0.35828375  0.46930125  0.58031875  0.69133625  0.80235374  1.02438874
  1.13540624 -3.52732873 -3.41631123 -2.30613623 -1.97308374 -1.75104874
 -1.30697874 -1.08494374 -0.97392624 -0.64087375 -0.41883875 -0.30782125
 -0.19680375  0.91337124  1.24642374  1.35744124  1.46845874  1.57947624
  2.35659873]
sex :  [0 1]
locality :  [1 0]
marital_status :  [1 0]
sleep :  [0 1]
depression :  [1 0]
hyperlipi :  [1 0]
smoking :  [0 1]
family_history :  [0 1]
f_history :  [0 1]
diabetes :  [1 0]
hypertension :  [0 1]
allergies :  [0 1]
blood_pressure :  [-0.75122061 -1.17888203 -0.746944    1.79336485  0.96370169  0.11265546
  0.54031688  1.82757776  0.97225492 -0.31928258  3.07677677  1.81902453
  1.39991634 -1.61082007  0.11693207  0.5445935   0.10837885]
thrombolysis :  [0 1]
blood_glucose_random :  [-1.02323899 -0.66071107 -0.58251877 -1.0161306  -0.44035096 -0.93793831
 -0.70336141 -0.7886621  -0.45456774 -0.1

## Now save the cleaned data to the processed folder

In [43]:
# Write the cleaned frame to disk; index=False keeps the row index out of the CSV
processed_csv_path = '../data/processed/heart_dataset_3_processed.csv'
df.to_csv(processed_csv_path, index=False)