In [1]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation and chained-assignment warnings; prefer a narrower filter.
warnings.filterwarnings("ignore")

# Dataset Overview

## Heart Disease Cleveland UCI

https://www.kaggle.com/datasets/cherngs/heart-disease-cleveland-uci?select=heart_cleveland_upload.csv

There are 13 feature attributes plus 1 target column (`condition`):

1. age: age in years
2. sex: sex (1 = male; 0 = female)
3. cp: chest pain type
* Value 0: typical angina
* Value 1: atypical angina
* Value 2: non-anginal pain
* Value 3: asymptomatic
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholesterol in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
* Value 0: normal
* Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
* Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak = ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
* Value 0: upsloping
* Value 1: flat
* Value 2: downsloping
12. ca: number of major vessels (0-3) colored by fluoroscopy
13. thal: 0 = normal; 1 = fixed defect; 2 = reversible defect
14. condition: 0 = no disease, 1 = disease

In [2]:
# Load the Heart Disease Cleveland UCI export and preview the first rows.
df_cleveland = pd.read_csv(
    filepath_or_buffer='../../Data/UCL_ML-Respo/heart_cleveland_upload.csv',
)
df_cleveland.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [3]:
# (rows, columns) of the Cleveland frame as loaded.
df_cleveland.shape

(297, 14)

In [4]:
# Original column labels, inspected before renaming them.
df_cleveland.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'condition'],
      dtype='object')

In [5]:
# Align the Cleveland column labels with the names used by the Heart Failure
# dataset. 'ca' and 'thal' are not in the mapping, so they keep their names.
df_cleveland = df_cleveland.rename(
    columns={
        'age': 'Age',
        'sex': 'Sex',
        'cp': 'ChestPainType',
        'trestbps': 'RestingBP',
        'chol': 'Cholesterol',
        'fbs': 'FastingBS',
        'restecg': 'RestingECG',
        'thalach': 'MaxHR',
        'exang': 'ExerciseAngina',
        'oldpeak': 'Oldpeak',
        'slope': 'ST_Slope',
        'condition': 'HeartDisease',
    }
)
df_cleveland.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,ca,thal,HeartDisease
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [6]:
# Remove the two columns that the Heart Failure dataset does not provide.
df_cleveland = df_cleveland.drop(columns=['ca', 'thal'])
df_cleveland.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,69,1,0,160,234,1,2,131,0,0.1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0


In [7]:
# (rows, columns) after dropping 'ca' and 'thal'.
df_cleveland.shape

(297, 12)

----
## Heart Failure Prediction Dataset

https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

1. Age: age of the patient [years]
2. Sex: sex of the patient [M: Male, F: Female]
3. ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
4. RestingBP: resting blood pressure [mm Hg]
5. Cholesterol: serum cholesterol [mg/dl]
6. FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
7. RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
8. MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
9. ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
10. Oldpeak: ST depression induced by exercise relative to rest [numeric value]
11. ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
12. HeartDisease: output class [1: heart disease, 0: Normal]

In [8]:
# Load the Heart Failure Prediction dataset and preview the first rows.
df_failure = pd.read_csv(
    filepath_or_buffer='../../Data/UCL_ML-Respo/heart_failure_dataset.csv',
)
df_failure.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [9]:
# (rows, columns) of the Heart Failure frame as loaded.
df_failure.shape

(918, 12)

In [10]:
# Column labels of the Heart Failure frame.
df_failure.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [11]:
# Work on an independent copy so the raw df_failure frame stays untouched.
df1 = df_failure.copy(deep=True)
df1.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [12]:
# Integer codes for each string-valued column of the Heart Failure data.
# The codes follow the numeric encoding documented for the Cleveland dataset
# earlier in this notebook, so the frames can later be stacked.
CATEGORY_CODES = {
    'Sex': {'F': 0, 'M': 1},
    'ChestPainType': {'TA': 0, 'ATA': 1, 'NAP': 2, 'ASY': 3},
    'RestingECG': {'Normal': 0, 'ST': 1, 'LVH': 2},
    'ExerciseAngina': {'N': 0, 'Y': 1},
    'ST_Slope': {'Up': 0, 'Flat': 1, 'Down': 2},
}

# Assign whole columns instead of using chained indexing
# (df1[col][mask] = value): chained assignment may write to a temporary copy
# and is a silent no-op under pandas Copy-on-Write.
# Values without a code are passed through unchanged, so re-running this cell
# on already-encoded data leaves it as it is. When every value is mapped,
# pandas infers an integer dtype for the column instead of keeping `object`.
for column, codes in CATEGORY_CODES.items():
    df1[column] = df1[column].map(lambda value: codes.get(value, value))

In [13]:
# Preview df1 after the categorical columns were encoded as integers.
df1.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,0,172,0,0.0,0,0
1,49,0,2,160,180,0,0,156,0,1.0,1,1
2,37,1,1,130,283,0,1,98,0,0.0,0,0
3,48,0,3,138,214,0,0,108,1,1.5,1,1
4,54,1,2,150,195,0,0,122,0,0.0,0,0


In [14]:
# (rows, columns) of the encoded Heart Failure frame.
df1.shape

(918, 12)

## Heart Disease Dataset

https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset?select=heart.csv

1. age
2. sex (1 = male; 0 = female)
3. chest pain type (4 values)
4. resting blood pressure
5. serum cholesterol in mg/dl
6. fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by fluoroscopy
13. thal: 0 = normal; 1 = fixed defect; 2 = reversible defect

In [15]:
# Load the Heart Disease dataset and preview the first rows.
df_disease = pd.read_csv(
    filepath_or_buffer='../../Data/UCL_ML-Respo/heart_disease_dataset.csv',
)
df_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [16]:
# (rows, columns) of the Heart Disease frame as loaded.
df_disease.shape

(1025, 14)

In [17]:
# Original column labels, inspected before renaming them.
df_disease.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [18]:
# Align the Heart Disease column labels with the names used by the Heart
# Failure dataset. 'ca' and 'thal' are not in the mapping, so they keep their
# names.
df_disease = df_disease.rename(
    columns={
        'age': 'Age',
        'sex': 'Sex',
        'cp': 'ChestPainType',
        'trestbps': 'RestingBP',
        'chol': 'Cholesterol',
        'fbs': 'FastingBS',
        'restecg': 'RestingECG',
        'thalach': 'MaxHR',
        'exang': 'ExerciseAngina',
        'oldpeak': 'Oldpeak',
        'slope': 'ST_Slope',
        'target': 'HeartDisease',
    }
)
df_disease.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,ca,thal,HeartDisease
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [19]:
# Remove the two columns that the Heart Failure dataset does not provide.
df_disease = df_disease.drop(columns=['ca', 'thal'])

In [20]:
# (rows, columns) after dropping 'ca' and 'thal'.
df_disease.shape

(1025, 12)

# Joining the 3 datasets to form a new one

In [21]:
# Side-by-side check before concatenation: first rows of the Cleveland frame.
df_cleveland.head(3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,69,1,0,160,234,1,2,131,0,0.1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0


In [22]:
# Side-by-side check before concatenation: first rows of the encoded Heart Failure frame.
df1.head(3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,0,172,0,0.0,0,0
1,49,0,2,160,180,0,0,156,0,1.0,1,1
2,37,1,1,130,283,0,1,98,0,0.0,0,0


In [23]:
# Side-by-side check before concatenation: first rows of the Heart Disease frame.
df_disease.head(3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,52,1,0,125,212,0,1,168,0,1.0,2,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0


In [24]:
# Stack the three harmonised frames row-wise; ignore_index=True discards the
# per-frame row labels and renumbers the result from 0.
frames = [df_cleveland, df1, df_disease]
combined_df = pd.concat(frames, axis=0, ignore_index=True)
combined_df.shape

(2240, 12)

In [25]:
# Preview the first rows of the combined frame.
combined_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,69,1,0,160,234,1,2,131,0,0.1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0


In [26]:
# Count missing values per column in the combined frame.
combined_df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [29]:
# Write the merged dataset to CSV in the same directory as the inputs,
# leaving out the row index.
combined_df.to_csv(
    path_or_buf='../../Data/UCL_ML-Respo/UCLCombined_dataset.csv',
    index=False,
)