# HEART DISEASE PREDICTION


#### 1. lMPORT REQUIRED LIBRARIES

Load Required Libraries

In [313]:
# import useful libraries from installed packages

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go   
import plotly.io as pio
pio.renderers.default = "browser"  # Set default renderer to browser
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Set default style for seaborn
sns.set(style="whitegrid")


#### 2 DATA INVESTIGATION

 Extraction, Transform and Load The Datasets

In [314]:
# Load datasets from CSV files into pandas DataFrames
df = pd.read_csv("../datasets/heart_disease.csv") 
# Load the second dataset into a DataFrame
df2 = pd.read_csv("../datasets/heart_disease_integrated.csv")

#### 3. DATA EXPLORATION

In [315]:
# Display the first 5 rows of the DataFrame
df.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [316]:
df.tail()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
1185,45,1,1,110,264,0,0,132,0,1.2,2,1
1186,68,1,4,144,193,1,0,141,0,3.4,2,1
1187,57,1,4,130,131,0,0,115,1,1.2,2,1
1188,57,0,2,130,236,0,2,174,0,0.0,2,1
1189,38,1,3,138,175,0,0,173,0,0.0,1,0


In [317]:
# Displays the shape attribute of the DataFrame
df.shape

(1190, 12)

In [318]:
# Display concise summary of the DataFrame 'df'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB


In [319]:
# Display the columns of the DataFrame df
df.columns

Index(['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
       'fasting blood sugar', 'resting ecg', 'max heart rate',
       'exercise angina', 'oldpeak', 'ST slope', 'target'],
      dtype='object')

In [320]:
# Explore datatypes of the columns in the DataFrame

df.dtypes

age                      int64
sex                      int64
chest pain type          int64
resting bp s             int64
cholesterol              int64
fasting blood sugar      int64
resting ecg              int64
max heart rate           int64
exercise angina          int64
oldpeak                float64
ST slope                 int64
target                   int64
dtype: object

#### 5. EXPLORATORY DATA ANALYSIS

I. EDA Using Y data profiling

In [321]:
# EDA using ydata profiling
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Heart Disease Dataset Profiling Report", explorative=True)
profile.to_file("heart_disease_profiling_report.html")  # Save the report to an HTML file


Summarize dataset: 100%|██████████| 46/46 [00:04<00:00,  9.81it/s, Completed]                             
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 74.96it/s]


II.  Basic Summary statistics

In [322]:
# Display basic summary statistics of the DataFrame
df.describe()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,53.720168,0.763866,3.232773,132.153782,210.363866,0.213445,0.698319,139.732773,0.387395,0.922773,1.62437,0.528571
std,9.358203,0.424884,0.93548,18.368823,101.420489,0.409912,0.870359,25.517636,0.48736,1.086337,0.610459,0.499393
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,3.0,120.0,188.0,0.0,0.0,121.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,4.0,130.0,229.0,0.0,0.0,140.5,0.0,0.6,2.0,1.0
75%,60.0,1.0,4.0,140.0,269.75,0.0,2.0,160.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [323]:
# Include categorical columns in the summary statistics
df.describe(include='category')

ValueError: No objects to concatenate

#### 6. DATA CLEANING AND NORMALISATION

In [326]:
# explore the columns in the DataFrame
df.columns

Index(['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
       'fasting blood sugar', 'resting ecg', 'max heart rate',
       'exercise angina', 'oldpeak', 'ST slope', 'target'],
      dtype='object')

Rename Columns

In [327]:
# rename the 'resting bp s' column to 'resting systolic'
df.rename(columns={'resting bp s': 'resting systolic bp'}, inplace=True)
# rename the oldpeak column to ST depression
df.rename(columns={'oldpeak': 'ST depression'}, inplace=True)
# rename the 'resting ecg' column to 'resting ECG'
df.rename(columns={'resting ecg': 'resting ECG'}, inplace=True)
# check the columns again to confirm the changes
df.columns

Index(['age', 'sex', 'chest pain type', 'resting systolic bp', 'cholesterol',
       'fasting blood sugar', 'resting ECG', 'max heart rate',
       'exercise angina', 'ST depression', 'ST slope', 'target'],
      dtype='object')

Confirm First 5 and Last 5 Rows Of The Dataframe df

In [328]:
df.head()  # Display the first few rows of the DataFrame

Unnamed: 0,age,sex,chest pain type,resting systolic bp,cholesterol,fasting blood sugar,resting ECG,max heart rate,exercise angina,ST depression,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [329]:

df.tail()  # Display the last few rows of the DataFrame

Unnamed: 0,age,sex,chest pain type,resting systolic bp,cholesterol,fasting blood sugar,resting ECG,max heart rate,exercise angina,ST depression,ST slope,target
1185,45,1,1,110,264,0,0,132,0,1.2,2,1
1186,68,1,4,144,193,1,0,141,0,3.4,2,1
1187,57,1,4,130,131,0,0,115,1,1.2,2,1
1188,57,0,2,130,236,0,2,174,0,0.0,2,1
1189,38,1,3,138,175,0,0,173,0,0.0,1,0


Fix Or Normalise Categorical Datatypes

In [330]:
# convert to categorical datatypes

df['sex'] = df['sex'].astype('category')
df['chest pain type'] = df['chest pain type'].astype('category')
df['fasting blood sugar'] = df['fasting blood sugar'].astype('category')
df['resting ECG'] = df['resting ECG'].astype('category')
df['exercise angina'] = df['exercise angina'].astype('category')
df['ST slope'] = df['ST slope'].astype('category')
df['target'] = df['target'].astype('category')

# Display the updated DataFrame with new data types
df.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  1190 non-null   int64   
 1   sex                  1190 non-null   category
 2   chest pain type      1190 non-null   category
 3   resting systolic bp  1190 non-null   int64   
 4   cholesterol          1190 non-null   int64   
 5   fasting blood sugar  1190 non-null   category
 6   resting ECG          1190 non-null   category
 7   max heart rate       1190 non-null   int64   
 8   exercise angina      1190 non-null   category
 9   ST depression        1190 non-null   float64 
 10  ST slope             1190 non-null   category
 11  target               1190 non-null   category
dtypes: category(7), float64(1), int64(4)
memory usage: 55.8 KB


Check For Missing Values and Replace Missing Values

In [331]:
# Check for missing values in the DataFrame
df.isnull().sum()

age                    0
sex                    0
chest pain type        0
resting systolic bp    0
cholesterol            0
fasting blood sugar    0
resting ECG            0
max heart rate         0
exercise angina        0
ST depression          0
ST slope               0
target                 0
dtype: int64

Remove Duplicates

In [332]:
# remove duplicates from the DataFrame
df.drop_duplicates(inplace=True)

In [333]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 918 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  918 non-null    int64   
 1   sex                  918 non-null    category
 2   chest pain type      918 non-null    category
 3   resting systolic bp  918 non-null    int64   
 4   cholesterol          918 non-null    int64   
 5   fasting blood sugar  918 non-null    category
 6   resting ECG          918 non-null    category
 7   max heart rate       918 non-null    int64   
 8   exercise angina      918 non-null    category
 9   ST depression        918 non-null    float64 
 10  ST slope             918 non-null    category
 11  target               918 non-null    category
dtypes: category(7), float64(1), int64(4)
memory usage: 50.3 KB


#### 7. DATA AUGMENTATION

Explore Second Datasets df2, Rename Required Columns and Add Required Columns to First Datasets (df)

In [334]:
# Note Key provided manually by the data collector and will be added to the ReadMe file
# Load the second dataset into a DataFrame and display information about it
df2 = pd.read_csv("../datasets/heart_disease_integrated.csv")
df2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 76 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       898 non-null    int64  
 1   0       898 non-null    int64  
 2   63      898 non-null    int64  
 3   1.1     898 non-null    int64  
 4   -9      898 non-null    int64  
 5   -9.1    898 non-null    int64  
 6   -9.2    898 non-null    int64  
 7   -9.3    898 non-null    int64  
 8   1.2     898 non-null    int64  
 9   145     898 non-null    int64  
 10  1.3     898 non-null    int64  
 11  233     898 non-null    int64  
 12  -9.4    898 non-null    int64  
 13  50      898 non-null    int64  
 14  20      898 non-null    int64  
 15  1.4     898 non-null    int64  
 16  -9.5    898 non-null    int64  
 17  1.5     898 non-null    int64  
 18  2       898 non-null    int64  
 19  2.1     898 non-null    int64  
 20  3       898 non-null    int64  
 21  81      898 non-null    int64  
 22  0.

In [335]:
# explore the shape of the DataFrame DF2
df2.shape

(898, 76)

Rename Two Useful Columns For This Project

In [336]:
# rename columns from the second DataFrame 
df2.rename(columns={
    '1.3': 'hypertension',
    '1.5': 'family history'
}, inplace=True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 76 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   1               898 non-null    int64  
 1   0               898 non-null    int64  
 2   63              898 non-null    int64  
 3   1.1             898 non-null    int64  
 4   -9              898 non-null    int64  
 5   -9.1            898 non-null    int64  
 6   -9.2            898 non-null    int64  
 7   -9.3            898 non-null    int64  
 8   1.2             898 non-null    int64  
 9   145             898 non-null    int64  
 10  hypertension    898 non-null    int64  
 11  233             898 non-null    int64  
 12  -9.4            898 non-null    int64  
 13  50              898 non-null    int64  
 14  20              898 non-null    int64  
 15  1.4             898 non-null    int64  
 16  -9.5            898 non-null    int64  
 17  family history  898 non-null    int

Add Renamed Columns To First Datasets df

In [337]:
# add columns 'hypertension', 'family history' from the second DataFrame to the first DataFrame
min_len = min(len(df), len(df2))
df.loc[:, 'hypertension'] = np.nan
df.loc[:, 'family history'] = np.nan
df.iloc[:min_len, df.columns.get_loc('hypertension')] = df2['hypertension'].values[:min_len]
df.iloc[:min_len, df.columns.get_loc('family history')] = df2['family history'].values[:min_len]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 918 entries, 0 to 1189
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  918 non-null    int64   
 1   sex                  918 non-null    category
 2   chest pain type      918 non-null    category
 3   resting systolic bp  918 non-null    int64   
 4   cholesterol          918 non-null    int64   
 5   fasting blood sugar  918 non-null    category
 6   resting ECG          918 non-null    category
 7   max heart rate       918 non-null    int64   
 8   exercise angina      918 non-null    category
 9   ST depression        918 non-null    float64 
 10  ST slope             918 non-null    category
 11  target               918 non-null    category
 12  hypertension         898 non-null    float64 
 13  family history       898 non-null    float64 
dtypes: category(7), float64(3), int64(4)
memory usage: 64.7 KB


In [338]:
# convert float to categorical datatypes of newly added columns
df['hypertension'] = df['hypertension'].astype('category')
df['family history'] = df['family history'].astype('category')
df['ST depression'] = df['ST depression'].astype('category')
# Display the updated DataFrame with new data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 918 entries, 0 to 1189
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  918 non-null    int64   
 1   sex                  918 non-null    category
 2   chest pain type      918 non-null    category
 3   resting systolic bp  918 non-null    int64   
 4   cholesterol          918 non-null    int64   
 5   fasting blood sugar  918 non-null    category
 6   resting ECG          918 non-null    category
 7   max heart rate       918 non-null    int64   
 8   exercise angina      918 non-null    category
 9   ST depression        918 non-null    category
 10  ST slope             918 non-null    category
 11  target               918 non-null    category
 12  hypertension         898 non-null    category
 13  family history       898 non-null    category
dtypes: category(10), int64(4)
memory usage: 48.6 KB


Check For Missing Values

In [339]:
# check for missing values again
df.isnull().sum()

age                     0
sex                     0
chest pain type         0
resting systolic bp     0
cholesterol             0
fasting blood sugar     0
resting ECG             0
max heart rate          0
exercise angina         0
ST depression           0
ST slope                0
target                  0
hypertension           20
family history         20
dtype: int64

Replace Missing Values and Check Again

In [340]:
# replace missing values in 'hypertension' and 'family history' with with mode of the respective columns
df['hypertension'].fillna(df['hypertension'].mode()[0], inplace=True)
df['family history'].fillna(df['family history'].mode()[0], inplace=True)

In [341]:
# check missing values again
df.isnull().sum()

age                    0
sex                    0
chest pain type        0
resting systolic bp    0
cholesterol            0
fasting blood sugar    0
resting ECG            0
max heart rate         0
exercise angina        0
ST depression          0
ST slope               0
target                 0
hypertension           0
family history         0
dtype: int64

Convert Newly Added Columns To Categorical DataTypes

In [342]:
# convert float to categorical datatypes of newly added columns
df['hypertension'] = df['hypertension'].astype('category')
df['family history'] = df['family history'].astype('category')
# Display the updated DataFrame with new data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 918 entries, 0 to 1189
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  918 non-null    int64   
 1   sex                  918 non-null    category
 2   chest pain type      918 non-null    category
 3   resting systolic bp  918 non-null    int64   
 4   cholesterol          918 non-null    int64   
 5   fasting blood sugar  918 non-null    category
 6   resting ECG          918 non-null    category
 7   max heart rate       918 non-null    int64   
 8   exercise angina      918 non-null    category
 9   ST depression        918 non-null    category
 10  ST slope             918 non-null    category
 11  target               918 non-null    category
 12  hypertension         918 non-null    category
 13  family history       918 non-null    category
dtypes: category(10), int64(4)
memory usage: 48.6 KB


8. DATA VISUALISATION

In [343]:
# Numeric columns visualization
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
# Display the numeric columns
numeric_columns

['age', 'resting systolic bp', 'cholesterol', 'max heart rate']

In [346]:
# Visualize age distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['age'], kde=True, bins=30)
plt.title('Distribution of Age')
plt.show()