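# Heart disease prediction

This notebook loads patient records from `Heart.xlsx`, cleans and encodes them, fits Bernoulli and Gaussian naive Bayes classifiers to predict the `Heart Disease` label, and then explores the data (EDA).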

In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from ydata_profiling import ProfileReport
# Generating the profile report is slow; set this flag to False to skip report generation.
togeneratereport = True
sb.set() # set the default Seaborn style for graphics


In [None]:
# Load the patient records from the Excel workbook and preview the first rows.
# NOTE(review): pandas' default NA parsing may turn literal "None" cells into NaN — confirm
# before relying on the string comparisons against 'None' used later in this notebook.
heartdata = pd.read_excel("Heart.xlsx")
heartdata.head()

In [None]:
# Show the column dtypes and non-null counts of the freshly loaded data.
heartdata.info()

In [None]:
# Optional (slow) profiling report of the data before cleaning, rendered inline.
if togeneratereport:
    profile = ProfileReport(heartdata, title="Profiling Report Before Data Cleaning")
    profile.to_notebook_iframe()

### Data Cleaning 

In [None]:
# Drop the columns considered irrelevant here: 'Name' and 'Treatment'.
heartdata = heartdata.drop(columns=['Name', 'Treatment'])

In [None]:
# Re-check the column listing after dropping 'Name' and 'Treatment'.
heartdata.info()

In [None]:
# Lift pandas' display limits so that all columns and rows are shown.
# NOTE: these options are global and stay in effect for every later cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
heartdata.describe()

In [None]:
#Remove Outliers for Age

def remove_outliers(df, df_col, factor=1.5):
    """Return the rows of ``df`` whose ``df_col`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``df_col`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    df_col : pandas.Series
        Column used to compute the fences; it is used as a boolean-mask source,
        so it must share ``df``'s index (typically ``df[<column name>]``).
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the fences. Rows whose ``df_col`` value is
        missing are dropped as well, because comparisons with NaN are False.
    """
    # nanpercentile ignores missing values. np.percentile returns NaN for both
    # quartiles as soon as a single NaN is present, which made every comparison
    # below False and silently dropped all rows.
    q1, q3 = np.nanpercentile(df_col, [25, 75])
    iqr = q3 - q1
    low_bound = q1 - (factor * iqr)
    upp_bound = q3 + (factor * iqr)
    return df[(df_col >= low_bound) & (df_col <= upp_bound)]

In [None]:
# Keep only the rows whose Age lies within the IQR fences, then re-check the summary.
heartdata = remove_outliers(heartdata, heartdata["Age"])
heartdata.describe()

In [None]:
# Encode the "Heart Disease" label as integers: Absence -> 0, Presence -> 1.
# Any value that is not a key of the mapping becomes NaN.
disease_codes = {"Absence": 0, "Presence": 1}
heartdata["Heart Disease"] = heartdata["Heart Disease"].map(disease_codes)
heartdata["Heart Disease"].value_counts()

In [None]:
# Encode Gender as integers: Male -> 0, Female -> 1 (values not in the mapping become NaN).
heartdata["Gender"] = heartdata["Gender"].map({"Male":0, "Female":1})

In [None]:
# Inspect the distinct blood-culture results before encoding them.
heartdata["Blood culture"].value_counts()

In [None]:
def blood_culture(data):
    """Map a blood-culture result to a category code by type of organism.

    Codes: None (0), Staphylococcus (1), Streptococcus (2), Candida (3), Other (4).
    The organism names are matched as case-sensitive substrings, checked in that order.

    Parameters
    ----------
    data : str or missing value
        One cell of the "Blood culture" column.

    Returns
    -------
    int
        The category code. Missing cells (``None``/NaN) are coded 0 — this assumes
        a missing result means no organism was found; TODO confirm against the data.
        Any other non-string value is coded 4 (Other).
    """
    if not isinstance(data, str):
        # A substring test on a float NaN raises TypeError, so handle
        # non-string cells before any `in` check.
        return 0 if pd.isna(data) else 4
    if data == 'None':
        return 0
    for code, organism in enumerate(('Staphylococcus', 'Streptococcus', 'Candida'), start=1):
        if organism in data:
            return code
    return 4
# Replace each blood-culture string with its category code and check the resulting counts.
heartdata["Blood culture"]=heartdata["Blood culture"].map(blood_culture)
heartdata["Blood culture"].value_counts()

In [None]:
# Inspect Echocardiogram values.
# TODO: this information is partially represented in other columns — decide whether to drop it.
heartdata["Echocardiogram"].value_counts()

In [None]:
# Inspect EKG values. TODO: too many distinct variations — decide whether to drop the column.
heartdata["EKG"].value_counts()

In [None]:
# Inspect the distinct Cardiac CT values.
heartdata["Cardiac CT"].value_counts()

In [None]:
# Inspect Chest x-ray values.
# TODO: many cells are empty — either drop the column or assume normal lung structure.
heartdata["Chest x-ray"].value_counts()

In [None]:
# Inspect Previous illnesses values; this column is converted to 1s and 0s afterwards.
heartdata["Previous illnesses"].value_counts()

In [None]:
def previous_illnesses(data):
    """Return 1 if a previous illness is recorded in ``data``, otherwise 0.

    Parameters
    ----------
    data : str or missing value
        One cell of the "Previous illnesses" column.

    Returns
    -------
    int
        0 for the literal string 'None' and for missing cells (``None``/NaN),
        1 for anything else. Treating missing cells as "no previous illness"
        assumes they stem from 'None' entries being parsed as NaN on load —
        TODO confirm against the source file.
    """
    if not isinstance(data, str):
        # NaN != 'None', so without this branch a missing cell was counted as an illness.
        return 0 if pd.isna(data) else 1
    return 0 if data == 'None' else 1
# Replace each entry with the 0/1 flag computed by previous_illnesses and check the counts.
heartdata["Previous illnesses"]=heartdata["Previous illnesses"].map(previous_illnesses)
heartdata["Previous illnesses"].value_counts()

In [None]:
# Inspect Pulmonary function tests values. TODO: too many empty cells — drop the column.
heartdata["Pulmonary function tests"].value_counts()

In [None]:
# Inspect Spirometry values. TODO: too many empty cells — drop the column.
heartdata["Spirometry"].value_counts()

In [None]:
# Summary statistics after the encoding steps above.
heartdata.describe()

In [None]:
def stenosis(data):
    """Return 1 if any of the four stenosis flags in ``data`` equals 1, else 0.

    ``data`` is indexed by column name (e.g. one DataFrame row).
    """
    flag_columns = (
        "Mitral stenosis",
        "Aortic stenosis",
        "Tricuspid stenosis",
        "Pulmonary stenosis",
    )
    # any() checks the flags in the listed order and stops at the first one equal to 1.
    return int(any(data[column] == 1 for column in flag_columns))
# Row-wise: combine the individual stenosis flags into a single "Stenosis" column.
heartdata["Stenosis"] = heartdata.apply(stenosis, axis=1)

In [None]:
def cardiomyopathy(data):
    """Return 1 if any of the five cardiomyopathy flags in ``data`` equals 1, else 0.

    ``data`` is indexed by column name (e.g. one DataFrame row).
    """
    flag_columns = (
        "Dilated cardiomyopathy",
        "Hypertrophic cardiomyopathy",
        "Restrictive cardiomyopathy",
        "Arrhythmogenic right ventricular cardiomyopathy",
        "Takotsubo cardiomyopathy",
    )
    # any() checks the flags in the listed order and stops at the first one equal to 1.
    return int(any(data[column] == 1 for column in flag_columns))
# Row-wise: combine the individual cardiomyopathy flags into a single "Cardiomyopathy" column.
heartdata["Cardiomyopathy"] = heartdata.apply(cardiomyopathy, axis=1)

In [None]:
# Summary statistics including the new "Stenosis" and "Cardiomyopathy" columns.
heartdata.describe()

In [None]:
# List the current columns (info() prints each column's position) as a reference for
# dropping the non-binary variables by position.
heartdata.info()

In [None]:
# Build the feature table used for the Bernoulli model by dropping columns by position
# (presumably the non-binary ones — confirm).
# NOTE(review): the positional indices depend on the exact column order of heartdata at
# this point; re-check them if the input file or any earlier cell changes.
cat_heartdata = heartdata.drop(heartdata.columns[[1, 5, 6, 7, 9, 10, 11, 19, 20, 21, 22, 31, 32, 
                                                  33, 34, 35, 36, 37, 38, 39, 41, 43, 44]], axis=1)
cat_heartdata.info()

In [None]:
# Separate the predictors (x) from the single-column target frame (y).
target_column = "Heart Disease"
x = cat_heartdata.drop(columns=[target_column])
y = cat_heartdata[[target_column]]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The fixed random_state makes the split, and
# therefore the scores reported below, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Fit a Bernoulli naive Bayes classifier on the training split; np.ravel flattens the
# single-column y_train frame into the 1-D label array passed to fit.
BNBclf = BernoulliNB()
BNBclf.fit(x_train, np.ravel(y_train))

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)


# Predict on both splits with the fitted Bernoulli naive Bayes model.
y_train_pred = BNBclf.predict(x_train)
y_test_pred = BNBclf.predict(x_test)

# accuracy_score takes (y_true, y_pred); same argument order as confusion_matrix below.
print("Train accuracy\t:", accuracy_score(y_train, y_train_pred))
print("Test accuracy\t:", accuracy_score(y_test, y_test_pred))

# One confusion matrix per split (rows: true label, columns: predicted label).
labels = [0, 1]
for split_name, y_true, y_pred in (
    ("train", y_train, y_train_pred),
    ("test", y_test, y_test_pred),
):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()
    # Title each figure so the two matrices can be told apart when skimming.
    disp.ax_.set_title(f"Bernoulli NB confusion matrix ({split_name})")

In [None]:
# Pairwise correlations between all columns of the binary feature table.
correlations = cat_heartdata.corr()

# Draw the annotated heatmap on an explicitly created, large axes.
fig, ax = plt.subplots(figsize=(40, 40))
sb.heatmap(
    correlations,
    xticklabels=correlations.columns,
    yticklabels=correlations.columns,
    annot=True,
    ax=ax,
)

In [None]:
# Continuous predictors plus the target, selected by name instead of by position so the
# selection does not silently change when the column order does. These are exactly the
# eight columns that the following cells read from cont_heartdata.
continuous_columns = [
    "Age",
    "Systolic",
    "Diastolic",
    "Heart rate (bpm)",
    "Cholesterol level (mg/dL)",
    "LDL level (mg/dL)",
    "HDL level (mg/dL)",
]
cont_heartdata = heartdata[continuous_columns + ["Heart Disease"]]
cont_heartdata.info()

In [None]:
# Apply the IQR outlier filter once per continuous column, in sequence; each pass works
# on the rows that survived the previous one.
for column in (
    "Age",
    "Systolic",
    "Diastolic",
    "Heart rate (bpm)",
    "Cholesterol level (mg/dL)",
    "LDL level (mg/dL)",
    "HDL level (mg/dL)",
):
    cont_heartdata = remove_outliers(cont_heartdata, cont_heartdata[column])
cont_heartdata.info()

In [None]:
# Separate the continuous predictors (x) from the single-column target frame (y).
x = cont_heartdata.drop(columns=["Heart Disease"])
# info() prints its report itself and returns None, so it is not wrapped in print()
# (which only appended a stray "None" line to the output).
x.info()
y = pd.DataFrame(cont_heartdata["Heart Disease"])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The fixed random_state makes the split, and
# therefore the scores reported below, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on the training split; np.ravel flattens the
# single-column y_train frame into the 1-D label array passed to fit.
GNBclf = GaussianNB()
GNBclf.fit(x_train, np.ravel(y_train))

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)


# Predict on both splits with the fitted Gaussian naive Bayes model.
y_train_pred = GNBclf.predict(x_train)
y_test_pred = GNBclf.predict(x_test)

# accuracy_score takes (y_true, y_pred); same argument order as confusion_matrix below.
print("Train accuracy\t:", accuracy_score(y_train, y_train_pred))
print("Test accuracy\t:", accuracy_score(y_test, y_test_pred))

# One confusion matrix per split (rows: true label, columns: predicted label).
labels = [0, 1]
for split_name, y_true, y_pred in (
    ("train", y_train, y_train_pred),
    ("test", y_test, y_test_pred),
):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()
    # Title each figure so the two matrices can be told apart when skimming.
    disp.ax_.set_title(f"Gaussian NB confusion matrix ({split_name})")

In [None]:
### EDA

In [None]:
# Optional (slow) profiling report of heartdata in its current, cleaned state.
if togeneratereport:
    profile = ProfileReport(heartdata, title="Profiling Report")
    profile.to_notebook_iframe()

In [None]:
# Keep only the columns of heartdata whose dtype is int64.
dataNum = heartdata.select_dtypes(include = np.int64)

In [None]:
# Two-bin histogram of the 'Chest pain' column, drawn on an explicitly created axes.
chestpain = dataNum[['Chest pain']]
f, ax = plt.subplots(figsize=(24, 12))
sb.histplot(data=chestpain, bins=2, ax=ax)

In [None]:
target = 'Heart Disease'

# Columns with exactly two distinct non-null values.
binary_vars = [col for col in heartdata.columns if heartdata[col].nunique() == 2]

# The encoded target is two-valued itself, so it is normally already in binary_vars;
# append it only when missing so it does not appear twice in the correlation matrix.
corr_cols = binary_vars if target in binary_vars else binary_vars + [target]
corr_matrix = heartdata[corr_cols].corr()
f, ax = plt.subplots(figsize=(30, 30))
sb.heatmap(corr_matrix, cmap='coolwarm', annot=True, ax=ax)

# One count plot per binary variable, split by the target. The target is skipped here:
# plotting it against itself as hue carries no information.
plot_vars = [var for var in binary_vars if var != target]
if plot_vars:
    # squeeze=False keeps `axes` 2-D even for a single row, so indexing always works.
    fig, axes = plt.subplots(
        nrows=len(plot_vars), figsize=(8, 6*len(plot_vars)), squeeze=False
    )
    for axis, var in zip(axes[:, 0], plot_vars):
        sb.countplot(x=var, hue=target, data=heartdata, ax=axis)
        axis.set_xlabel(var)
        axis.set_ylabel('Count')
        axis.legend(title='Heart Disease', loc='upper right')

    plt.tight_layout()
    plt.show()

In [None]:
# int64 columns with more than two distinct values.
numeric_cols = [
    col
    for col in heartdata.columns
    if heartdata[col].nunique() > 2 and heartdata[col].dtype == 'int64'
]

# One boxplot per numeric column, grouped by presence/absence of heart disease.
for column in numeric_cols:
    plt.figure()
    box_ax = sb.boxplot(x='Heart Disease', y=column, data=heartdata)
    box_ax.set_title(column)
    plt.show()

In [None]:
# One indicator column per entry of the ', '-separated 'Medications' strings.
medications = heartdata['Medications'].str.get_dummies(sep=', ')

# Correlate the binary variables with the medication indicators.
data2 = pd.concat([heartdata[binary_vars], medications], axis=1)
corr_matrix = data2.corr()
mask = corr_matrix.abs().gt(0.5)

# Heatmap showing only the cells with |correlation| > 0.5; all other cells are blanked (NaN).
f, ax = plt.subplots(figsize=(30, 30))
sb.heatmap(corr_matrix.where(mask), cmap='coolwarm', annot=True, ax=ax)

In [None]:
# Positions of all cells whose absolute correlation exceeds 0.5.
corr_indices = np.where(np.abs(corr_matrix) > 0.5)

# Print each strongly correlated pair of distinct variables once.
for i, j in zip(*corr_indices):
    value = corr_matrix.iloc[i, j]
    # i < j keeps the upper triangle only: it skips the diagonal and avoids printing the
    # symmetric (j, i) duplicate. The absolute value is used so that strong negative
    # correlations, which the > 0.5 selection above already includes, are reported too.
    # Perfect correlations (|r| == 1) remain excluded, as before.
    if i < j and abs(value) < 1:
        print(f"Correlation of {value:.2f} between {corr_matrix.columns[i]} and {corr_matrix.columns[j]}")