## 

# Heart disease prediction

In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from ydata_profiling import ProfileReport
# Profile reports take a long time to generate; set the flag below to False to skip report generation
togeneratereport = True
sb.set() # set the default Seaborn style for graphics


In [None]:
heartdata = pd.read_excel("Heart.xlsx")
heartdata.head()

In [None]:
heartdata.info()

In [None]:
if togeneratereport:
    profile = ProfileReport(heartdata, title="Profiling Report Before Data Cleaning")
    profile.to_notebook_iframe()

### Data Cleaning 

In [None]:
# Drop irrelevant columns: Name, Treatment
heartdata = heartdata.drop(columns=['Name', 'Treatment'])

In [None]:
heartdata.info()

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
heartdata.describe()

In [None]:
#Remove Outliers for Age

def remove_outliers(df, df_col, factor=1.5):
    """Return the rows of ``df`` whose ``df_col`` value lies inside the IQR fences.

    The fences are ``q1 - factor * iqr`` and ``q3 + factor * iqr`` (both
    inclusive), where ``q1``/``q3`` are the 25th/75th percentiles of
    ``df_col`` and ``iqr = q3 - q1``.

    Args:
        df: DataFrame to filter.
        df_col: Column of values used to decide which rows to keep; it is
            compared element-wise and the result is used to index ``df``.
        factor: Multiplier applied to the IQR when computing the fences.
            Defaults to 1.5.

    Returns:
        A filtered DataFrame; ``df`` itself is not modified.
    """
    # nanpercentile ignores missing values; plain percentile would return NaN
    # for both quartiles as soon as one NaN is present, dropping every row.
    q1, q3 = np.nanpercentile(df_col, [25, 75])
    iqr = q3 - q1
    low_bound = q1 - factor * iqr
    upp_bound = q3 + factor * iqr
    # Comparisons against NaN are False, so rows with a missing value are dropped.
    return df[(df_col >= low_bound) & (df_col <= upp_bound)]

In [None]:
heartdata = remove_outliers(heartdata, heartdata["Age"])
heartdata.describe()

In [None]:
#Change "Heart Disease" column from string to numerical categorical data 
heartdata["Heart Disease"] = heartdata["Heart Disease"].map({"Absence":0, "Presence":1})
heartdata["Heart Disease"].value_counts()

In [None]:
heartdata["Blood culture"].value_counts()

In [None]:
"""
Sort strings into categorical data, categorize by type of bacteria: None(0), Staphylococcus(1), Streptococcus(2), 
Candida(3), Other(4)
"""
def blood_culture(data):
    """Encode a blood-culture result as an integer category.

    Categories: 0 = no organism (the text 'None' or a missing value),
    1 = contains 'Staphylococcus', 2 = contains 'Streptococcus',
    3 = contains 'Candida', 4 = anything else.

    Args:
        data: A single cell value from the blood-culture column.

    Returns:
        The integer category code.
    """
    if not isinstance(data, str):
        # A substring test on a non-text cell (e.g. NaN) raises TypeError.
        # NOTE(review): some pandas versions parse the literal text 'None' as
        # NA when reading files, so a missing cell is treated as "no organism"
        # -- confirm against the loaded data.
        return 0 if pd.isna(data) else 4
    if data == 'None':
        return 0
    # Order matters: the first organism name found in the text wins.
    organisms = ('Staphylococcus', 'Streptococcus', 'Candida')
    for code, organism in enumerate(organisms, start=1):
        if organism in data:
            return code
    return 4
heartdata["Blood culture"]=heartdata["Blood culture"].map(blood_culture)
heartdata["Blood culture"].value_counts()

In [None]:
heartdata["Echocardiogram"].value_counts() #this data is partially represented in other columns - drop? 

In [None]:
heartdata["EKG"].value_counts() #too many variations - drop col?

In [None]:
heartdata["Cardiac CT"].value_counts()

In [None]:
heartdata["Chest x-ray"].value_counts() #too many empty, drop col or assume normal lung structure? 

In [None]:
heartdata["Previous illnesses"].value_counts() #change to 1s and 0s 

In [None]:
def previous_illnesses(data):
    """Encode the previous-illness cell as a 0/1 flag.

    Args:
        data: A single cell value from the previous-illnesses column.

    Returns:
        0 when the cell is the text 'None' or is missing (None/NaN),
        1 for any other value.
    """
    # A missing cell is not equal to 'None', so without this check it would be
    # counted as "has a previous illness".
    # NOTE(review): assumes a missing cell means no recorded illness -- confirm.
    if pd.isna(data) or data == 'None':
        return 0
    return 1
heartdata["Previous illnesses"]=heartdata["Previous illnesses"].map(previous_illnesses)
heartdata["Previous illnesses"].value_counts()

In [None]:
heartdata["Pulmonary function tests"].value_counts() #too many empty, drop col

In [None]:
heartdata["Spirometry"].value_counts() #too many empty, drop col

In [None]:
heartdata.describe()

In [None]:
def stenosis(data):
    """Return 1 if any of the four stenosis columns of a row equals 1, else 0.

    The columns are checked in the order listed below and checking stops at
    the first one that equals 1.
    """
    stenosis_columns = (
        "Mitral stenosis",
        "Aortic stenosis",
        "Tricuspid stenosis",
        "Pulmonary stenosis",
    )
    return int(any(data[name] == 1 for name in stenosis_columns))
# Row-wise flag: 1 when stenosis() reports a match for the row, else 0.
heartdata["Stenosis"] = heartdata.apply(stenosis, axis=1)

In [None]:
def cardiomyopathy(data):
    """Return 1 if any of the five cardiomyopathy columns of a row equals 1, else 0.

    The columns are checked in the order listed below and checking stops at
    the first one that equals 1.
    """
    cardiomyopathy_columns = (
        "Dilated cardiomyopathy",
        "Hypertrophic cardiomyopathy",
        "Restrictive cardiomyopathy",
        "Arrhythmogenic right ventricular cardiomyopathy",
        "Takotsubo cardiomyopathy",
    )
    return int(any(data[name] == 1 for name in cardiomyopathy_columns))
# Row-wise flag: 1 when cardiomyopathy() reports a match for the row, else 0.
heartdata["Cardiomyopathy"] = heartdata.apply(cardiomyopathy, axis=1)

In [None]:
heartdata.describe()

In [None]:
heartdata.head()

### EDA

In [None]:
if togeneratereport:
    profile = ProfileReport(heartdata, title="Profiling Report")
    profile.to_notebook_iframe()

In [None]:
dataNum = heartdata.select_dtypes(include = np.int64)

In [None]:
chestpain = pd.DataFrame(dataNum['Chest pain'])
f = plt.figure(figsize=(24, 12))
sb.histplot(data=chestpain, bins=2)

In [None]:
# Every column that takes exactly two distinct values.
binary_vars = [col for col in heartdata.columns if heartdata[col].nunique() == 2]

# 'Heart Disease' may already be in binary_vars (it is two-valued itself), so
# de-duplicate to avoid a repeated column in the correlation matrix.
corr_cols = list(dict.fromkeys(binary_vars + ['Heart Disease']))
# numeric_only=True skips two-valued text columns, which corr() cannot convert to numbers.
corr_matrix = heartdata[corr_cols].corr(numeric_only=True)
f, ax = plt.subplots(figsize=(30, 30))
sb.heatmap(corr_matrix, cmap='coolwarm', annot=True, ax=ax)

# One count plot per binary variable, split by the target.
# The target itself is skipped: plotting it against its own hue carries no information.
plot_vars = [var for var in binary_vars if var != 'Heart Disease']
if plot_vars:
    # squeeze=False keeps `axes` two-dimensional even when there is a single subplot.
    fig, axes = plt.subplots(nrows=len(plot_vars), figsize=(8, 6*len(plot_vars)), squeeze=False)
    for var_ax, var in zip(axes[:, 0], plot_vars):
        sb.countplot(x=var, hue='Heart Disease', data=heartdata, ax=var_ax)
        var_ax.set_xlabel(var)
        var_ax.set_ylabel('Count')
        var_ax.legend(title='Heart Disease', loc='upper right')

    plt.tight_layout()
    plt.show()

In [None]:
# int64 columns with more than two distinct values.
numeric_cols = [
    col
    for col in heartdata.columns
    if heartdata[col].nunique() > 2 and heartdata[col].dtype == 'int64'
]

# One boxplot per numeric column, grouped by the 'Heart Disease' value.
for col in numeric_cols:
    fig, box_ax = plt.subplots()
    sb.boxplot(x='Heart Disease', y=col, data=heartdata, ax=box_ax)
    box_ax.set_title(col)
    plt.show()

In [None]:
# One indicator column per medication; each cell is split on ', '.
medications = heartdata['Medications'].str.get_dummies(sep=', ')

# Correlate the two-valued columns with the medication indicators.
data2 = pd.concat([heartdata[binary_vars], medications], axis=1)
# numeric_only=True skips two-valued text columns, which corr() cannot convert to numbers.
corr_matrix = data2.corr(numeric_only=True)
mask = corr_matrix.abs() > 0.5

# Plot heatmap showing only |correlation| > 0.5; other cells become NaN and are left blank.
f, ax = plt.subplots(figsize=(30, 30))
sb.heatmap(corr_matrix.where(mask), cmap='coolwarm', annot=True, ax=ax)

In [None]:
# Positions of strong correlations (|r| > 0.5). Only the strict upper triangle
# is scanned, so the diagonal is skipped and each pair is reported once.
strong = corr_matrix.abs().to_numpy() > 0.5
corr_indices = np.where(np.triu(strong, k=1))

# Print each strong correlation with its variable pair. Negative and perfect
# correlations are included, matching the |r| > 0.5 selection above.
for i, j in zip(*corr_indices):
    print(f"Correlation of {corr_matrix.iloc[i, j]:.2f} between {corr_matrix.columns[i]} and {corr_matrix.columns[j]}")

### Creating Training and Test Data

In [None]:
# Columns removed before modelling.
dropped_columns = [
    'Gender',
    'Echocardiogram',
    'EKG',
    'Cardiac CT',
    'Chest x-ray',
    'Pulmonary function tests',
    'Spirometry',
    'Medications',
]
heartdata_clean = heartdata.drop(columns=dropped_columns)

# Separate the label column from the feature columns.
target = heartdata_clean['Heart Disease']
data = heartdata_clean.drop(columns='Heart Disease')
data.shape

### Extra processing for Logistic Regression, KNN and Random forest

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split BEFORE scaling so the test rows do not influence the scaler's mean and
# variance (fitting on the full data leaks test-set statistics into training).
train_X, test_X, train_y, test_y = train_test_split(data, target, random_state=3)

# Fit the scaler on the training rows only, then apply it to both splits.
standardScaler = StandardScaler()
standardScaler.fit(train_X)
train_X = standardScaler.transform(train_X)
test_X = standardScaler.transform(test_X)

# Keep `data` as a scaled array as before, now using the training-set statistics.
data = standardScaler.transform(data)
train_X.shape

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline logistic regression with default settings.
log_reg = LogisticRegression()
log_reg.fit(train_X, train_y)
log_pred_y = log_reg.predict(test_X)

# Print every metric: a bare expression in the middle of a cell is evaluated
# and then discarded, so only the last one would otherwise be shown.
print("Training score:", log_reg.score(train_X, train_y))
print("Test score: ", log_reg.score(test_X, test_y))
print("Accuracy score: ", accuracy_score(test_y, log_pred_y))

#### Use grid search to find better parameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# The default lbfgs solver does not support the 'l1' penalty, so every 'l1'
# candidate would fail to fit. liblinear supports both 'l1' and 'l2'.
param_test = {'penalty': ['l2', 'l1'],
              'C': [0.01, 0.1, 1.0, 10, 100],
              'class_weight': [None, 'balanced'],
              'solver': ['liblinear']}
log_gv = GridSearchCV(estimator=log_reg, param_grid=param_test, cv=5)
log_gv.fit(train_X, train_y)

# Print results explicitly; bare mid-cell expressions are not displayed.
print("Best parameters:", log_gv.best_params_)
print("Training score:", log_gv.score(train_X, train_y))
print("Test score:", log_gv.score(test_X, test_y))

log_pred_y = log_gv.predict(test_X)
print(classification_report(test_y, log_pred_y))

cm = confusion_matrix(test_y, log_pred_y)
# fmt='d' shows whole counts; the default format abbreviates counts of 100 or
# more to scientific notation.
sb.heatmap(cm, annot=True, fmt='d', cmap='Blues')

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier()
knn.fit(train_X, train_y)

# Predict on the test split.
knn_pred_y = knn.predict(test_X)

# Evaluate the model.
knn_train_score = knn.score(train_X, train_y)
knn_test_score = knn.score(test_X, test_y)
knn_accuracy = accuracy_score(test_y, knn_pred_y)
print("Training score:", knn_train_score)
print("Test score: ", knn_test_score)
print("Accuracy score: ", knn_accuracy)

#### Use grid search to find better parameters

In [None]:
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()

# Two sub-grids: uniform weighting over k = 1..30, and distance weighting over
# k = 1..20 combined with p = 1..5.
uniform_grid = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform'],
}
distance_grid = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['distance'],
    'p': list(range(1, 6)),
}
param_test = [uniform_grid, distance_grid]

knn_gv = GridSearchCV(estimator=knn, param_grid=param_test, cv=5)
print(knn_gv.fit(train_X, train_y))
print(knn_gv.best_params_)

# Classification report for the grid-searched model on the test split.
knn_pred_y = knn_gv.predict(test_X)
print(classification_report(test_y, knn_pred_y))

In [None]:
# Confusion matrix of the KNN predictions on the test split.
cm2 = confusion_matrix(test_y, knn_pred_y)

# fmt='d' shows whole counts; the default format abbreviates counts of 100 or
# more to scientific notation.
sb.heatmap(cm2, annot=True, fmt='d', cmap='Blues')

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with out-of-bag scoring enabled.
rf_clf = RandomForestClassifier(oob_score=True)
rf_clf.fit(train_X, train_y)

pred_rf = rf_clf.predict(test_X)

# Report the train/test scores, the prediction accuracy and the OOB score.
rf_metrics = [
    ("Train score:", rf_clf.score(train_X, train_y)),
    ("Test score:", rf_clf.score(test_X, test_y)),
    ("Accuracy Score:", accuracy_score(test_y, pred_rf)),
    ("OOB Score:", rf_clf.oob_score_),
]
for label, value in rf_metrics:
    print(label, value)


#### Use grid search to find better parameters

In [None]:
# Grid over the number of trees and the maximum number of leaf nodes.
param_test = dict(
    n_estimators=[100, 300, 500],
    max_leaf_nodes=[10, 12, 14, 16],
)

rf_gv = GridSearchCV(estimator=rf_clf, param_grid=param_test, cv=5)
rf_gv.fit(train_X, train_y)
print(rf_gv.best_params_)

# Predict and score on the test split with the grid-searched model.
pred_rf = rf_gv.predict(test_X)
print(rf_gv.score(test_X, test_y))

In [None]:
# Confusion matrix of the random-forest predictions on the test split.
cm3 = confusion_matrix(test_y, pred_rf)

# fmt='d' shows whole counts; the default format abbreviates counts of 100 or
# more to scientific notation.
sb.heatmap(cm3, annot=True, fmt='d', cmap='Blues')