# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.express as px

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder


# Loading Data

In [None]:
# Read the heart dataset CSV (path is relative to the working directory)
# and preview its first five rows.
file_path = "heart_dataset.csv"
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Print pandas' concise summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Summary statistics from describe(), transposed so each described column is a row.
data.describe().T

In [None]:
# dtype of every column.
data.dtypes

In [None]:
# Number of missing (null) values in each column.
data.isnull().sum()

### Splitting data by types

In [None]:
# Convert every string-typed (object) column to the pandas 'category' dtype,
# one column at a time.
categ_cols = data.select_dtypes(include='object').columns
for col in categ_cols:
    data[col] = data[col].astype('category')

In [None]:
# Re-select the columns by the 'category' dtype and display the resulting Index.
categ_cols = data.select_dtypes(include='category').columns
categ_cols

In [None]:
# Numeric feature names: every column that is not categorical, minus the
# 'HeartDisease' target (list.remove raises ValueError if it is absent).
categ_names = set(categ_cols)
num_cols = [col for col in data.columns if col not in categ_names]
num_cols.remove('HeartDisease')
num_cols

# EDA

### Feature Exploration

In [None]:
# Print the frequency of each level for every categorical column,
# followed by a separator line.
for col in categ_cols:
    level_counts = data[col].value_counts()
    print(
        f"The distribution of categorical values in '{col}' is:",
        level_counts,
        "--------------------------------------------------------------",
        sep="\n",
    )

### Feature Distribution

In [None]:
# Count of each HeartDisease value, with bars split by Gender.
ax = sns.countplot(data=data, x='HeartDisease', hue='Gender')
ax.set_xlabel('HeartDisease')
ax.set_ylabel('Count')
ax.set_title('Distribution of Heart Disease by Gender')
plt.show()


In [None]:
# Count of each ChestPainType value, with bars split by Gender.
ax = sns.countplot(data=data, x='ChestPainType', hue='Gender')
ax.set_xlabel('Chest Pain Type')
ax.set_ylabel('Count')
ax.set_title('Types of Chest Pain')
plt.show()


In [None]:
# Number of rows per Gender value.
ax = sns.countplot(data=data, x='Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.set_title('Gender Ratio in Data')
plt.show()


In [None]:
# Number of rows per RestingECG value.
ax = sns.countplot(data=data, x='RestingECG')
ax.set_xlabel('Resting ECG')
ax.set_ylabel('Count')
ax.set_title('Distribution of Resting ECG')
plt.show()


In [None]:
# Histogram with a KDE overlay for every column of `data`, one subplot each.
# The grid is sized from the number of columns (three plots per row) instead
# of a fixed 4x3 layout, which raises as soon as there are more than 12 columns.
n_grid_cols = 3
n_grid_rows = -(-len(data.columns) // n_grid_cols)  # ceiling division

plt.figure(figsize=(30, 20))

for i, col in enumerate(data.columns, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.histplot(data[col], kde=True)
    plt.title(f"Distribution of {col}")

# Lay out once after all subplots exist, then render the figure explicitly.
plt.tight_layout()
plt.show()

# Feature Engineering


In [None]:
# Preview the first rows of the categorical columns.
data[categ_cols].head()

#### Label Encoder for Tree Models

In [None]:
# Shared encoder instance plus an independent (deep) copy of the frame that
# will hold the label-encoded features for the tree models.
le = LabelEncoder()
data_tree = data.copy(deep=True)

In [None]:
# Replace each categorical column with integer codes; the shared encoder is
# re-fitted on every column before transforming it.
for col in categ_cols:
    le.fit(data_tree[col])
    data_tree[col] = le.transform(data_tree[col])

In [None]:
# Build the tree-model frame by integer-encoding ONLY the categorical columns.
# Applying LabelEncoder to the whole frame would also replace every numeric
# value with its rank code, so thresholds shown in the tree plots would no
# longer be in the original units.
# NOTE(review): the target column is left untouched; presumably it is already
# coded 0/1 - confirm before relying on it with XGBoost.
data_tree = data.copy()
for col in categ_cols:
    # A fresh encoder per column keeps each column's class mapping separate.
    data_tree[col] = LabelEncoder().fit_transform(data_tree[col])

In [None]:
# Preview the first rows of the encoded frame.
data_tree.head()

#### One-Hot Encoding for non-Tree Models

In [None]:
# Work on a separate copy so the one-hot encoding below does not modify `data`.
data_non_tree = data.copy()
data_non_tree.head()

In [None]:
# One-hot encode each categorical column, keeping an indicator for every
# level (no level is dropped).
data_non_tree = pd.get_dummies(data_non_tree, columns=list(categ_cols))
data_non_tree.head()

In [None]:
# Name of the label column and its values as a NumPy array
# (used to stratify the cross-validation folds).
target = "HeartDisease"
y = data_non_tree[target].to_numpy()

In [None]:
# Move the target to the last column: take it out of the one-hot frame and
# re-attach the original column from `data`, aligned on the row index.
features_only = data_non_tree.drop(columns=["HeartDisease"])
data_non_tree = features_only.join(data[target])
data_non_tree.head()

# Modeling

## Non-Tree Models

In [None]:
# Feature names for the non-tree models: every column except the target.
feature_col_non_tree = [col for col in data_non_tree.columns if col != target]

### Logistic Regression

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# 5-fold stratified cross-validation of a logistic regression on the one-hot
# encoded frame. Per-fold accuracy is collected in `acc_log`.
acc_log = []

kf = model_selection.StratifiedKFold(n_splits=5)

# Folds are numbered from 1 so both messages below report the same fold number.
for fold, (trn_, val_) in enumerate(kf.split(X=data_non_tree[feature_col_non_tree], y=y), start=1):

    # split() yields positional row indices, so rows are selected with .iloc;
    # label-based .loc is only correct while the index is 0..n-1.
    train_rows = data_non_tree.iloc[trn_]
    val_rows = data_non_tree.iloc[val_]

    X_train = train_rows[feature_col_non_tree]
    y_train = train_rows[target]

    X_val = val_rows[feature_col_non_tree]
    y_val = val_rows[target]

    # Fit the scaler on the training fold only and reuse it on the validation
    # fold, so no validation statistics leak into training.
    ro_scaler = RobustScaler()
    X_train = ro_scaler.fit_transform(X_train)
    X_val = ro_scaler.transform(X_val)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print(f"The fold is {fold}:")
    print(classification_report(y_val, y_pred))

    acc = accuracy_score(y_val, y_pred)
    acc_log.append(acc)
    print(f"Fold: {fold}, Accuracy: {acc}")
    print("--------------------------------------------------------------")

In [None]:
# Display the per-fold scores collected by the logistic-regression loop.
acc_log

### K-Nearest Neighbors (KNN)

In [None]:
# 5-fold stratified cross-validation of a k-nearest-neighbours classifier
# (k = 32) on min-max scaled features. Per-fold accuracy is collected in
# `acc_knn`, the same metric the other models report.
from sklearn.neighbors import KNeighborsClassifier

acc_knn = []
kf = model_selection.StratifiedKFold(n_splits=5)

# Split on the feature columns only, and number folds from 1 so both messages
# below report the same fold number.
for fold, (trn_, val_) in enumerate(kf.split(X=data_non_tree[feature_col_non_tree], y=y), start=1):

    # split() yields positional row indices, so rows are selected with .iloc;
    # label-based .loc is only correct while the index is 0..n-1.
    train_rows = data_non_tree.iloc[trn_]
    val_rows = data_non_tree.iloc[val_]

    X_train = train_rows[feature_col_non_tree]
    y_train = train_rows[target]

    X_valid = val_rows[feature_col_non_tree]
    y_valid = val_rows[target]

    # Fit the scaler on the training fold only and reuse it on the validation fold.
    mm_scaler = MinMaxScaler()
    X_train = mm_scaler.fit_transform(X_train)
    X_valid = mm_scaler.transform(X_valid)

    clf = KNeighborsClassifier(n_neighbors=32)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    print(f"The fold is : {fold} : ")
    print(classification_report(y_valid, y_pred))

    # accuracy_score matches the printed label; ROC AUC computed from hard
    # class predictions is a different quantity.
    acc = accuracy_score(y_valid, y_pred)
    acc_knn.append(acc)
    print(f"The accuracy for {fold} : {acc}")

In [None]:
# Display the per-fold scores collected by the KNN loop.
acc_knn

## Tree Models

In [None]:
# Feature names for the tree models: every column except the target.
feature_col_tree = [col for col in data_tree.columns if col != target]

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# 5-fold stratified cross-validation of an entropy-criterion decision tree on
# the label-encoded frame. Per-fold accuracy is collected in `acc_dtree`.
# After the loop `clf` holds the tree fitted on the last fold.
acc_dtree = []
kf = model_selection.StratifiedKFold(n_splits=5)

# Folds are numbered from 1 so both messages below report the same fold number.
for fold, (trn_, val_) in enumerate(kf.split(X=data_tree[feature_col_tree], y=y), start=1):
    # split() yields positional row indices, so rows are selected with .iloc;
    # label-based .loc is only correct while the index is 0..n-1.
    train_rows = data_tree.iloc[trn_]
    val_rows = data_tree.iloc[val_]

    X_train = train_rows[feature_col_tree]
    y_train = train_rows[target]

    X_val = val_rows[feature_col_tree]
    y_val = val_rows[target]

    clf = DecisionTreeClassifier(criterion='entropy')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print(f"The fold is {fold}:")
    print(classification_report(y_val, y_pred))

    acc = accuracy_score(y_val, y_pred)
    acc_dtree.append(acc)
    print(f"Fold: {fold}, Accuracy: {acc}")
    print("--------------------------------------------------------------")

In [None]:
# Display the per-fold scores collected by the decision-tree loop.
acc_dtree

In [None]:
from sklearn import tree
import graphviz

# Export the fitted classifier currently bound to `clf` as DOT source text
# (out_file=None returns the text instead of writing a file).
class_labels = ['No Disease', 'Disease']
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_col_tree,
    class_names=class_labels,
    filled=True,
)

In [None]:
# Wrap the DOT text in a graphviz Source object so it can be rendered.
graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # Save the tree as a PDF file
# Leave the object as the cell's last expression so the notebook displays it.
graph

In [None]:
# Draw the fitted tree bound to `clf` with matplotlib on a 20x20 inch figure.
from sklearn import tree

plt.figure(figsize=(20, 20))
tree.plot_tree(
    clf,
    feature_names=feature_col_tree,
    class_names=['0', '1'],
    filled=True,
    rounded=True,
)
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold stratified cross-validation of a 200-tree, entropy-criterion random
# forest on the label-encoded frame. Per-fold accuracy is collected in
# `acc_randf`. After the loop `clf` holds the forest fitted on the last fold.
acc_randf = []
kf = model_selection.StratifiedKFold(n_splits=5)

# Folds are numbered from 1 so both messages below report the same fold number.
for fold, (trn_, val_) in enumerate(kf.split(X=data_tree[feature_col_tree], y=y), start=1):
    # split() yields positional row indices, so rows are selected with .iloc;
    # label-based .loc is only correct while the index is 0..n-1.
    train_rows = data_tree.iloc[trn_]
    val_rows = data_tree.iloc[val_]

    X_train = train_rows[feature_col_tree]
    y_train = train_rows[target]

    X_val = val_rows[feature_col_tree]
    y_val = val_rows[target]

    clf = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print(f"The fold is {fold}:")
    print(classification_report(y_val, y_pred))

    acc = accuracy_score(y_val, y_pred)
    acc_randf.append(acc)
    print(f"Fold: {fold}, Accuracy: {acc}")
    print("--------------------------------------------------------------")

In [None]:
# Display the per-fold scores collected by the random-forest loop.
acc_randf

In [None]:
# Horizontal bar chart of the feature importances of the model bound to `clf`,
# sorted ascending so the largest importance is drawn last.
importance = clf.feature_importances_
idxs = np.argsort(importance)
bar_positions = range(len(idxs))
ordered_names = [feature_col_tree[i] for i in idxs]

plt.figure(figsize=(20, 15))
plt.barh(bar_positions, importance[idxs], align='center')
plt.yticks(bar_positions, ordered_names)
plt.title("Feature Importance")
plt.xlabel("Random Forest Feature Importance")
plt.show()

### XGBoost - Extreme Gradient Boosting

In [None]:
from xgboost import XGBClassifier

# 5-fold stratified cross-validation of an XGBoost classifier (default
# settings) on the label-encoded frame. Per-fold accuracy is collected in
# `acc_xgb`. After the loop `clf` holds the model fitted on the last fold.
acc_xgb = []
kf = model_selection.StratifiedKFold(n_splits=5)

# Folds are numbered from 1 so both messages below report the same fold number.
for fold, (trn_, val_) in enumerate(kf.split(X=data_tree[feature_col_tree], y=y), start=1):
    # split() yields positional row indices, so rows are selected with .iloc;
    # label-based .loc is only correct while the index is 0..n-1.
    train_rows = data_tree.iloc[trn_]
    val_rows = data_tree.iloc[val_]

    X_train = train_rows[feature_col_tree]
    y_train = train_rows[target]

    X_val = val_rows[feature_col_tree]
    y_val = val_rows[target]

    clf = XGBClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print(f"The fold is {fold}:")
    print(classification_report(y_val, y_pred))

    acc = accuracy_score(y_val, y_pred)
    acc_xgb.append(acc)
    print(f"Fold: {fold}, Accuracy: {acc}")
    print("--------------------------------------------------------------")

In [None]:
# Display the per-fold scores collected by the XGBoost loop.
acc_xgb

In [None]:
# Draw the first boosted tree (index 0) of the model bound to `clf`,
# laid out left-to-right on a 30x30 inch canvas.
from xgboost import plot_tree

fig, ax = plt.subplots(figsize=(30, 30))
plot_tree(clf, num_trees=0, rankdir="LR", ax=ax)
plt.show()