# KNN for prediction heart disease

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("./cleveland.csv", header= None)
df.columns = (["age", "sex", "cp", "restbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"])

df["target"] = df.target.map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})
df["thal"] = df.thal.fillna(df.thal.mean())
df["ca"] = df.ca.fillna(df.ca.mean())

In [3]:
X = df.iloc[: , :-1].values
y = df.iloc[: , -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Using KNN

In [4]:
knn = KNeighborsClassifier(n_neighbors= 5, weights= "uniform", algorithm= "auto", leaf_size= 30, p= 2, metric= "minkowski")
knn.fit(X_train, y_train)
y_train_pred_knn = knn.predict(X_train)
y_test_pred_knn = knn.predict(X_test)

cm_train_knn = confusion_matrix(y_train, y_train_pred_knn)
cm_test_knn = confusion_matrix(y_test, y_test_pred_knn)

In [5]:
print()
accuracy_for_train_knn = np.round((cm_train_knn[0][0] + cm_train_knn[1][1]) / len(y_train), 2)
accuracy_for_test_knn = np.round((cm_test_knn[0][0] + cm_test_knn[1][1]) / len(y_test), 2)
print('Accuracy for training set for KNeighborsClassifier = {}'.format(accuracy_for_train_knn))
print('Accuracy for test set for KNeighborsClassifier = {}'.format(accuracy_for_test_knn))


Accuracy for training set for KNeighborsClassifier = 0.76
Accuracy for test set for KNeighborsClassifier = 0.69


# Using SVM

In [6]:
svm = SVC(kernel= "rbf", random_state= 42)
svm.fit(X_train, y_train)

y_train_pred_svm = svm.predict(X_train)
y_test_pred_svm = svm.predict(X_test)

cm_train_svm = confusion_matrix(y_train, y_train_pred_svm)
cm_test_svm = confusion_matrix(y_test, y_test_pred_svm)

In [7]:
print()
accuracy_for_train_svm = np.round((cm_train_svm[0][0] + cm_train_svm[1][1]) / len(y_train), 2)
accuracy_for_test_svm = np.round((cm_test_svm[0][0] + cm_test_svm[1][1]) / len(y_test), 2)
print('Accuracy for training set for SVM = {}'.format(accuracy_for_train_svm))
print('Accuracy for test set for SVM = {}'.format(accuracy_for_test_svm))


Accuracy for training set for SVM = 0.66
Accuracy for test set for SVM = 0.67


# Using naive bayesian

In [8]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_train_pred_gnb = gnb.predict(X_train)
y_test_pred_gnb = gnb.predict(X_test)

cm_train_gnb = confusion_matrix(y_train, y_train_pred_gnb)
cm_test_gnb = confusion_matrix(y_test, y_test_pred_gnb)

In [9]:
print()
accuracy_for_train_gnb = np.round((cm_train_gnb[0][0] + cm_train_gnb[1][1]) / len(y_train), 2)
accuracy_for_test_gnb = np.round((cm_test_gnb[0][0] + cm_test_gnb[1][1]) / len(y_test), 2)
print('Accuracy for training set for naive bayes = {}'.format(accuracy_for_train_gnb))
print('Accuracy for test set for naive bayes = {}'.format(accuracy_for_test_gnb))


Accuracy for training set for naive bayes = 0.85
Accuracy for test set for naive bayes = 0.84


# Using decision tree

In [10]:
dt = DecisionTreeClassifier(criterion= "gini", max_depth= 10, min_samples_split= 2, random_state= 42)
dt.fit(X_train, y_train)

y_train_pred_dt = dt.predict(X_train)
y_test_pred_dt = dt.predict(X_test)

cm_train_dt = confusion_matrix(y_train, y_train_pred_dt)
cm_test_dt = confusion_matrix(y_test, y_test_pred_dt)

In [11]:
print()
accuracy_for_train_dt = np.round((cm_train_dt[0][0] + cm_train_dt[1][1]) / len(y_train), 2)
accuracy_for_test_dt = np.round((cm_test_dt[0][0] + cm_test_dt[1][1]) / len(y_test), 2)
print('Accuracy for training set for decision tree = {}'.format(accuracy_for_train_dt))
print('Accuracy for test set for decision tree = {}'.format(accuracy_for_test_dt))


Accuracy for training set for decision tree = 1.0
Accuracy for test set for decision tree = 0.75


# Using random forest

In [12]:
rfc = RandomForestClassifier(criterion= "gini", max_depth= 10, min_samples_split= 2, n_estimators= 10, random_state= 42)
rfc.fit(X_train, y_train)

y_train_pred_rfc = rfc.predict(X_train)
y_test_pred_rfc = rfc.predict(X_test)

cm_train_rfc = confusion_matrix(y_train, y_train_pred_rfc)
cm_test_rfc = confusion_matrix(y_test, y_test_pred_rfc)

In [13]:
print()
accuracy_for_train_rfc = np.round((cm_train_rfc[0][0] + cm_train_rfc[1][1]) / len(y_train), 2)
accuracy_for_test_rfc = np.round((cm_test_rfc[0][0] + cm_test_rfc[1][1]) / len(y_test), 2)
print('Accuracy for training set for random forest = {}'.format(accuracy_for_train_rfc))
print('Accuracy for test set for random forest = {}'.format(accuracy_for_test_rfc))


Accuracy for training set for random forest = 0.98
Accuracy for test set for random forest = 0.8


# Using adaboost

In [14]:
ada = AdaBoostClassifier(n_estimators= 50, learning_rate= 1.0)
ada.fit(X_train, y_train)

y_train_pred_ada = ada.predict(X_train)
y_test_pred_ada = ada.predict(X_test)

cm_train_ada = confusion_matrix(y_train, y_train_pred_ada)
cm_test_ada = confusion_matrix(y_test, y_test_pred_ada)



In [15]:
print()
accuracy_for_train_ada = np.round((cm_train_ada[0][0] + cm_train_ada[1][1]) / len(y_train), 2)
accuracy_for_test_ada = np.round((cm_test_ada[0][0] + cm_test_ada[1][1]) / len(y_test), 2)
print('Accuracy for training set for ada boost = {}'.format(accuracy_for_train_ada))
print('Accuracy for test set for ada boost = {}'.format(accuracy_for_test_ada))


Accuracy for training set for ada boost = 0.91
Accuracy for test set for ada boost = 0.84


# Using gradient boost

In [16]:
gb = GradientBoostingClassifier(learning_rate= 0.1, n_estimators= 100, subsample= 1.0, min_samples_split= 2, max_depth= 3, random_state= 42)
gb.fit(X_train, y_train)

y_train_pred_gb = gb.predict(X_train)
y_test_pred_gb = gb.predict(X_test)

cm_train_gb = confusion_matrix(y_train, y_train_pred_gb)
cm_test_gb = confusion_matrix(y_test, y_test_pred_gb)

In [17]:
print()
accuracy_for_train_gb = np.round((cm_train_gb[0][0] + cm_train_gb[1][1]) / len(y_train), 2)
accuracy_for_test_gb = np.round((cm_test_gb[0][0] + cm_test_gb[1][1]) / len(y_test), 2)
print('Accuracy for training set for gradient boost = {}'.format(accuracy_for_train_gb))
print('Accuracy for test set for gradient boost = {}'.format(accuracy_for_test_gb))


Accuracy for training set for gradient boost = 1.0
Accuracy for test set for gradient boost = 0.85


# Using XG boost

In [18]:
xgb = XGBClassifier(objective= "binary:logistic", random_state= 42, n_estimators= 100)
xgb.fit(X_train, y_train)

y_train_pred_xgb = xgb.predict(X_train)
y_test_pred_xgb = xgb.predict(X_test)

cm_train_xgb = confusion_matrix(y_train_pred_xgb, y_train)
cm_test_xgb = confusion_matrix(y_test_pred_xgb, y_test)

In [19]:
print()
accuracy_for_train_xgb = np.round((cm_train_xgb[0][0] + cm_train_xgb[1][1]) / len(y_train), 2)
accuracy_for_test_xgb = np.round((cm_test_xgb[0][0] + cm_test_xgb[1][1]) / len(y_test), 2)
print('Accuracy for training set for XG boost = {}'.format(accuracy_for_train_xgb))
print('Accuracy for test set for XG boost = {}'.format(accuracy_for_test_xgb))


Accuracy for training set for XG boost = 1.0
Accuracy for test set for XG boost = 0.87


# Using stacking

In [22]:
dtc = DecisionTreeClassifier(random_state= 42)
rfc = RandomForestClassifier(random_state= 42)
knn = KNeighborsClassifier()
xgb = XGBClassifier()
gc = GradientBoostingClassifier(random_state= 42)
svmc = SVC(kernel= "rbf", random_state= 42)
ada = AdaBoostClassifier(random_state= 42)

estimators = [("dtc", dtc), ("rfc", rfc), ("knn", knn), ("xgb", xgb), ("gc", gc), ("svmc", svmc), ("ada", ada)]
meta_model = XGBClassifier()
stacking_model = StackingClassifier(estimators= estimators, final_estimator= meta_model)

stacking_model.fit(X_train, y_train)

y_train_pred_stm = stacking_model.predict(X_train)
y_test_pred_stm = stacking_model.predict(X_test)

cm_train_stm = confusion_matrix(y_train, y_train_pred_stm)
cm_test_stm = confusion_matrix(y_test, y_test_pred_stm)



In [24]:
print()
accuracy_for_train_stm = np.round((cm_train_stm[0][0] + cm_train_stm[1][1]) / len(y_train), 2)
accuracy_for_test_stm = np.round((cm_test_stm[0][0] + cm_test_stm[1][1]) / len(y_test), 2)
print('Accuracy for training set for stacking = {}'.format(accuracy_for_train_stm))
print('Accuracy for test set for XG stacking = {}'.format(accuracy_for_test_stm))


Accuracy for training set for stacking = 0.98
Accuracy for test set for XG stacking = 0.89
