In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
heart_df = pd.read_csv("heart.csv")
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
X = heart_df.drop("target", axis=1)
y = heart_df["target"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test) # not fit in testing data use existing mean and variance of train data
X_train


array([[-1.35679832,  0.72250438,  0.00809909, ...,  0.95390513,
        -0.68970073, -0.50904773],
       [ 0.38508599,  0.72250438, -0.97189094, ...,  0.95390513,
        -0.68970073,  1.17848036],
       [-0.92132724,  0.72250438,  0.98808912, ..., -0.69498803,
        -0.68970073, -0.50904773],
       ...,
       [ 1.58263146,  0.72250438,  1.96807914, ..., -0.69498803,
         0.32186034, -0.50904773],
       [-0.92132724,  0.72250438, -0.97189094, ...,  0.95390513,
        -0.68970073,  1.17848036],
       [ 0.92942484, -1.38407465,  0.00809909, ...,  0.95390513,
         1.33342142, -0.50904773]])

In [5]:
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(X_train, y_train)

y_pred = neigh.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("precision Score: ", precision_score(y_test, y_pred))
print("recall Score: ", recall_score(y_test, y_pred))

Accuracy Score:  0.9180327868852459
precision Score:  0.9354838709677419
recall Score:  0.90625


In [8]:
# Cross Validation for hyperparam tuning using GridSearchCV

from sklearn.model_selection import GridSearchCV

classifier = KNeighborsClassifier()
param_grid = {"n_neighbors":[3,5,7,9]}

classifierCV = GridSearchCV(
    classifier,
    param_grid,
    cv=5,
    scoring="recall"
)

classifierCV.fit(X_train, y_train)
y_pred = classifierCV.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("precision Score: ", precision_score(y_test, y_pred))
print("recall Score: ", recall_score(y_test, y_pred))

#result
res = pd.DataFrame(classifierCV.cv_results_)
print(res[["param_n_neighbors", "mean_test_score"]])

print(classifierCV.best_params_)

Accuracy Score:  0.9180327868852459
precision Score:  0.9354838709677419
recall Score:  0.90625
   param_n_neighbors  mean_test_score
0                  3         0.864387
1                  5         0.857550
2                  7         0.871795
3                  9         0.856980
{'n_neighbors': 7}


In [9]:
from sklearn.pipeline import Pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())
    ]
)

param_grid = {"knn__n_neighbors":[3,5,7,9]}

classifierCV = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="recall"
)

classifierCV.fit(X_train, y_train)
y_pred = classifierCV.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("precision Score: ", precision_score(y_test, y_pred))
print("recall Score: ", recall_score(y_test, y_pred))



Accuracy Score:  0.9180327868852459
precision Score:  0.9354838709677419
recall Score:  0.90625
