# Lab 05: KNN 


## Preprocess Raw Data: 


In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Location of the heart-disease CSV. The default is the original author's local
# path (written as a raw string so the backslashes are not parsed as escape
# sequences); set the HEART_CSV environment variable to point at your own copy.
HEART_CSV = os.environ.get(
    "HEART_CSV",
    r"D:\Programming\Python_code\PrinciplesOfDS_Course\Labs\Data\heart.csv",
)
heartData = pd.read_csv(HEART_CSV)

# info() prints its report itself and returns None, so it is not wrapped in print().
# Author's observation: all data types are int or float, so there are no
# categorical features to convert.
heartData.info()

# Fraction of missing values per column.
# Author's observation: all values are 0, i.e. no missing values.
print(heartData.isna().mean())

# Visual inspection of the target variable.
# Author's observation: the data is balanced, with no significant dominance of
# samples by either of the two categories.
ax = heartData.target.value_counts().plot(kind='bar')
ax.set(title="Target class counts", xlabel="target", ylabel="number of samples");

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Programming\\Python_code\\PrinciplesOfDS_Course\\Labs\\Data\\heart.csv'

## Split and Normalize Data


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Hold-out split settings. With the 303 observations noted for this dataset,
# this test fraction leaves 260 rows (260/303 ~ 0.858, roughly 85%) for
# training/validation and ~15% for testing; 260 also divides evenly into
# equally sized cross-validation folds.
TEST_FRACTION = 0.14
SPLIT_SEED = 2

# Separate the feature matrix from the label column.
X = heartData.drop(columns='target')
y = heartData.target

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Confirm the split sizes.
print(X_train_val.shape)
print(y_train_val.shape)

# Standardise features: the scaler's statistics are learned from the
# training/validation rows only and then re-used on the test rows.
normalizer = StandardScaler()
X_train_val = normalizer.fit_transform(X_train_val)
X_test = normalizer.transform(X_test)

(260, 13)
(260,)


## Train KNN Model after Selecting Hyperparameter


In [3]:
# Candidate values of the hyperparameter K tried during training. The range was
# widened to 1-10 (instead of 1-5) because it gave a better accuracy score.
k_range = range(1, 11)
param_grid = {"n_neighbors": k_range}

# The n_neighbors given here is only a starting value; the grid search tries
# every candidate in param_grid.
clf_knn = KNeighborsClassifier(n_neighbors=1)

# Grid search with 5-fold cross-validation, scored on accuracy.
# NOTE(review): X_train_val was standardised before this search, so each
# validation fold contributed to the scaler's statistics - consider whether
# scaling should happen inside the cross-validation loop instead.
grid = GridSearchCV(
    estimator=clf_knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(X_train_val, y_train_val)

print(grid.best_score_)
print(grid.best_params_)

0.8192307692307692
{'n_neighbors': 7}


## Evaluate the KNN Model against Test Data


In [4]:
# Rebuild the classifier with the K selected by the grid search and fit it on
# the full training/validation set.
best_k = grid.best_params_['n_neighbors']
clf_knn = KNeighborsClassifier(n_neighbors=best_k)
clf_knn.fit(X_train_val, y_train_val)

# Predict labels for the held-out test set.
y_test_pred = clf_knn.predict(X_test)

In [5]:
# Side-by-side look at the predictions and the ground truth.
print(y_test_pred)
print("y_test", "\n", y_test)

# Boolean masks used as building blocks for the metric calculations.
predicted_positive = y_test_pred == 1
actual_positive = y_test == 1

print(predicted_positive)
# Rows that are both predicted positive and actually positive: true positives.
print(predicted_positive & actual_positive)


[1 1 1 1 0 0 1 1 0 1 0 1 0 0 1 1 0 1 0 1 1 1 0 0 1 1 1 0 0 1 1 1 0 0 1 0 1
 1 0 0 1 1 0]
y_test 
 99     1
296    0
89     1
30     1
234    0
292    0
35     1
7      1
178    0
13     1
240    0
65     1
179    0
192    0
147    1
134    1
232    0
157    1
276    0
235    0
112    1
159    1
209    0
197    0
259    0
29     1
142    1
169    0
217    0
3      1
160    1
140    1
251    0
257    0
286    0
250    0
133    1
267    0
274    0
297    0
53     1
182    0
231    0
Name: target, dtype: int64
[ True  True  True  True False False  True  True False  True False  True
 False False  True  True False  True False  True  True  True False False
  True  True  True False False  True  True  True False False  True False
  True  True False False  True  True False]
99      True
296    False
89      True
30      True
234    False
292    False
35      True
7       True
178    False
13      True
240    False
65      True
179    False
192    False
147     True
134     True
232    False
157 

In [6]:
# Implementing Accuracy, Recall, Precision, and F1 by hand:
###########################################################

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Guards the metrics below against an undefined 0/0 (e.g. no positive
    predictions), which would otherwise yield NaN and a runtime warning.
    """
    return numerator / denominator if denominator else 0.0


# Work on plain NumPy arrays so the comparison is element-by-element in
# position order and the counting is vectorised.
y_true_arr = np.asarray(y_test)
y_pred_arr = np.asarray(y_test_pred)

# Confusion-matrix counts, treating label 1 as the positive class.
TP = int(np.sum((y_pred_arr == 1) & (y_true_arr == 1)))
FP = int(np.sum((y_pred_arr == 1) & (y_true_arr == 0)))
FN = int(np.sum((y_pred_arr == 0) & (y_true_arr == 1)))

# Accuracy = total correct predictions / total predictions
accuracy = _safe_ratio(int(np.sum(y_pred_arr == y_true_arr)), len(y_pred_arr))

# Recall = True Positive / (True Positive + False Negative)
recall = _safe_ratio(TP, TP + FN)

# Precision = True Positive / (True Positive + False Positive)
precision = _safe_ratio(TP, TP + FP)

# F1 = (2(Precision)(Recall)) / (Precision + Recall)
F1 = _safe_ratio(2 * precision * recall, precision + recall)


print("accuracy:", round(accuracy, 3))
print("recall:", round(recall, 3))
print("precision:", round(precision, 3))
print("F1:", round(F1, 3))

accuracy: 0.86
recall: 1.0
precision: 0.76
F1: 0.864


In [7]:
# Verify my evaluation metrics using scikit-learn
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# The scikit-learn results get their own names so they do not overwrite the
# hand-computed `recall` and `precision` they are meant to be checked against.
acc = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
sk_recall = recall_score(y_test, y_test_pred)
sk_precision = precision_score(y_test, y_test_pred)

# Fail loudly if any hand-computed metric disagrees with scikit-learn.
for metric_name, manual_value, sklearn_value in [
    ("accuracy", accuracy, acc),
    ("recall", recall, sk_recall),
    ("precision", precision, sk_precision),
    ("f1", F1, f1),
]:
    assert np.isclose(manual_value, sklearn_value), (
        f"{metric_name}: manual {manual_value} != scikit-learn {sklearn_value}"
    )

print("accuracy: {:.3f}, recall: {:.3f}, precision: {:.3f}, f1: {:.3f},".format(acc, sk_recall, sk_precision, f1))

accuracy: 0.860, recall: 1.000, precision: 0.760, f1: 0.864,


As the comparison shows, the manually computed evaluation metrics match the values reported by scikit-learn, so the manual implementation is correct.
