In [6]:
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [7]:
#Define KNN Class 
def euclidian_distance(x1, x2):
    distance = np.sqrt(np.sum((x1-x2)**2))
    return distance


class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return predictions

    def _predict(self, x):
        # compute the distances
        distances = [euclidian_distance(x, x_train) for x_train in X_train]
        # print(f"distances : {distances}")
        # get the closest k
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # majority vote
        most_common = Counter(k_nearest_labels).most_common()
        return most_common[0][0]


In [8]:
df = pd.read_csv('diabetes.csv')
selected_feature = ['Glucose','BMI','Pregnancies']
x= df[selected_feature]
print(x)
# x = df.drop(['Outcome'], axis=1)
y = df['Outcome']
x= x.to_numpy()
y= y.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)



     Glucose   BMI  Pregnancies
0        148  33.6            6
1         85  26.6            1
2        183  23.3            8
3         89  28.1            1
4        137  43.1            0
..       ...   ...          ...
763      101  32.9           10
764      122  36.8            2
765      121  26.2            5
766      126  30.1            1
767       93  30.4            1

[768 rows x 3 columns]


In [21]:
clf = KNN(k=5)
clf.fit(X_train,y_train)
# X_test = X_test.to_numpy()
prediction = clf.predict(X_test)


In [22]:
accuracy = np.sum(prediction == y_test)/len(y_test)
accuracy

0.7337662337662337

In [23]:
accuracies = []
for i in range(2,20):
    obj = KNN(k=i)
    obj.fit(X_train,y_train)
    pred = clf.predict(X_test)
    acc =  np.sum(pred == y_test)/len(y_test)
    accuracies.append(acc)

In [28]:
accs = np.argsort(accuracies)
accs

array([ 0, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1, 16,
       17], dtype=int64)

In [35]:
print(f'Max Accuracy for k = {2+accs[0]} which is {accuracies[accs[0]]}')

Max Accuracy for k = 2 which is 0.7337662337662337
