This notebook implements a K-Nearest-Neighbors classifier from scratch and compares its accuracy with scikit-learn's `KNeighborsClassifier` using `accuracy_score`.

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Read the dataset and discard the 'Class_' column; the drop returns a new frame
# rather than mutating the freshly loaded one in place.
data = pd.read_csv("Diabetes.csv").drop(columns=['Class_'])
# Show the last rows as a quick check of what was loaded.
data.tail()

Unnamed: 0,Prep,Plas,Pres,Skin,Insu,Mass,Pedi,Age,Class
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [4]:
# Standardize every column except the last one (the last column holds the label).
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so the test rows contribute to the scaling statistics; fitting on the training
# rows only would avoid that, but would also change the recorded results.
initial_data = data.iloc[:, :-1]
scaler = StandardScaler()
data_scaled = scaler.fit_transform(initial_data)
# Rebuild a frame with the original feature names and re-attach the unscaled label column.
df = pd.concat(
    [pd.DataFrame(data=data_scaled, columns=initial_data.columns), data.iloc[:, -1:]],
    axis=1,
)

# Positional 80/20 split (no shuffling): the first 80% of rows train, the rest test.
split_at = int(len(df) * 0.8)
train_part, test_part = df.iloc[:split_at], df.iloc[split_at:]
X, Y = train_part.iloc[:, :-1], train_part.iloc[:, -1:]
X_test, Y_test = test_part.iloc[:, :-1], test_part.iloc[:, -1:]

In [16]:
#Creating a class for K Neighbors Classifier
class KMeansClassifier:
    """K-nearest-neighbours classifier (Euclidean distance, majority vote).

    Despite its name, which is kept so existing callers keep working, this is a
    K-nearest-neighbours classifier and not K-means: ``fit`` only stores the
    training data, and ``predict`` lets the ``n_neighbors`` closest training
    rows vote on the label of each query row.

    Parameters
    ----------
    n_neighbors : int
        Number of nearest training samples that vote on each prediction.
    """

    def __init__(self, n_neighbors):
        self.k = n_neighbors
        self._features = None
        self._labels = None

    def fit(self, X, Y):
        """Store the training samples and their labels.

        Parameters
        ----------
        X : DataFrame or array-like of shape (n_samples, n_features)
            Numeric training features.
        Y : DataFrame, Series or array-like with n_samples entries
            One label per row of ``X``; labels must be convertible with ``int``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``X`` and ``Y`` disagree on the
            number of samples.
        """
        # Keep the caller's objects available under the original attribute names.
        self.X = X
        self.Y = Y

        # Work on plain arrays internally so that neighbour positions returned by
        # argpartition always address the right label, whatever index the
        # incoming pandas objects carry.
        features = np.asarray(X, dtype=float)
        labels = np.asarray(Y).ravel()
        if features.ndim != 2:
            raise ValueError("X must be two-dimensional (n_samples, n_features)")
        if features.shape[0] != labels.shape[0]:
            raise ValueError("X and Y must contain the same number of samples")
        self._features = features
        self._labels = labels

    def predict(self, test):
        """Predict a label for every row of ``test``.

        Parameters
        ----------
        test : DataFrame or array-like of shape (n_queries, n_features)
            Rows to classify; must have the same number of features as the
            training data.

        Returns
        -------
        numpy.ndarray
            One integer label per query row. When several labels are equally
            frequent among the neighbours, the smallest label wins.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        ValueError
            If ``n_neighbors`` is not between 1 and the number of training
            samples, or the feature counts do not match.
        """
        if self._features is None:
            raise AttributeError("predict() called before fit()")

        n_train, n_features = self._features.shape
        if not 1 <= self.k <= n_train:
            raise ValueError(
                "n_neighbors must be between 1 and the number of training samples"
            )

        queries = np.asarray(test, dtype=float)
        if queries.ndim != 2:
            raise ValueError("test must be two-dimensional (n_queries, n_features)")
        if queries.shape[0] and queries.shape[1] != n_features:
            raise ValueError("test has a different number of features than the training data")

        predictions = []
        for row in queries:
            # Euclidean distance from this query to every training sample at once.
            distances = np.linalg.norm(self._features - row, axis=1)
            # kth = k - 1 puts the k smallest distances in the first k slots and
            # stays in bounds even when k equals the number of training samples.
            nearest = np.argpartition(distances, self.k - 1)[: self.k]
            # np.unique returns sorted labels, so argmax on the counts resolves
            # ties deterministically in favour of the smallest label.
            candidates, votes = np.unique(self._labels[nearest], return_counts=True)
            predictions.append(int(candidates[np.argmax(votes)]))
        return np.array(predictions)
            
# Fit the hand-written classifier on the training split (k = 4 neighbours)
# and score its predictions against the held-out labels.
model = KMeansClassifier(n_neighbors=4)
model.fit(X, Y)
y_pred = model.predict(X_test)
custom_accuracy_pct = accuracy_score(Y_test, y_pred) * 100
print("Predictions using custom code:", y_pred)
print("\n")
print("Accuracy using custom code:", custom_accuracy_pct, "%")

Predictions using custom code: [1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 1 0 1 0 0 0 1 1 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 0 1
 0 0 0 0 1 0]


Accuracy using custom code: 74.02597402597402 %


In [17]:
from sklearn.neighbors import KNeighborsClassifier
# Reference run: scikit-learn's classifier with the same number of neighbours.
model_KNC = KNeighborsClassifier(n_neighbors=4)
# Flatten the single-column label frame to a 1-D array before fitting.
train_labels_flat = Y.values.ravel()
model_KNC.fit(X, train_labels_flat)
y_pred_KNC = model_KNC.predict(X_test)
sklearn_accuracy_pct = accuracy_score(Y_test, y_pred_KNC) * 100
print("Predictions using sklearn:", y_pred_KNC)
print("\n")
print("Accuracy from sklearn: ", sklearn_accuracy_pct, "%")

Predictions using sklearn: [1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 1 0 1 0 0 0 1 1 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 0 1
 0 0 0 0 1 0]


Accuracy from sklearn:  74.02597402597402 %


In [18]:
# Percentage of test rows on which the custom and sklearn predictions coincide;
# accuracy_score is used here as an agreement measure between the two outputs,
# not as accuracy against the true labels.
agreement_pct = accuracy_score(y_pred, y_pred_KNC) * 100
print("Accuracy of both models: ", agreement_pct, "%")

Accuracy of both models:  100.0 %


Both models reach an accuracy of about 74.03% against the true labels. They also return identical predictions for every test sample, so the agreement between the two models is 100%.