In [1]:
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import pairwise_distances as sk_distances
from scipy.spatial import distance as sp_dist
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Load the diabetes table into a DataFrame. The path is relative, so the CSV
# must sit in the notebook's current working directory.
dataset = pd.read_csv('diabetes.csv')


In [3]:
# `social` is a second name bound to the same DataFrame object as `dataset`
# (plain assignment, no copy is made).
social=dataset


In [4]:
# Bare expression as the last line of the cell: the notebook renders the frame.
social

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
# Column-wise split of the table: every column except the last one goes into
# the feature matrix, the last column becomes the target vector.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()


In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [7]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows never influence the statistics
# (avoids data leakage).
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## KNN with sklearn

In [8]:
# Reference model: scikit-learn's k-NN with 5 neighbours. 'minkowski' is the
# default metric, and with p=2 it is equivalent to the standard Euclidean metric.
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
y_pred_skl = classifier.predict(X_test)


## KNN implemented by hand

In [9]:
class KNN:
    """Minimal k-nearest-neighbours classifier.

    Distances are computed with ``sk_distances`` (the pairwise-distance
    function imported at the top of the notebook), so ``distance_metric`` is
    passed straight through to it. The notebook exercises 'cosine',
    'euclidean' and 'manhattan'.
    """

    def __init__(self,k,distance_metric='cosine'):
        """Store the hyper-parameters.

        Parameters
        ----------
        k : int
            Number of nearest training samples that vote on each prediction.
        distance_metric : str, default 'cosine'
            Metric name forwarded to ``sk_distances``.
        """
        self.k=k
        self.metric=distance_metric
        # Echo k on construction; the notebook's recorded outputs rely on it.
        print(self.k)

    def fit(self,X_train,y_train):
        """Memorise the training data (k-NN has no real training step).

        Parameters
        ----------
        X_train : 2-D array-like
            Training samples, one per row.
        y_train : sequence
            Label of each training sample, indexable by row position.
        """
        self.x_train=X_train
        self.y_train=y_train

    def calculate_euclidean(self,sample1,sample2):
        """Return the distance between two single samples.

        Despite the historical name, the metric configured in ``__init__``
        is used, not necessarily the Euclidean one.

        Raises
        ------
        Whatever ``sk_distances`` raises (e.g. for an unknown metric name).
        The error is deliberately not swallowed: returning 0 on failure
        would make a broken pair look like a perfect match.
        """
        return sk_distances([sample1], [sample2], metric=self.metric)[0][0]

    def nearest_neighbors(self,test_sample):
        """Return the labels of the ``k`` training samples closest to ``test_sample``.

        The labels are ordered from the nearest neighbour to the farthest.
        """
        # One query row against all training rows gives an (n_train, 1)
        # matrix; column 0 holds the distance of every training sample to
        # the query. No need to tile the query n_train times.
        query = np.asarray(test_sample).reshape(1, -1)
        dists = sk_distances(self.x_train, query, metric=self.metric)[:, 0]
        closest = np.argsort(dists)[:self.k]
        return [self.y_train[i] for i in closest]

    def predict(self,test_set):
        """Predict a label for every sample in ``test_set``.

        Returns
        -------
        list
            One label per test sample, chosen by majority vote among the k
            nearest neighbours. On a tie, the label that appears first in
            nearest-first order wins.
        """
        predictions=[]
        for test_sample in test_set:
            labels=list(self.nearest_neighbors(test_sample))
            # max() keeps the first maximal element, which gives the
            # nearest-first tie-break described in the docstring.
            predictions.append(max(labels,key=labels.count))
        return predictions

In [10]:
# Hand-written classifier configured like the scikit-learn reference model:
# 5 neighbours, Euclidean distance.
our_model = KNN(k=5, distance_metric='euclidean')
our_model.fit(X_train=X_train, y_train=y_train)

5


In [11]:
y_pred_hand=our_model.predict(X_test)
print(y_pred_hand)

# predict() returns a plain list; convert it once so the element-wise
# comparisons below are explicit array operations.
hand_labels = np.asarray(y_pred_hand)
n_test = len(y_test)

# Each line reports a different comparison, so each gets its own label and
# the total number of test points.
print("Predictions that differ between sklearn KNN and hand-written KNN, out of a total of %d points : %d" % (n_test, (y_pred_skl != hand_labels).sum()))
print("Points mislabeled by sklearn KNN, out of a total of %d points : %d" % (n_test, (y_pred_skl != y_test).sum()))
print("Points mislabeled by hand-written KNN, out of a total of %d points : %d" % (n_test, (hand_labels != y_test).sum()))

[1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Number of mislabeled points out of a total points : 0
Number of mislabeled points out of a total points : 31
Number of mislabeled points out of a total points : 31


In [12]:
# Confusion matrix and accuracy of the hand-written model on the test split.
# The accuracy is the cell's last expression, so the notebook displays it.
cm_hand = confusion_matrix(y_true=y_test, y_pred=y_pred_hand)
print(cm_hand)
accuracy_score(y_true=y_test, y_pred=y_pred_hand)

[[93 14]
 [17 30]]


0.7987012987012987

In [13]:
# Confusion matrix and accuracy of the scikit-learn model on the test split.
# The accuracy is the cell's last expression, so the notebook displays it.
cm_skl = confusion_matrix(y_true=y_test, y_pred=y_pred_skl)
print(cm_skl)
accuracy_score(y_true=y_test, y_pred=y_pred_skl)

[[93 14]
 [17 30]]


0.7987012987012987

In [14]:
K = [5, 7, 9, 11, 13, 15, 17]
metrics = ['cosine', 'euclidean', 'manhattan']

# Evaluate the hand-written classifier for every (metric, k) combination and
# print its confusion matrix and accuracy on the test split.
for metric_name in metrics:
    for n_neighbors in K:
        # Header line; the Russian text reads "Distance function: ..., model: ...".
        print("Функция расстояния: ", metric_name, ", модель: ", n_neighbors)
        model = KNN(n_neighbors, metric_name)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        cm = confusion_matrix(y_test, predictions)
        print(cm)
        print(accuracy_score(y_test, predictions))
        print("\n")

Функция расстояния:  cosine , модель:  5
5
[[95 12]
 [15 32]]
0.8246753246753247


Функция расстояния:  cosine , модель:  7
7
[[95 12]
 [17 30]]
0.8116883116883117


Функция расстояния:  cosine , модель:  9
9
[[95 12]
 [17 30]]
0.8116883116883117


Функция расстояния:  cosine , модель:  11
11
[[94 13]
 [17 30]]
0.8051948051948052


Функция расстояния:  cosine , модель:  13
13
[[94 13]
 [18 29]]
0.7987012987012987


Функция расстояния:  cosine , модель:  15
15
[[95 12]
 [18 29]]
0.8051948051948052


Функция расстояния:  cosine , модель:  17
17
[[94 13]
 [19 28]]
0.7922077922077922


Функция расстояния:  euclidean , модель:  5
5
[[93 14]
 [17 30]]
0.7987012987012987


Функция расстояния:  euclidean , модель:  7
7
[[91 16]
 [19 28]]
0.7727272727272727


Функция расстояния:  euclidean , модель:  9
9
[[92 15]
 [20 27]]
0.7727272727272727


Функция расстояния:  euclidean , модель:  11
11
[[92 15]
 [20 27]]
0.7727272727272727


Функция расстояния:  euclidean , модель:  13
13
[[92 15]
 [19 28]