In [4]:
import numpy as np
import scipy.spatial
from collections import Counter

In [5]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the iris dataset and hold out 20% of the samples for testing.
# random_state=42 pins the shuffle so the same split is produced on every run.
iris = datasets.load_iris()
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, random_state=42, test_size=0.2)

In [6]:
# Inspect the training feature matrix (four numeric features per row).
X_train

array([[4.6, 3.6, 1. , 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [6.7, 3.1, 4.4, 1.4],
       [4.8, 3.4, 1.6, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [6.3, 2.5, 5. , 1.9],
       [6.4, 3.2, 4.5, 1.5],
       [5.2, 3.5, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.2, 4.1, 1.5, 0.1],
       [5.8, 2.7, 5.1, 1.9],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [5.4, 3.9, 1.3, 0.4],
       [5.4, 3.7, 1.5, 0.2],
       [5.5, 2.4, 3.7, 1. ],
       [6.3, 2.8, 5.1, 1.5],
       [6.4, 3.1, 5.5, 1.8],
       [6.6, 3. , 4.4, 1.4],
       [7.2, 3.6, 6.1, 2.5],
       [5.7, 2.9, 4.2, 1.3],
       [7.6, 3. , 6.6, 2.1],
       [5.6, 3. , 4.5, 1.5],
       [5.1, 3.5, 1.4, 0.2],
       [7.7, 2.8, 6.7, 2. ],
       [5.8, 2.7, 4.1, 1. ],
       [5.2, 3.4, 1.4, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 2. , 3.5, 1. ],
       [6.3, 2.7, 4.9, 1.8],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [5.6, 2

In [8]:
class KNN:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Training only memorises the data. Prediction measures the distance from
    each query point to every stored sample and returns the majority label
    among the ``k`` closest ones.
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` nearest samples.

        Args:
            k: number of neighbours that take part in the vote.

        Raises:
            ValueError: if ``k`` is smaller than 1 (a zero or negative ``k``
                would leave no voters or slice the wrong neighbours).
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X, Y):
        """Memorise the training samples ``X`` and their labels ``Y``.

        Args:
            X: sequence of feature vectors.
            Y: sequence of labels, one per row of ``X``.

        Raises:
            ValueError: if ``X`` is empty or ``X`` and ``Y`` differ in length.
        """
        if len(X) == 0:
            raise ValueError("X must contain at least one sample")
        if len(X) != len(Y):
            raise ValueError("X and Y must have the same length")
        self.X_train = X
        self.Y_train = Y

    def distance(self, X1, X2):
        """Return the Euclidean distance between two feature vectors."""
        return scipy.spatial.distance.euclidean(X1, X2)

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: sequence of feature vectors to classify.

        Returns:
            A list with one predicted label per query row, in input order.
        """
        final_output = []
        for query in X_test:
            # Use the data stored by fit(), not whatever globals happen to
            # exist in the notebook. Pairs sort by distance first, then by
            # training index, so ties between equidistant samples are stable.
            ranked = sorted(
                (self.distance(sample, query), index)
                for index, sample in enumerate(self.X_train)
            )
            votes = [self.Y_train[index] for _, index in ranked[:self.k]]
            # most_common keeps first-seen order among equal counts, so a tied
            # vote goes to the label of the closer neighbour.
            final_output.append(Counter(votes).most_common(1)[0][0])
        return final_output

    def score(self, X_test, Y_test):
        """Return the fraction of rows in ``X_test`` classified correctly.

        Both sides are converted to arrays so the comparison is element-wise
        even when ``Y_test`` is a plain Python list.
        """
        predictions = np.asarray(self.predict(X_test))
        return (predictions == np.asarray(Y_test)).sum() / len(Y_test)

In [17]:
# Sanity check: the coordinate differences are (1, 1, 1, 3), so the distance is sqrt(12) ~= 3.4641.
scipy.spatial.distance.euclidean((2, 3, 4, 5), (3, 4, 5, 8))

3.4641016151377544

In [21]:
# Spot-check the label of a single training sample.
Y_train[2]

1

In [31]:
# Build a 5-nearest-neighbour classifier, store the training split, and label the test split.
clf = KNN(5)
clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)

# Print the predicted labels on one line, separated by spaces.
for i in prediction:
    print(i, end = ' ')

1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 

In [32]:
# prediction is a plain list; comparing it with the NumPy array Y_test yields an element-wise boolean array.
prediction == Y_test

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [33]:
# Accuracy: the fraction of test samples whose predicted label equals the true label.
clf.score(X_test, Y_test)

1.0

In [12]:
# Show the raw list of predicted labels.
# NOTE(review): this cell's execution count is out of order relative to its neighbours; re-run top to bottom to confirm.
prediction

[1,
 0,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 0]

In [23]:
# Counter tallies how many times each value occurs in the list.
lst = [1, 2, 3, 3, 3, 3, 3]
ans = Counter(lst)

In [30]:
# most_common(n) returns the n most frequent values as (value, count) pairs, highest count first.
ans.most_common(3)

[(3, 5), (1, 1), (2, 1)]

In [44]:
def find_result(x, X_train, Y_train, k):
    """Classify one sample by majority vote among its k nearest training points.

    Args:
        x: feature vector to classify.
        X_train: sequence of training feature vectors.
        Y_train: labels, indexed in step with ``X_train``.
        k: number of nearest neighbours that vote.

    Returns:
        The most frequent label among the ``k`` closest training samples.

    Side effects:
        Prints the selected ``(distance, index)`` pairs, nearest first.
    """
    # Pair each Euclidean distance with its training index, then order by
    # distance (ties fall back to the index).
    ranked = sorted(
        (scipy.spatial.distance.euclidean(x, X_train[idx]), idx)
        for idx in range(len(X_train))
    )
    nearest = ranked[:k]
    print(nearest)
    labels = [Y_train[idx] for _, idx in nearest]
    winner, _ = Counter(labels).most_common(1)[0]
    return winner


In [45]:
# Classify one hand-written 4-feature sample using its 3 nearest training points.
x = [2.1, 5.2, 3.5, 1.2]
k = 3


# find_result prints the k nearest (distance, index) pairs before returning the majority label.
print(find_result(x, X_train, Y_train, k))

[(3.753664875824692, 31), (3.7629775444453557, 28), (3.8353617821530213, 75)]
0


In [48]:
# Label of training sample 31, the nearest neighbour in the printed distance list.
Y_train[31]

0

In [49]:
# Label of training sample 28, the second-nearest neighbour in the printed distance list.
Y_train[28]

0

In [50]:
# Label of training sample 75, the third-nearest neighbour in the printed distance list.
Y_train[75]

0