In [3]:
import numpy as np
import matplotlib as plt
import math

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.datasets import fetch_openml

# fetch_mldata was removed from scikit-learn and the mldata.org service it
# relied on is offline; fetch_openml provides the same MNIST data set.
# as_frame=False keeps the result as NumPy arrays rather than pandas objects.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
data = mnist.data / 255.0           # scale every pixel value by 1/255
answer = mnist.target.astype(int)   # OpenML delivers the labels as strings

# Hold out 10000 samples for testing; the fixed seed makes the split (and
# therefore every downstream result) reproducible across kernel restarts.
train_set, test_set, train_target, test_target = train_test_split(
    data, answer, test_size=10000, random_state=0)



In [6]:
test_set.shape

(10000, 784)

In [7]:
test_target[:15]

array([9, 6, 7, 0, 3, 1, 7, 1, 8, 2, 9, 7, 8, 6, 7])

In [8]:
def euclidean_distance(x, y):
    """Return the matrix of Euclidean distances between rows of ``x`` and ``y``.

    Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 * a.b`` so that no
    three-dimensional intermediate array is needed.

    Parameters
    ----------
    x : array-like of shape (n, d)
    y : array-like of shape (m, d)

    Returns
    -------
    numpy.ndarray of shape (n, m)
        Entry ``[i, j]`` is the distance between ``x[i]`` and ``y[j]``.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    x_sq = np.sum(x ** 2, axis=1)[:, np.newaxis]
    y_sq = np.sum(y ** 2, axis=1)
    squared = x_sq + y_sq - 2 * np.dot(x, y.T)
    # Floating-point cancellation can leave a mathematically zero (or tiny)
    # squared distance slightly negative, which would make sqrt return NaN.
    return np.sqrt(np.maximum(squared, 0))


def euclidean_distance1(x, y):
    """Pairwise Euclidean distances computed directly from coordinate differences.

    A new axis is inserted into ``x`` so that broadcasting pairs every row of
    ``x`` with every row of ``y``; the squared differences are then summed
    over the last axis and the square root is taken.

    For ``x`` of shape (n, d) and ``y`` of shape (m, d) the result has shape
    (n, m); the intermediate difference array has shape (n, m, d).
    """
    pairwise_diff = x[:, np.newaxis] - y
    squared_sum = (pairwise_diff ** 2).sum(axis=2)
    return np.sqrt(squared_sum)

def cosine_distance(x, y):
    """Return the matrix of cosine distances between rows of ``x`` and ``y``.

    The cosine distance is ``1 - (a.b) / (|a| * |b|)``.

    Parameters
    ----------
    x : array-like of shape (n, d)
    y : array-like of shape (m, d)

    Returns
    -------
    numpy.ndarray of shape (n, m)
        Entry ``[i, j]`` is the cosine distance between ``x[i]`` and ``y[j]``.
        When either vector has zero norm the similarity is undefined; it is
        treated as 0 here, so the distance is 1. A matrix is always returned,
        so callers can index, sort or concatenate the result unconditionally.
    """
    x_sq = np.sum(np.square(x), axis=1).reshape(-1, 1)
    y_sq = np.sum(np.square(y), axis=1)
    norma = np.sqrt(x_sq * y_sq)
    # Divide only where the norm product is non-zero; the remaining entries
    # keep their initial similarity of 0 instead of producing NaN/inf.
    similarity = np.zeros(norma.shape)
    np.divide(np.dot(x, np.transpose(y)), norma, out=similarity, where=norma != 0)
    return 1 - similarity
def weight(array):
    """Turn distances into voting weights: the reciprocal of (distance + 1e-5).

    The small constant added to the distance keeps the division finite when
    a distance is exactly zero.
    """
    smoothing = 0.00001
    shifted = array + smoothing
    return 1 / shifted

In [9]:
class KNNClassifier:
    """k-nearest-neighbours classifier with a hand-written or scikit-learn backend.

    Parameters
    ----------
    k : int
        Number of neighbours used for searching and voting.
    strategy : str
        ``'my_own'`` uses the brute-force search implemented in this class;
        ``'brute'``, ``'kd_tree'`` and ``'ball_tree'`` delegate to
        ``KNeighborsClassifier`` with that ``algorithm``.
    metric : str
        For ``'my_own'``: ``'euclidean'`` or ``'cosine'``. For the other
        strategies the value is passed through to ``KNeighborsClassifier``.
    weights : bool
        If true, each neighbour's vote is weighted by ``weight(distance)``;
        otherwise all neighbours vote equally.
    test_block_size : int
        Number of query rows processed at a time by the ``'my_own'`` search.

    Raises
    ------
    KeyError
        If ``strategy`` is not one of the four supported values.
    """

    _SKLEARN_STRATEGIES = ('brute', 'kd_tree', 'ball_tree')

    def __init__(self, k, strategy, metric, weights, test_block_size):
        self.k = k
        self.strategy = strategy
        self.metric = metric
        self.weights = weights
        self.test_block_size = test_block_size
        if strategy != 'my_own':
            if strategy not in self._SKLEARN_STRATEGIES:
                raise KeyError(strategy)
            self.skln_knn = KNeighborsClassifier(
                n_neighbors=k, algorithm=strategy, metric=metric,
                weights=weight if weights else 'uniform')

    def fit(self, x, y):
        """Store the training data ('my_own') or fit the scikit-learn model."""
        if self.strategy == 'my_own':
            # Arrays are required later for fancy indexing of the labels.
            self.train_set = np.asarray(x)
            self.train_target = np.asarray(y)
        else:
            self.skln_knn.fit(x, y)

    def _distance_function(self):
        """Return the pairwise-distance function selected by ``self.metric``."""
        if self.metric == 'euclidean':
            return euclidean_distance1
        if self.metric == 'cosine':
            return cosine_distance
        raise ValueError(
            "metric must be 'euclidean' or 'cosine' for strategy 'my_own', "
            "got %r" % (self.metric,))

    def find_kneighbors(self, x, return_distance):
        """Find the ``k`` nearest training samples for every row of ``x``.

        Returns
        -------
        indices, or ``(distances, indices)`` when ``return_distance`` is true.
            For ``'my_own'`` both arrays have one row per query row, with the
            neighbours ordered by increasing distance.

        Raises
        ------
        ValueError
            If the strategy is ``'my_own'`` and the metric is not supported.
        """
        if self.strategy != 'my_own':
            return self.skln_knn.kneighbors(
                x, n_neighbors=self.k, return_distance=return_distance)

        distance = self._distance_function()
        x = np.asarray(x)
        n_queries = x.shape[0]

        index_blocks = []
        dist_blocks = []
        # Plain slicing handles a short last block and queries smaller than
        # one block without any special cases.
        for start in range(0, n_queries, self.test_block_size):
            block = x[start:start + self.test_block_size]
            block_dist = distance(block, self.train_set)  # (rows in block, n_train)
            block_index = np.argsort(block_dist, axis=1)[:, :self.k]
            index_blocks.append(block_index)
            if return_distance:
                rows = np.arange(block.shape[0])[:, np.newaxis]
                dist_blocks.append(block_dist[rows, block_index])

        if index_blocks:
            index = np.concatenate(index_blocks, axis=0)
        else:
            # No query rows: return correctly shaped empty results.
            n_columns = min(self.k, self.train_set.shape[0])
            index = np.empty((0, n_columns), dtype=int)

        if not return_distance:
            return index
        if dist_blocks:
            neighbour = np.concatenate(dist_blocks, axis=0)
        else:
            neighbour = np.empty(index.shape, dtype=float)
        return (neighbour, index)

    def predict(self, x):
        """Predict a label for every row of ``x``.

        For ``'my_own'`` the label is decided by a vote among the ``k``
        nearest neighbours (weighted by ``weight(distance)`` when
        ``self.weights`` is true); labels are treated as non-negative
        integers. Other strategies delegate to the scikit-learn model.
        """
        if self.strategy != 'my_own':
            return self.skln_knn.predict(x)

        if self.weights:
            dist, ind = self.find_kneighbors(x, True)
            votes = weight(dist)
        else:
            ind = self.find_kneighbors(x, False)
            votes = None

        neighbour_labels = self.train_target[ind].astype(int)
        answer = np.zeros(neighbour_labels.shape[0], dtype=int)
        for i, labels in enumerate(neighbour_labels):
            row_votes = None if votes is None else votes[i]
            # bincount sizes itself to the largest label present, so the
            # number of classes does not have to be known in advance.
            answer[i] = np.bincount(labels, weights=row_votes).argmax()
        return answer


In [39]:
from nearest_neighbors import KNNClassifier as KNC

In [41]:
# Run the classifier imported from the external `nearest_neighbors` module
# (aliased as KNC) with k=4, the 'my_own' strategy and the cosine metric.
# NOTE(review): only four arguments are passed here, while the KNNClassifier
# defined in this notebook also requires test_block_size — presumably the
# module version supplies a default; confirm.
knn2 = KNC(4, "my_own", "cosine", False)
knn2.fit(train_set, train_target)
# Only the first 100 test samples are classified.
new_test_set = test_set[:100]
res_2 = knn2.predict(new_test_set)

In [10]:
# k=4 neighbours, hand-written ('my_own') search, cosine metric, unweighted
# voting (weights=False), test_block_size=10.
knn1 = KNNClassifier(4,'my_own', 'cosine', False, 10)
knn1.fit(train_set, train_target)

# Classify only the first 100 test samples.
res_1 = knn1.predict(test_set[:100])

In [11]:
res_1

array([9, 6, 7, 0, 3, 1, 7, 1, 8, 2, 9, 7, 8, 6, 7, 1, 4, 5, 6, 9, 2, 7,
       1, 1, 8, 9, 2, 2, 8, 0, 9, 1, 3, 3, 3, 9, 2, 8, 3, 1, 0, 5, 6, 1,
       8, 2, 6, 5, 6, 0, 4, 0, 3, 1, 1, 0, 8, 6, 0, 8, 3, 9, 8, 7, 1, 3,
       6, 3, 9, 9, 1, 7, 1, 9, 0, 3, 1, 0, 5, 7, 7, 2, 3, 7, 1, 0, 1, 3,
       7, 7, 1, 9, 0, 6, 4, 3, 7, 6, 0, 2])

In [None]:
# Number of positions where `ans1` equals test_target, divided by 10000.
# NOTE(review): `ans1` is not defined in any visible cell — presumably it held
# predictions for the full 10000-sample test set; confirm before re-running.
len(np.where(ans1 == test_target)[0])/10000

In [None]:
# scikit-learn 'brute' backend with the Euclidean metric: distances and
# indices of the 2 nearest training samples for the first 15 test rows.
# The call is the last expression of the cell so that its result is displayed.
knn = KNNClassifier(2, 'brute', 'euclidean', False, 5)
knn.fit(train_set, train_target)
knn.find_kneighbors(test_set[:15], True)

In [None]:
# distances: try euclidean_distance on a tiny hand-made example

# x holds 2 points and y holds 4 points, each with 3 coordinates.
x = np.array([[1, 5,7], [4, 7, 0]])
y = np.array([[0, 9, 5], [1, 6, 0], [5, 8, 7], [1, 2, 3]])

# Displays the matrix of distances between every row of x and every row of y.
euclidean_distance(x, y)

In [None]:
#cross_validation
def k_fold(n, n_folds):
    