In [22]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as sKNN
from random import shuffle
from random import seed

In [15]:
def euclidean_distance(arr1, arr2):
    res_arr = np.empty((arr1.shape[0], arr2.shape[0]))
    for enum, item in enumerate(arr1[:, ]):
        res_arr[enum] = np.sqrt(np.sum((arr2 - item) ** 2, axis=1))

    return res_arr

def cosine_distance(arr1, arr2):

    res_arr = np.empty((arr1.shape[0], arr2.shape[0]))
    for enum, item in enumerate(arr1[:, ]):
        res_arr[enum] = np.sum(arr2 * item, axis=1) / (
            (np.sqrt(np.sum(arr2 * arr2, axis=1)) *
             np.sqrt((np.sum(item * item)))))

    return 1 - res_arr

def weight_function(distance):
    epsilon = 0.00001
    return (double)(1 / (distance + epsilon))

In [112]:
class KNNClassifier:
    
    def __init__(self,
                 k,
                 strategy,
                 metric,
                 weights,
                 test_block_size = False):
        
        self.strategy = strategy
        
        if (strategy == "my_own"):
            print("My_own")
            # my implementation
            self.k = k
            self.weights = weights
            
            if (metric == "euclidean"):
                self.metric = euclidean_distance
            elif (metric == "cosine"):
                self.metric = cosine_distance
            else:
                raise TypeError
                
        else:
            return sKNN(k, algorithm = strategy, weights = "uniform")
        """        
        elif (strategy == "brute"):
            print("brute")
            if (weights):
                self.uses = sKNN(k,
                            algorithm = 'brute',
                            weights = weight_function,
                            n_jobs = -1)
            else:
                self.uses = sKNN(k,
                                 algorithm = 'brute',
                                 weights = 'uniform',
                                 n_jobs = -1)
        elif (strategy == 'kd_tree'):
            print("kd_tree")
            if (weights):
                self.uses = sKNN(k,
                                 algorithm = 'kd_tree',
                                 weights = weight_function,
                                 n_jobs = -1)
            else:
                self.uses = sKNN(k,
                                 algorithm = 'kd_tree',
                                 weights = 'uniform',
                                 n_jobs = -1)
        elif (strategy == 'ball_tree'):
            print("ball_tree")
            if (weights):
                self.uses = sKNN(k, 
                                 algorithm = 'ball_tree',
                                 weights = weight_function,
                                 n_jobs = -1)
            else:
                self.uses = sKNN(k,
                                 algorithm = 'ball_tree',
                                 weights = 'uniform',
                                 n_jobs = -1)
        else:
            raise TypeError
            
        """
    
    def fit(self, X, y):
        if (self.strategy == "my_own"):
            print("fit_own")
            # my implementation
            if (X.__len__() != y.__len__()):
                raise TypeError
            
            self.data = X
            self.target = y
            self.clusters = np.sort(np.unique(y))
            self.clusters_amount = np.unique(y).__len__()
        else:
            print("fit_other")
            self.uses.fit(X, y)
    
    def find_kneighbors(self, X, return_distance):
        if (strategy == "my_own"):
            #my implementation
            pass
        else:
            return self.uses.kneighbors(X, return_distance)
    
    def predict(self, X):

        if (self.strategy == "my_own"):
            print("predict_own")
            #my prediction
            if (X.shape[1] != self.data.shape[1]):
                return TypeError

            ranges = self.metric(self.data, X)

            max_range = np.max(ranges)

            k = self.k

            if (k > self.data.shape[0]):
                k = self.data.shape[0]

            closest = np.empty((k, X.shape[0])).astype(int)

            while (k > 0):

                # тут мы получаем в столбцах номера выходных точек по каждой из точек test_set
                save = np.argmin(ranges, axis=0)

                closest[k - 1] = save

                for enum, item in enumerate(save):
                    ranges[item, enum] += max_range

                k -= 1

            closest_4_each = closest.T
            test_target = np.empty(X.shape[0]).astype(int)

            for enum, item in enumerate(closest_4_each):

                cluster_nb = np.zeros(self.clusters_amount).astype(int)

                for it in item:
                    cluster_nb[np.where(self.clusters ==self.target[it])[0]] += 1

                test_target[enum] = self.clusters[self.clusters[np.argmax(cluster_nb)]]


            return test_target

        else:
            print("predict_other")
            return self.uses.predict(X)

In [40]:
class cross_validation:
    
    def __init__(self):
        pass
    
    def kfold(n, n_folds,
              stratified = False,
              random_seed = np.nan):
        
        index_list = []
        for index in range(n):
            index_list.append(index)
            
        if (stratified):
            if (random_seed != np.nan):
                seed(random_seed)
                
            shuffle(index_list)
            
        result_list = []
        part = 0
        each_len = (int)(n / n_folds)
        
        while (part < n_folds - 1):
            test_subset = index_list[part * each_len : (part + 1) * each_len]
            train_subset = [x for x in index_list if x not in test_subset]
            result_list.append((train_subset, test_subset))
            part += 1
            
        test_subset = index_list[part * each_len : ]
        train_subset = [x for x in index_list if x not in test_subset]
        
        result_list.append((train_subset, test_subset))
        
        return result_list
    
    def knn_cross_val_score(self, X, y, k_list, score, cv, **kwargs):
        pass

In [18]:
x = np.array([[1, 2, 3], [1, 4, 7], [3, 1, 6]])
y = np.array([2, 2, 2])

In [20]:
euclidean_distance(x, y[:, np.newaxis])

array([[1.41421356, 1.41421356, 1.41421356],
       [5.47722558, 5.47722558, 5.47722558],
       [4.24264069, 4.24264069, 4.24264069]])

In [42]:
res = cross_validation.kfold(55, 5, True)

In [45]:
for item in res:
    print("Train: ", item[0])
    print("Test: ", item[1])

Train:  [0, 4, 42, 20, 14, 53, 44, 47, 46, 41, 11, 3, 17, 15, 10, 21, 27, 28, 50, 23, 36, 45, 9, 34, 49, 6, 39, 18, 8, 52, 13, 37, 22, 30, 19, 25, 31, 32, 16, 2, 26, 48, 24, 54]
Test:  [29, 12, 40, 43, 33, 7, 5, 38, 1, 51, 35]
Train:  [29, 12, 40, 43, 33, 7, 5, 38, 1, 51, 35, 3, 17, 15, 10, 21, 27, 28, 50, 23, 36, 45, 9, 34, 49, 6, 39, 18, 8, 52, 13, 37, 22, 30, 19, 25, 31, 32, 16, 2, 26, 48, 24, 54]
Test:  [0, 4, 42, 20, 14, 53, 44, 47, 46, 41, 11]
Train:  [29, 12, 40, 43, 33, 7, 5, 38, 1, 51, 35, 0, 4, 42, 20, 14, 53, 44, 47, 46, 41, 11, 9, 34, 49, 6, 39, 18, 8, 52, 13, 37, 22, 30, 19, 25, 31, 32, 16, 2, 26, 48, 24, 54]
Test:  [3, 17, 15, 10, 21, 27, 28, 50, 23, 36, 45]
Train:  [29, 12, 40, 43, 33, 7, 5, 38, 1, 51, 35, 0, 4, 42, 20, 14, 53, 44, 47, 46, 41, 11, 3, 17, 15, 10, 21, 27, 28, 50, 23, 36, 45, 30, 19, 25, 31, 32, 16, 2, 26, 48, 24, 54]
Test:  [9, 34, 49, 6, 39, 18, 8, 52, 13, 37, 22]
Train:  [29, 12, 40, 43, 33, 7, 5, 38, 1, 51, 35, 0, 4, 42, 20, 14, 53, 44, 47, 46, 41, 11, 

In [53]:
from sklearn.datasets import fetch_mldata

In [54]:
mnist = fetch_mldata("MNIST-original")



In [55]:
from sklearn.model_selection import train_test_split

In [57]:
trX, teX, trY, teY = train_test_split(mnist.data / 255.0, mnist.target.astype("int0"), test_size = 1/200)
print("Train size: ", trX.__len__())
print("Test size: ", teX.__len__())

Train size:  69650
Test size:  350


In [97]:
KNN = KNNClassifier(5, "my_own", "cosine", False)

My_own


In [98]:
KNN.fit(trX, trY)

fit_own


In [99]:
res = KNN.predict(teX)

predict_own


In [100]:
from sklearn.metrics import accuracy_score

In [101]:
accuracy_score(teY, res)

0.9742857142857143

In [118]:
model = sKNN(n_neighbors = 5, algorithm = "kd_tree", metric = "cosine", weights = "uniform")

ValueError: Metric 'cosine' not valid. Use sorted(sklearn.neighbors.VALID_METRICS['kd_tree']) to get valid options. Metric can also be a callable function.

In [103]:
model.fit(trX, trY)

fit_other


In [104]:
res_1 = model.predict(teX)

predict_other


In [105]:
accuracy_score(teY, res_1)

0.9628571428571429

In [106]:
model_2 = KNNClassifier(5, "brute", "euclidean", False)

brute


In [107]:
model_2.fit(trX, trY)

fit_other


In [108]:
res_2 = model_2.predict(teX)

predict_other


In [109]:
accuracy_score(teY, res_2)

0.9628571428571429