In [1]:
%load_ext Cython
%load_ext memory_profiler

In [2]:
# Change Working Directory To Allow knn Imports
import os 
os.chdir('../')

In [3]:
import numpy as np

# MNIST Data
# Load Data
mnist_data = np.load('./sample_data/mnist/mnist_data.npz')
train_data = mnist_data['train_data']
test_data = mnist_data['test_data']

# Subset Data If Desired
test_labels = test_data[:100, 0]
test_data = test_data[:100, 1:4].astype(np.float)
train_labels = train_data[100:1100, 0]
train_data = train_data[100:1100, 1:4].astype(np.float)

In [4]:
from collections import Counter
import heapq

#Wrapper Over heapq
class PriorityQueue:

    def __init__(self, is_min_heap=True):
        self.queue = []
        self.cnt = Counter()
        self.is_min_heap = is_min_heap

    def heappush(self, priority, value):
        self.cnt[priority] += 1
        q_priority = priority if self.is_min_heap else -1*priority
        heapq.heappush(self.queue, (q_priority, self.cnt[priority], value))

    def heappushpop(self, priority, value):
        self.cnt[priority] += 1
        q_priority = priority if self.is_min_heap else -1*priority
        heapq.heappushpop(self.queue, (q_priority, self.cnt[priority], value))

    def heappop(self):
        curr_top = heapq.heappop(self.queue)
        q_priority = curr_top[0] if self.is_min_heap else -1*curr_top[0]
        return q_priority, curr_top[2]

    def peektop(self):
        curr_top = self.queue[0]
        q_priority = curr_top[0] if self.is_min_heap else -1*curr_top[0]
        return q_priority, curr_top[2]
    
def create_ball_tree(data, leaf_size):

    if data.shape[0] <= leaf_size:
        leaf_node = {}
        leaf_node["center"] = np.mean(data, axis=0)[np.newaxis, :]
        leaf_node["radius"] = np.max(metric(data, leaf_node["center"]))
        leaf_node["data"] = data
        return leaf_node

    # Random Point x0
    rand_index = np.random.choice(data.shape[0], 1, replace=False)
    rand_point = data[rand_index, :]

    # Find Maximal Point x1
    distances = metric(data, rand_point)
    ind_of_max_dist = np.argmax(distances)
    max_vector_1 = data[ind_of_max_dist, :]

    # Find Maximal Point x2
    distances = metric(data, max_vector_1[np.newaxis, :])
    ind_of_max_dist = np.argmax(distances)
    max_vector_2 = data[ind_of_max_dist, :]

    # Project Data
    proj_data = data.dot(max_vector_1-max_vector_2)

    # Find Median And Split Data
    median_ind = np.argpartition(proj_data, proj_data.size//2)
    lower_than_med_inds = median_ind[:proj_data.size//2]
    greater_than_med_inds = median_ind[proj_data.size//2:]

    # Create Circle
    center = np.mean(data, axis=0)
    radius = np.max(metric(data, center[np.newaxis, :]))

    internal_node = {}
    internal_node["center"] = center[np.newaxis, :]
    internal_node["radius"] = radius
    internal_node["left_child"] = create_ball_tree(data[lower_than_med_inds], leaf_size)
    internal_node["right_child"] = create_ball_tree(data[greater_than_med_inds], leaf_size)

    return internal_node

def query_ball_tree(target_vect, k, queue, curr_node):

    # Prune This Ball
    if metric(target_vect, curr_node["center"]) - curr_node["radius"] >= queue.peektop()[0]:
        return queue

    # Currently A Leaf Node
    if "data" in curr_node:
        for point in curr_node["data"]:
            dist = np.asscalar(metric(target_vect, point[np.newaxis, :]))
            if dist < queue.peektop()[0]:
                queue.heappushpop(dist, point)

    # Not Leaf So Explore Children
    else:
        child1 = curr_node["left_child"]
        child2 = curr_node["right_child"]

        child1_dist = metric(child1["center"], target_vect)
        child2_dist = metric(child2["center"], target_vect)

        if child1_dist < child2_dist:
            query_ball_tree(target_vect, k, queue, child1)
            query_ball_tree(target_vect, k, queue, child2)
        else:
            query_ball_tree(target_vect, k, queue, child2)
            query_ball_tree(target_vect, k, queue, child1)
            
            
def classify_ball_tree(test_data, train_data,labels, k, tree):
    output_labels = []
    for test_vector in test_data:
        queue = PriorityQueue(False)
        # Fill queue With High Distance Points
        list(map(lambda x: queue.heappush(9e10, np.array([9e10, 9e10])), range(k)))
        query_ball_tree(test_vector[np.newaxis, :], k, queue, tree)
        nn_points = np.array([x[2] for x in queue.queue])[:, np.newaxis]
        predicted_labels = labels[np.where((train_data == nn_points).all(-1))[1]]
        output_labels.append(np.bincount(predicted_labels).argmax())
    return np.array(output_labels)

## Baseline

In [5]:
from knn.distance_metrics import euclidean

metric = euclidean
ball_tree = create_ball_tree(train_data, 25)
%timeit ball_tree = create_ball_tree(train_data, 25)
%timeit classify_ball_tree(train_data, train_data, train_labels, 3, ball_tree)

12.5 ms ± 162 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
905 ms ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%cython 
import cython
import numpy as np
cimport numpy as np
from knn.distance_metrics import euclidean

@cython.boundscheck(False)
@cython.wraparound(False)
def query_ball_tree_cyth(np.ndarray[double, ndim=2] target_vect, int k, queue, dict curr_node):

    
    cdef size_t i
    cdef int numb_points
    #cdef float dist
    cdef np.ndarray[double, ndim=2] dists
    
    cdef dict child1
    cdef dict child2
    
    cdef double child1_dist
    cdef double child2_dist
    
    # Prune This Ball
    if euclidean(target_vect, curr_node["center"]) - curr_node["radius"] >= queue.peektop()[0]:
        return queue
    
    if "data" in curr_node:
        
        numb_points = curr_node["data"].shape[0]
        dists = euclidean(target_vect, curr_node["data"])
        
        for i in range(numb_points):
            if dists[i] < queue.peektop()[0]:
                queue.heappushpop(np.asscalar(dists[i]), curr_node["data"][i, :])
    

    # Not Leaf So Explore Children
    else:
       
        child1_dist = euclidean(curr_node["left_child"]["center"], target_vect)
        child2_dist = euclidean(curr_node["right_child"]["center"], target_vect)

        if child1_dist < child2_dist:
            query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"])
            query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"])
        else:
            query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"])
            query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"])
            

In [7]:
def classify_ball_tree_cyth(test_data, train_data,labels, k, tree):
    output_labels = []
    for test_vector in test_data:
        queue = PriorityQueue(False)
        # Fill queue With High Distance Points
        list(map(lambda x: queue.heappush(9e10, np.array([9e10, 9e10])), range(k)))
        query_ball_tree_cyth(test_vector[np.newaxis, :], k, queue, tree)
        nn_points = np.array([x[2] for x in queue.queue])[:, np.newaxis]
        predicted_labels = labels[np.where((train_data == nn_points).all(-1))[1]]
        output_labels.append(np.bincount(predicted_labels).argmax())
    return np.array(output_labels)

In [8]:
ball_tree = create_ball_tree(train_data, 25)
%timeit classify_ball_tree_cyth(train_data, train_data, train_labels, 3, ball_tree)

682 ms ± 22.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Precompute Distance Between Test Points And Centroids

In [9]:
import math

leaf_size = 10
numb_of_leafs = math.ceil(train_data.shape[0] / leaf_size)
print("Number of Leafs: " + str(numb_of_leafs))




tree_height =  1 + math.ceil(np.log2(numb_of_leafs))
print("Tree Height: " +  str(tree_height))


numb_of_nodes = int(2 ** tree_height) - 1 
print("Number Of Nodes: " +  str(numb_of_nodes))

centroids_s = np.zeros((numb_of_nodes, test_data.shape[1]))

print(centroids_s.shape)


def create_ball_tree(data, leaf_size, centroids, index):
    
    
    if data.shape[0] <= leaf_size:
        leaf_node = {}
        leaf_node["center"] = np.mean(data, axis=0)[np.newaxis, :]
        centroids[index, :] = leaf_node["center"] # New
        leaf_node["radius"] = np.max(metric(data, leaf_node["center"]))
        leaf_node["data"] = data
        
        
        
        return leaf_node

    # Random Point x0
    rand_index = np.random.choice(data.shape[0], 1, replace=False)
    rand_point = data[rand_index, :]

    # Find Maximal Point x1
    distances = metric(data, rand_point)
    ind_of_max_dist = np.argmax(distances)
    max_vector_1 = data[ind_of_max_dist, :]

    # Find Maximal Point x2
    distances = metric(data, max_vector_1[np.newaxis, :])
    ind_of_max_dist = np.argmax(distances)
    max_vector_2 = data[ind_of_max_dist, :]

    # Project Data
    proj_data = data.dot(max_vector_1-max_vector_2)

    # Find Median And Split Data
    median_ind = np.argpartition(proj_data, proj_data.size//2)
    lower_than_med_inds = median_ind[:proj_data.size//2]
    greater_than_med_inds = median_ind[proj_data.size//2:]

    # Create Circle
    center = np.mean(data, axis=0)
    radius = np.max(metric(data, center[np.newaxis, :]))
    
    
    
    left_index = 2 * index + 1
    right_index = left_index + 1

    internal_node = {}
    internal_node["center"] = center[np.newaxis, :]
    centroids[index, :] = internal_node["center"] # New
    internal_node["radius"] = radius
    internal_node["left_child"] = create_ball_tree(data[lower_than_med_inds], leaf_size, centroids, left_index)
    internal_node["right_child"] = create_ball_tree(data[greater_than_med_inds], leaf_size, centroids, right_index)

    return internal_node



Number of Leafs: 100
Tree Height: 8
Number Of Nodes: 255
(255, 3)


In [10]:
leaf_size = 20
numb_of_leafs = math.ceil(train_data.shape[0] / leaf_size)
tree_height =  1 + math.ceil(np.log2(numb_of_leafs))
numb_of_nodes = int(2 ** tree_height) - 1 
centroids_s = np.zeros((numb_of_nodes, test_data.shape[1]))


ball_tree = create_ball_tree(train_data, leaf_size, centroids_s, 0)

centroid_dists = euclidean(test_data, centroids_s)

In [11]:
%%cython
import cython
import numpy as np
cimport numpy as np
from knn.distance_metrics import euclidean

@cython.boundscheck(False)
@cython.wraparound(False)
def query_ball_tree_cyth(np.ndarray[double, ndim=2] target_vect, int k, queue, dict curr_node, np.ndarray[double, ndim=1] centroid_dists, int index):

    cdef size_t i
    cdef int numb_points
    cdef np.ndarray[double, ndim=2] dists
    cdef left_index, right_index
    
    # Prune This Ball
    if centroid_dists[index] - curr_node["radius"] >= queue.peektop()[0]:
        return queue
    
    # Is Leaf
    if "data" in curr_node:
        
        numb_points = curr_node["data"].shape[0]
        dists = euclidean(target_vect, curr_node["data"])
        
        for i in range(numb_points):
            if dists[i] < queue.peektop()[0]:
                queue.heappushpop(np.asscalar(dists[i]), curr_node["data"][i, :])
    

    # Not Leaf So Explore Children
    else:
        
        left_index = 2 * index + 1
        right_index = left_index + 1
       
        if centroid_dists[left_index] < centroid_dists[right_index]:
            query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists,left_index)
            query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists,right_index)
        else:
            query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists,right_index)
            query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists,left_index)

In [12]:
def classify_ball_tree_cyth(test_data, train_data, labels, k, tree, centroids):
    
    centroid_dists = euclidean(centroids, test_data)
    
    
    output_labels = []
    for i, test_vector in enumerate(test_data):
        queue = PriorityQueue(False)
        # Fill queue With High Distance Points
        list(map(lambda x: queue.heappush(9e10, np.array([9e10, 9e10])), range(k)))
        query_ball_tree_cyth(test_vector[np.newaxis, :], k, queue, tree, centroid_dists[i, :],0)
        nn_points = np.array([x[2] for x in queue.queue])[:, np.newaxis]
        predicted_labels = labels[np.where((train_data == nn_points).all(-1))[1]]
        output_labels.append(np.bincount(predicted_labels).argmax())
    return np.array(output_labels)

In [13]:
leaf_size = 10
numb_of_leafs = math.ceil(train_data.shape[0] / leaf_size)
tree_height =  1 + math.ceil(np.log2(numb_of_leafs))
numb_of_nodes = int(2 ** tree_height) - 1 
centroids_s = np.zeros((numb_of_nodes, test_data.shape[1]))

ball_tree = create_ball_tree(train_data, leaf_size, centroids_s, 0)

result = classify_ball_tree_cyth(train_data, train_data, train_labels, 3, ball_tree, centroids_s)

In [14]:
%timeit classify_ball_tree_cyth(train_data, train_data, train_labels, 3, ball_tree, centroids_s)

248 ms ± 8.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Add Cython To Exterior Method

In [15]:
%%cython
import cython
import numpy as np
cimport numpy as np
from knn.distance_metrics import euclidean
from collections import Counter
import heapq

#Wrapper Over heapq
class PriorityQueue:

    def __init__(self, is_min_heap=True):
        self.queue = []
        self.cnt = Counter()
        self.is_min_heap = is_min_heap

    def heappush(self, priority, value):
        self.cnt[priority] += 1
        q_priority = priority if self.is_min_heap else -1*priority
        heapq.heappush(self.queue, (q_priority, self.cnt[priority], value))

    def heappushpop(self, priority, value):
        self.cnt[priority] += 1
        q_priority = priority if self.is_min_heap else -1*priority
        heapq.heappushpop(self.queue, (q_priority, self.cnt[priority], value))

    def heappop(self):
        curr_top = heapq.heappop(self.queue)
        q_priority = curr_top[0] if self.is_min_heap else -1*curr_top[0]
        return q_priority, curr_top[2]

    def peektop(self):
        curr_top = self.queue[0]
        q_priority = curr_top[0] if self.is_min_heap else -1*curr_top[0]
        return q_priority, curr_top[2]


@cython.boundscheck(False)
@cython.wraparound(False)
def query_ball_tree_cyth(np.ndarray[double, ndim=2] target_vect, int k, queue, dict curr_node, np.ndarray[double, ndim=1] centroid_dists, int index):

    cdef size_t i
    cdef int numb_points
    cdef np.ndarray[double, ndim=2] dists
    
    # Prune This Ball
    if centroid_dists[index] - curr_node["radius"] >= queue.peektop()[0]:
        return queue
    
    # Is Leaf
    if "data" in curr_node:
        
        numb_points = curr_node["data"].shape[0]
        dists = euclidean(target_vect, curr_node["data"])
        
        for i in range(numb_points):
            if dists[i] < queue.peektop()[0]:
                queue.heappushpop(np.asscalar(dists[i]), curr_node["data"][i, :])
    
    # Not Leaf So Explore Children
    else:
        
        left_index = 2 * index + 1
        right_index = left_index + 1
       
        if centroid_dists[left_index] < centroid_dists[right_index]:
            query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists, left_index)
            query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists, right_index)
        else:
            query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists, right_index)
            query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists, left_index)
            
@cython.boundscheck(False)
@cython.wraparound(False)        
def classify_ball_tree_cyth(np.ndarray[double, ndim=2] test_data, np.ndarray[double, ndim=2] train_data,
                            np.ndarray[int, ndim=1]labels, int k, dict tree, np.ndarray[double, ndim=2]centroids):
    
    cdef size_t i, j, l, m
    cdef int numb_test_points = test_data.shape[0]
    cdef int numb_train_points = train_data.shape[0]
    cdef int numb_features = test_data.shape[1]
    
    
    cdef np.ndarray[double, ndim=2] centroid_dists = euclidean(centroids, test_data)
    
    
    cdef np.ndarray[int, ndim=1] output_labels = np.zeros(numb_test_points, dtype=np.int32)
    
    cdef np.ndarray[int, ndim=1] predicted_labels = np.zeros(k, dtype=np.int32)
    
    cdef np.ndarray[double, ndim=3] nn_points
    
    
    
    print(queues.shape)
    
    for i in range(numb_test_points):
        queue = PriorityQueue(False)
        list(map(lambda x: queue.heappush(9e10, np.array([9e10, 9e10])), range(k)))
        query_ball_tree_cyth(test_data[i, :][np.newaxis, :], k, queue, tree, centroid_dists[i, :], 0)
        
        
        nn_points = np.array([x[2] for x in queue.queue])[:, np.newaxis]
        predicted_labels = labels[np.where((train_data == nn_points).all(-1))[1]]
        
        
        
#         for j in range(k):
#             for l in range(numb_train_points):
#                 equal = True
#                 for m in range(numb_features):
#                     if queue.queue[j][2][m] != train_data[l, m]:
#                         equal = False
#                 if equal:
#                     predicted_labels[j] = labels[l]
                
        output_labels[i] = np.bincount(predicted_labels).argmax()
    

    return output_labels


Error compiling Cython file:
------------------------------------------------------------
...
    
    cdef np.ndarray[double, ndim=3] nn_points
    
    
    
    print(queues.shape)
         ^
------------------------------------------------------------

/Users/Johnny/.ipython/cython/_cython_magic_4a6d07f4c6b44c21cc1cd297d6312689.pyx:94:10: undeclared name not builtin: queues


TypeError: object of type 'NoneType' has no len()

In [None]:
from knn.distance_metrics import manhattan

metric = manhattan

ball_tree = create_ball_tree(train_data, leaf_size, centroids_s, 0)

%timeit result = classify_ball_tree_cyth(train_data, train_data, train_labels, 3, ball_tree, centroids_s)

## Store Index

In [16]:
print(test_data.shape)
print(np.arange(test_data.shape[0]).shape)

test_data_with_inds = np.hstack((np.arange(test_data.shape[0])[:, np.newaxis], test_data))
train_data_with_inds = np.hstack((np.arange(train_data.shape[0])[:, np.newaxis], train_data))

print(test_data_with_inds.shape)
print(test_data_with_inds[1:5, 0])

(100, 3)
(100,)
(100, 4)
[1. 2. 3. 4.]


In [17]:
def create_ball_tree(data, leaf_size, centroids, index):
    
    
    if data.shape[0] <= leaf_size:
        leaf_node = {}
        leaf_node["center"] = np.mean(data[:, 1:], axis=0)[np.newaxis, :]
        centroids[index, :] = leaf_node["center"] # New
        leaf_node["radius"] = np.max(metric(data[:, 1:], leaf_node["center"]))
        leaf_node["data"] = data
        return leaf_node

    # Random Point x0
    rand_index = np.random.choice(data.shape[0], 1, replace=False)
    rand_point = data[rand_index, 1:]
    
    

    # Find Maximal Point x1
    distances = metric(data[:, 1:], rand_point)
    ind_of_max_dist = np.argmax(distances)
    max_vector_1 = data[ind_of_max_dist, 1:]
    
    

    # Find Maximal Point x2
    distances = metric(data[:, 1:], max_vector_1[np.newaxis, :])
    ind_of_max_dist = np.argmax(distances)
    max_vector_2 = data[ind_of_max_dist, 1:]
    
    

    # Project Data
    proj_data = data[:, 1:].dot(max_vector_1-max_vector_2)

    # Find Median And Split Data
    median_ind = np.argpartition(proj_data, proj_data.size//2)
    lower_than_med_inds = median_ind[:proj_data.size//2]
    greater_than_med_inds = median_ind[proj_data.size//2:]

    # Create Circle
    center = np.mean(data[:, 1:], axis=0)
    radius = np.max(metric(data[:, 1:], center[np.newaxis, :]))
    
    
    
    left_index = 2 * index + 1
    right_index = left_index + 1

    internal_node = {}
    internal_node["center"] = center[np.newaxis, :]
    centroids[index, :] = internal_node["center"] # New
    internal_node["radius"] = radius
    internal_node["left_child"] = create_ball_tree(data[lower_than_med_inds], leaf_size, centroids, left_index)
    internal_node["right_child"] = create_ball_tree(data[greater_than_med_inds], leaf_size, centroids, right_index)

    return internal_node

In [18]:
leaf_size = 10
numb_of_leafs = math.ceil(train_data.shape[0] / leaf_size)
tree_height =  1 + math.ceil(np.log2(numb_of_leafs))
numb_of_nodes = int(2 ** tree_height) - 1 
centroids_s = np.zeros((numb_of_nodes, test_data.shape[1]))

ball_tree = create_ball_tree(train_data_with_inds, leaf_size, centroids_s, 0)

In [19]:
%%cython 
import cython
import numpy as np
cimport numpy as np
from knn.distance_metrics import euclidean

@cython.boundscheck(False)
@cython.wraparound(False)
def query_ball_tree_cyth(np.ndarray[double, ndim=2] target_vect, int k, queue, dict curr_node, np.ndarray[double, ndim=1] centroid_dists, int index):

    cdef size_t i
    cdef int numb_points
    cdef np.ndarray[double, ndim=2] dists
    cdef left_index, right_index
    
    # Prune This Ball
    if centroid_dists[index] - curr_node["radius"] >= queue.peektop()[0]:
        return queue
    
    # Is Leaf
    if "data" in curr_node:
        
        numb_points = curr_node["data"].shape[0]
        dists = euclidean(target_vect, curr_node["data"][:, 1:])
        
        for i in range(numb_points):
            if dists[i] < queue.peektop()[0]:
                queue.heappushpop(np.asscalar(dists[i]), curr_node["data"][i, :])
    

    # Not Leaf So Explore Children
    else:
        
        left_index = 2 * index + 1
        right_index = left_index + 1
       
        if centroid_dists[left_index] < centroid_dists[right_index]:
            query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists,left_index)
            query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists,right_index)
        else:
            query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists,right_index)
            query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists,left_index)

In [117]:
def classify_ball_tree_cyth(test_data, train_data, labels, k, tree, centroids):
    
    centroid_dists = euclidean(centroids, test_data)
    
    
    output_labels = []
    for i, test_vector in enumerate(test_data):
        queue = PriorityQueue(False)
        # Fill queue With High Distance Points
        list(map(lambda x: queue.heappush(9e10, np.array([9e10, 9e10])), range(k)))
        query_ball_tree_cyth(test_vector[np.newaxis, :], k, queue, tree, centroid_dists[i, :], 0)
        
        nn_points = np.array([x[2][0] for x in queue.queue], dtype=np.int)
        
        
        
        predicted_labels = labels[nn_points]
        
        output_labels.append(np.bincount(predicted_labels).argmax())
    return np.array(output_labels)

In [21]:
leaf_size = 1
numb_of_leafs = math.ceil(train_data.shape[0] / leaf_size)
tree_height =  1 + math.ceil(np.log2(numb_of_leafs))
numb_of_nodes = int(2 ** tree_height) - 1 
centroids_s = np.zeros((numb_of_nodes, test_data.shape[1]))

ball_tree = create_ball_tree(train_data_with_inds, leaf_size, centroids_s, 0)




%timeit result = classify_ball_tree_cyth(train_data, train_data, train_labels, 3, ball_tree, centroids_s)

192 ms ± 7.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Removed peektop

In [123]:
%%cython -a
import cython
import numpy as np
cimport numpy as np
from knn.distance_metrics import euclidean



@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void query_ball_tree_cyth(np.ndarray[double, ndim=2] target_vect, int k, queue, dict curr_node, np.ndarray[double, ndim=1] centroid_dists, int index):

    cdef size_t i
    cdef int numb_points
    cdef np.ndarray[double, ndim=2] dists
    cdef int left_index, right_index
    
    # Prune This Ball
    if centroid_dists[index] - curr_node["radius"] < -1*queue.queue[0][0]:
        
        # Is Leaf
        if "data" in curr_node:

            numb_points = curr_node["data"].shape[0]
            dists = euclidean(target_vect, curr_node["data"][:, 1:])

            for i in range(numb_points):
                if dists[i] < -1*queue.queue[0][0]:
                    queue.heappushpop(np.asscalar(dists[i]), curr_node["data"][i, :])


        # Not Leaf So Explore Children
        else:

            left_index = 2 * index + 1
            right_index = left_index + 1

            if centroid_dists[left_index] < centroid_dists[right_index]:
                query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists,left_index)
                query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists,right_index)
            else:
                query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists,right_index)
                query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists,left_index)

In [127]:
leaf_size = 10
numb_of_leafs = math.ceil(train_data.shape[0] / leaf_size)
tree_height =  1 + math.ceil(np.log2(numb_of_leafs))
numb_of_nodes = int(2 ** tree_height) - 1 
centroids_s = np.zeros((numb_of_nodes, test_data.shape[1]))

ball_tree = create_ball_tree(train_data_with_inds, leaf_size, centroids_s, 0)




%timeit result = classify_ball_tree_cyth(train_data, train_data, train_labels, 3, ball_tree, centroids_s)

98.5 ms ± 1.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [146]:
%%cython -a
import cython
from cpython cimport bool

from collections import Counter
import heapq

#Wrapper Over heapq
cdef class PriorityQueue:
    
    cdef public list queue
    cdef public object cnt
    cdef public bool is_min_heap

    def __init__(self, is_min_heap=True):
        self.queue = []
        self.cnt = Counter()
        self.is_min_heap = is_min_heap

    cpdef heappush(self, priority, value):
        self.cnt[priority] += 1
        q_priority = priority if self.is_min_heap else -1*priority
        heapq.heappush(self.queue, (q_priority, self.cnt[priority], value))

    cpdef heappushpop(self, priority, value):
        self.cnt[priority] += 1
        q_priority = priority if self.is_min_heap else -1*priority
        heapq.heappushpop(self.queue, (q_priority, self.cnt[priority], value))

    cpdef heappop(self):
        curr_top = heapq.heappop(self.queue)
        q_priority = curr_top[0] if self.is_min_heap else -1*curr_top[0]
        return q_priority, curr_top[2]

    cpdef peektop(self):
        curr_top = self.queue[0]
        q_priority = curr_top[0] if self.is_min_heap else -1*curr_top[0]
        return q_priority, curr_top[2]

In [148]:
test= PriorityQueue()
test.heappush(1, 5)
print(test.queue)

[(1, 1, 5)]


In [151]:
%%cython -a
import cython
import numpy as np
cimport numpy as np
from knn.distance_metrics import euclidean

from collections import Counter
import heapq
from cpython cimport bool

#Wrapper Over heapq
cdef class PriorityQueue:
    
    cdef public list queue
    cdef public object cnt
    cdef public bool is_min_heap

    def __init__(self, is_min_heap=True):
        self.queue = []
        self.cnt = Counter()
        self.is_min_heap = is_min_heap

    cpdef heappush(self, priority, value):
        self.cnt[priority] += 1
        q_priority = priority if self.is_min_heap else -1*priority
        heapq.heappush(self.queue, (q_priority, self.cnt[priority], value))

    cpdef heappushpop(self, priority, value):
        self.cnt[priority] += 1
        q_priority = priority if self.is_min_heap else -1*priority
        heapq.heappushpop(self.queue, (q_priority, self.cnt[priority], value))

    cpdef heappop(self):
        curr_top = heapq.heappop(self.queue)
        q_priority = curr_top[0] if self.is_min_heap else -1*curr_top[0]
        return q_priority, curr_top[2]

    cpdef peektop(self):
        curr_top = self.queue[0]
        q_priority = curr_top[0] if self.is_min_heap else -1*curr_top[0]
        return q_priority, curr_top[2]



@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void query_ball_tree_cyth(np.ndarray[double, ndim=2] target_vect, int k, PriorityQueue queue,
                                dict curr_node, np.ndarray[double, ndim=1] centroid_dists, int index):

    cdef size_t i
    cdef int numb_points
    cdef np.ndarray[double, ndim=2] dists
    cdef int left_index, right_index
    
    # Prune This Ball
    if centroid_dists[index] - curr_node["radius"] < -1*queue.queue[0][0]:
        
        # Is Leaf
        if "data" in curr_node:

            numb_points = curr_node["data"].shape[0]
            dists = euclidean(target_vect, curr_node["data"][:, 1:])

            for i in range(numb_points):
                if dists[i] < -1*queue.queue[0][0]:
                    queue.heappushpop(np.asscalar(dists[i]), curr_node["data"][i, :])


        # Not Leaf So Explore Children
        else:

            left_index = 2 * index + 1
            right_index = left_index + 1

            if centroid_dists[left_index] < centroid_dists[right_index]:
                query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists,left_index)
                query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists,right_index)
            else:
                query_ball_tree_cyth(target_vect, k, queue, curr_node["right_child"], centroid_dists,right_index)
                query_ball_tree_cyth(target_vect, k, queue, curr_node["left_child"], centroid_dists,left_index)

In [153]:
leaf_size = 10
numb_of_leafs = math.ceil(train_data.shape[0] / leaf_size)
tree_height =  1 + math.ceil(np.log2(numb_of_leafs))
numb_of_nodes = int(2 ** tree_height) - 1 
centroids_s = np.zeros((numb_of_nodes, test_data.shape[1]))

ball_tree = create_ball_tree(train_data_with_inds, leaf_size, centroids_s, 0)

%timeit result = classify_ball_tree_cyth(train_data, train_data, train_labels, 3, ball_tree, centroids_s)

97.3 ms ± 2.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
