# Vectorized Distance Metrics


In [1]:
# Create Some Basic Testing Data
import numpy as np

train_data = np.array([[1., 2., 3.], [5., 6., 7.], [8., 9., 10.]])
test_data = np.array([[5., 10., 15.], [10., 20., 30.]])

print("Train Data: \n" + str(train_data))
print("Test Data: \n" + str(test_data))

Train Data: 
[[ 1.  2.  3.]
 [ 5.  6.  7.]
 [ 8.  9. 10.]]
Test Data: 
[[ 5. 10. 15.]
 [10. 20. 30.]]


## Euclidean

In [2]:
# Find Difference Between Train and Test Vectors
diff = train_data-test_data[:, np.newaxis]
print("Difference Of Vectors: \n" + str(diff))

Difference Of Vectors: 
[[[ -4.  -8. -12.]
  [  0.  -4.  -8.]
  [  3.  -1.  -5.]]

 [[ -9. -18. -27.]
  [ -5. -14. -23.]
  [ -2. -11. -20.]]]


In [3]:
squares = np.square(diff)
print("Square Of Differences: \n" + str(squares))

Square Of Differences: 
[[[ 16.  64. 144.]
  [  0.  16.  64.]
  [  9.   1.  25.]]

 [[ 81. 324. 729.]
  [ 25. 196. 529.]
  [  4. 121. 400.]]]


In [4]:
sum_of_squares = np.sum(np.square(squares), axis=2)
print("Sum Of Squares: \n" + str(sum_of_squares))

Sum Of Squares: 
[[ 25088.   4352.    707.]
 [642978. 318882. 174657.]]


In [5]:
distance_matrix = np.sqrt(np.sum(np.square(train_data-test_data[:, np.newaxis]), axis=2))
print("Distance Matrix:\n" + str(distance_matrix))

Distance Matrix:
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]


In [6]:
from knn.distance_metrics import euclidean

rand_train = np.random.randn(100,10)
rand_test = np.random.randn(100,10)

def euclidean_vect(train_data, test_data):
    return np.sqrt(np.sum(np.square(train_data-test_data[:, np.newaxis]), axis=2))

def euclidean_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = euclidean(train_vect, test_vect)
    return distance_matrix


print("Vectorized Result: \n" + str(euclidean_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(euclidean_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = euclidean_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = euclidean_no_vect(rand_train, rand_test)

Vectorized Result: 
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]
Non-Vectorized Result: 
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]
Vectorized Timing:
CPU times: user 1.54 ms, sys: 875 µs, total: 2.41 ms
Wall time: 1.98 ms
Non-Vectorized Timing:
CPU times: user 47.3 ms, sys: 1.2 ms, total: 48.5 ms
Wall time: 50.6 ms


This definitely works, but relies heavily on broadcasting. We can expand the euclidean distance formula and avoid a majority of the broadcasting: sum((x-y)^2) = sum(x^2)+sum(y^2)-2sum(xy).

In [7]:
# X^2 Term
x_squared_term = np.sum(np.square(test_data), axis=1)
print("sum(x^2) Term:\n" + str(x_squared_term))

# Y^2 Term
y_squared_term = np.sum(np.square(train_data), axis=1)
print("sum(y^2) Term:\n" + str(y_squared_term))

sum(x^2) Term:
[ 350. 1400.]
sum(y^2) Term:
[ 14. 110. 245.]


In [8]:
#-2 * XY Term
x_by_y = -2*np.dot(test_data, train_data.T)
print("-2sum(xy) Term:\n" + str(x_by_y))

-2sum(xy) Term:
[[ -140.  -380.  -560.]
 [ -280.  -760. -1120.]]


In [9]:
# Only Broad Casting Needed
x_plus_y = x_squared_term[:, np.newaxis] + y_squared_term
print("sum(x^2)+sum(y^2) Term:\n" + str(x_by_y))

sum(x^2)+sum(y^2) Term:
[[ -140.  -380.  -560.]
 [ -280.  -760. -1120.]]


In [10]:
distance = np.sqrt(x_squared_term[:, np.newaxis] + y_squared_term + x_by_y)
print("Distance:\n" + str(distance))

Distance:
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]


In [11]:
def euclidean_vect_less_broad(train_data, test_data):
    return np.sqrt(np.sum(np.square(test_data), axis=1)[:, np.newaxis] + np.sum(np.square(train_data), axis=1) - 2*np.dot(test_data, train_data.T))



print("Vectorized Result: \n" + str(euclidean_vect(train_data, test_data)))
print("Vectorized Result Less Broadcasting: \n" + str(euclidean_vect_less_broad(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(euclidean_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = euclidean_vect(rand_train, rand_test)
print("Vectorized Timing Less Broadcasting:")
%time result = euclidean_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = euclidean_no_vect(rand_train, rand_test)

Vectorized Result: 
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]
Vectorized Result Less Broadcasting: 
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]
Non-Vectorized Result: 
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]
Vectorized Timing:
CPU times: user 1.58 ms, sys: 392 µs, total: 1.97 ms
Wall time: 1.49 ms
Vectorized Timing Less Broadcasting:
CPU times: user 739 µs, sys: 65 µs, total: 804 µs
Wall time: 666 µs
Non-Vectorized Timing:
CPU times: user 39.1 ms, sys: 213 µs, total: 39.3 ms
Wall time: 39.5 ms


## L1-Norm or Manhattan Distance

In [12]:
# Find Difference Between Train and Test Vectors
diff = train_data-test_data[:, np.newaxis]
print("Difference Of Vectors: \n" + str(diff))

Difference Of Vectors: 
[[[ -4.  -8. -12.]
  [  0.  -4.  -8.]
  [  3.  -1.  -5.]]

 [[ -9. -18. -27.]
  [ -5. -14. -23.]
  [ -2. -11. -20.]]]


In [13]:
absolute_diff = np.abs(diff)
print("Absolute Differences: \n" + str(absolute_diff))

Absolute Differences: 
[[[ 4.  8. 12.]
  [ 0.  4.  8.]
  [ 3.  1.  5.]]

 [[ 9. 18. 27.]
  [ 5. 14. 23.]
  [ 2. 11. 20.]]]


In [14]:
distance_matrix = np.sum(absolute_diff, axis=2)
print("Distance Matrix:\n" + str(distance_matrix))

Distance Matrix:
[[24. 12.  9.]
 [54. 42. 33.]]


In [15]:
from knn.distance_metrics import manhattan

rand_train = np.random.randn(100,10)
rand_test = np.random.randn(100,10)

def manhattan_vect(train_data, test_data):
    return np.sum(np.abs(train_data-test_data[:, np.newaxis]), axis=2)

def manhattan_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = manhattan(train_vect, test_vect)
    return distance_matrix


print("Vectorized Result: \n" + str(manhattan_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(manhattan_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = euclidean_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = euclidean_no_vect(rand_train, rand_test)

Vectorized Result: 
[[24. 12.  9.]
 [54. 42. 33.]]
Non-Vectorized Result: 
[[24. 12.  9.]
 [54. 42. 33.]]
Vectorized Timing:
CPU times: user 1.15 ms, sys: 168 µs, total: 1.32 ms
Wall time: 772 µs
Non-Vectorized Timing:
CPU times: user 46 ms, sys: 1.84 ms, total: 47.8 ms
Wall time: 48.7 ms


## Hamming Distance

In [16]:
differences = train_data != test_data[:, np.newaxis]
print("Logical Differences: \n" + str(differences))

Logical Differences: 
[[[ True  True  True]
  [False  True  True]
  [ True  True  True]]

 [[ True  True  True]
  [ True  True  True]
  [ True  True  True]]]


In [17]:
sum_of_diffs = np.sum(np.abs(train_data != test_data[:, np.newaxis]), axis=2)
print("Sum Of Differences: \n" + str(sum_of_diffs))

Sum Of Differences: 
[[3 2 3]
 [3 3 3]]


In [18]:
from knn.distance_metrics import hamming

rand_train = np.random.randn(100,10)
rand_test = np.random.randn(100,10)

def hamming_vect(train_data, test_data):
    return np.sum(np.abs(train_data != test_data[:, np.newaxis]), axis=2)

def hamming_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = hamming(train_vect, test_vect)
    return distance_matrix


print("Vectorized Result: \n" + str(hamming_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(hamming_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = hamming_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = hamming_no_vect(rand_train, rand_test)

Vectorized Result: 
[[3 2 3]
 [3 3 3]]
Non-Vectorized Result: 
[[3. 2. 3.]
 [3. 3. 3.]]
Vectorized Timing:
CPU times: user 1.25 ms, sys: 785 µs, total: 2.03 ms
Wall time: 1.04 ms
Non-Vectorized Timing:
CPU times: user 79.7 ms, sys: 1.02 ms, total: 80.8 ms
Wall time: 82.8 ms


## Cosine Sim

In [19]:
train_norms = np.sum(np.square(train_data), axis=1)
test_norms = np.sum(np.square(test_data), axis=1)
print("Train Vector Norms:\n" + str(train_norms))
print("Test Vector Norms:\n" + str(test_norms))

Train Vector Norms:
[ 14. 110. 245.]
Test Vector Norms:
[ 350. 1400.]


In [20]:
prod_of_norms = np.outer(train_norms, test_norms)
print("Product Of Norms:\n" + str(prod_of_norms))

Product Of Norms:
[[  4900.  19600.]
 [ 38500. 154000.]
 [ 85750. 343000.]]


In [21]:
dots = np.dot(train_data, test_data.T)
print("Dot Products:\n" + str(dots))

Dot Products:
[[ 70. 140.]
 [190. 380.]
 [280. 560.]]


In [22]:
similarity_matrix = dots/prod_of_norms
print("Similarity Matrix:\n" + str(similarity_matrix))

Similarity Matrix:
[[0.01428571 0.00714286]
 [0.00493506 0.00246753]
 [0.00326531 0.00163265]]


In [23]:
from knn.distance_metrics import cosine

def cosine_vect(train_data, test_data):
    train_norms = np.sqrt(np.sum(np.square(train_data), axis=1))
    test_norms = np.sqrt(np.sum(np.square(test_data), axis=1))
    return 1-(np.dot(train_data, test_data.T)/np.outer(train_norms, test_norms)).T

def cosine_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = cosine(train_vect, test_vect)
    return distance_matrix
            
            
print("Vectorized Result: \n" + str(cosine_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(cosine_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = cosine_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = cosine_no_vect(rand_train, rand_test)

Vectorized Result: 
[[0.         0.03167034 0.04381711]
 [0.         0.03167034 0.04381711]]
Non-Vectorized Result: 
[[0.         0.03167034 0.04381711]
 [0.         0.03167034 0.04381711]]
Vectorized Timing:
CPU times: user 1.4 ms, sys: 478 µs, total: 1.88 ms
Wall time: 1.48 ms
Non-Vectorized Timing:
CPU times: user 79.2 ms, sys: 1.82 ms, total: 81 ms
Wall time: 81 ms


# Correlation

In [24]:
mean_removed_train = train_data-np.mean(train_data, axis=1)[:, np.newaxis]
mean_removed_test = test_data-np.mean(test_data, axis=1)[:, np.newaxis,]

print("Mean Removed Train:\n" + str(mean_removed_train))
print("Mean Removed Test:\n" + str(mean_removed_test))

Mean Removed Train:
[[-1.  0.  1.]
 [-1.  0.  1.]
 [-1.  0.  1.]]
Mean Removed Test:
[[ -5.   0.   5.]
 [-10.   0.  10.]]


In [25]:
std_dev_train = np.sqrt(np.sum(np.square(mean_removed_train), axis=1))
print("Std Dev Train:\n" + str(std_dev_train))

std_dev_test = np.sqrt(np.sum(np.square(mean_removed_test), axis=1))
print("Std Dev Train:\n" + str(std_dev_test))

Std Dev Train:
[1.41421356 1.41421356 1.41421356]
Std Dev Train:
[ 7.07106781 14.14213562]


In [26]:
cov = np.dot(mean_removed_train, mean_removed_test.T)
print("Covariance Matrix:\n" + str(cov))

Covariance Matrix:
[[10. 20.]
 [10. 20.]
 [10. 20.]]


In [27]:
std_dev_dot = np.outer(std_dev_train, std_dev_test)
print("Std Dev Dots:\n" + str(std_dev_dot))

Std Dev Dots:
[[10. 20.]
 [10. 20.]
 [10. 20.]]


In [28]:
correlations = cov/std_dev_dot
print("Correlations:\n" + str(correlations))

Correlations:
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [29]:
from knn.distance_metrics import pearson

def pearson_vect(train_data, test_data):
    mean_removed_train = (train_data-np.mean(train_data, axis=1)[:, np.newaxis])
    mean_removed_test = (test_data-np.mean(test_data, axis=1)[:, np.newaxis])
    std_dev_train = np.sqrt(np.sum(np.square(mean_removed_train), axis=1))
    std_dev_test = np.sqrt(np.sum(np.square(mean_removed_test), axis=1))
    return 1-np.dot(mean_removed_train, mean_removed_test.T)/np.outer(std_dev_train, std_dev_test)

def pearson_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = pearson(train_vect, test_vect)
    return distance_matrix



print("Vectorized Result: \n" + str(pearson_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(pearson_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = pearson_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = pearson_no_vect(rand_train, rand_test)

Vectorized Result: 
[[2.22044605e-16 2.22044605e-16]
 [2.22044605e-16 2.22044605e-16]
 [2.22044605e-16 2.22044605e-16]]
Non-Vectorized Result: 
[[2.22044605e-16 2.22044605e-16 2.22044605e-16]
 [2.22044605e-16 2.22044605e-16 2.22044605e-16]]
Vectorized Timing:
CPU times: user 1.1 ms, sys: 208 µs, total: 1.31 ms
Wall time: 1.31 ms
Non-Vectorized Timing:
CPU times: user 302 ms, sys: 14.3 ms, total: 316 ms
Wall time: 308 ms


## Chi Squared

In [30]:
all_col_sum = train_data + test_data[:, np.newaxis]
print("All Column Sums:\n" + str(all_col_sum))

All Column Sums:
[[[ 6. 12. 18.]
  [10. 16. 22.]
  [13. 19. 25.]]

 [[11. 22. 33.]
  [15. 26. 37.]
  [18. 29. 40.]]]


In [31]:
all_col_sum_recip = np.reciprocal(all_col_sum, where=(all_col_sum != 0.0))
print("All Column Sums Reciprocal:\n" + str(all_col_sum_recip))

All Column Sums Reciprocal:
[[[0.16666667 0.08333333 0.05555556]
  [0.1        0.0625     0.04545455]
  [0.07692308 0.05263158 0.04      ]]

 [[0.09090909 0.04545455 0.03030303]
  [0.06666667 0.03846154 0.02702703]
  [0.05555556 0.03448276 0.025     ]]]


In [32]:
vector_train_sum = np.sum(train_data, axis=1)
print("Sum Of Train Vectors:\n" + str(vector_train_sum))
vector_test_sum = np.sum(test_data, axis=1)
print("Sum Of Test Vectors:\n" + str(vector_test_sum))

Sum Of Train Vectors:
[ 6. 18. 27.]
Sum Of Test Vectors:
[30. 60.]


In [33]:
rel_freq_train = train_data/vector_train_sum[:, np.newaxis]
print("Relative Freq Train Vectors:\n" + str(rel_freq_train))

rel_freq_test = test_data/vector_test_sum[:, np.newaxis]
print("Relative Freq Test Vectors:\n" + str(rel_freq_test))

Relative Freq Train Vectors:
[[0.16666667 0.33333333 0.5       ]
 [0.27777778 0.33333333 0.38888889]
 [0.2962963  0.33333333 0.37037037]]
Relative Freq Test Vectors:
[[0.16666667 0.33333333 0.5       ]
 [0.16666667 0.33333333 0.5       ]]


In [34]:
diff_rel_freq = rel_freq_train-rel_freq_test[:, np.newaxis]
print("Difference Between Relative Freq of Vectors:\n" + str(diff_rel_freq))

diff_rel_freq_square = np.square(diff_rel_freq)
print("Difference Between Relative Freq of Vectors Squared:\n" + str(diff_rel_freq_square))

Difference Between Relative Freq of Vectors:
[[[ 0.          0.          0.        ]
  [ 0.11111111  0.         -0.11111111]
  [ 0.12962963  0.         -0.12962963]]

 [[ 0.          0.          0.        ]
  [ 0.11111111  0.         -0.11111111]
  [ 0.12962963  0.         -0.12962963]]]
Difference Between Relative Freq of Vectors Squared:
[[[0.         0.         0.        ]
  [0.01234568 0.         0.01234568]
  [0.01680384 0.         0.01680384]]

 [[0.         0.         0.        ]
  [0.01234568 0.         0.01234568]
  [0.01680384 0.         0.01680384]]]


In [35]:
recips_prod_with_diffs_rel_freq = all_col_sum_recip * diff_rel_freq_square
print("Product of Col Sums Recips and Diffs Of Rel Freq Squared:\n" + str(recips_prod_with_diffs_rel_freq))

sums_of_recips_prod_diffs_rel_freq = np.sum(all_col_sum_recip * diff_rel_freq_square, axis=2)
print("Sums of Products of Col Sums Recips and Diffs Of Rel Freq Squared:\n" + str(sums_of_recips_prod_diffs_rel_freq))

chisqr = np.sqrt(np.sum(all_col_sum_recip * diff_rel_freq_square, axis=2))
print("Chi Squared Stats: \n" + str(chisqr))

Product of Col Sums Recips and Diffs Of Rel Freq Squared:
[[[0.         0.         0.        ]
  [0.00123457 0.         0.00056117]
  [0.0012926  0.         0.00067215]]

 [[0.         0.         0.        ]
  [0.00082305 0.         0.00033367]
  [0.00093355 0.         0.0004201 ]]]
Sums of Products of Col Sums Recips and Diffs Of Rel Freq Squared:
[[0.         0.00179574 0.00196476]
 [0.         0.00115671 0.00135364]]
Chi Squared Stats: 
[[0.         0.04237612 0.04432558]
 [0.         0.03401047 0.03679188]]


In [36]:
from knn.distance_metrics import chisqr

rand_train = np.random.randint(11, size=(100, 10))
rand_test = np.random.randint(11, size=(100, 10))


def chisqr_vect(train_data, test_data):
    all_col_sum = train_data + test_data[:, np.newaxis]
    all_col_sum_recip = np.reciprocal(all_col_sum, where=(all_col_sum != 0.0))
    vector_train_sum = np.sum(train_data, axis=1)
    vector_test_sum = np.sum(test_data, axis=1)
    
    rel_freq_train = np.divide(train_data, vector_train_sum[:, np.newaxis], 
                               out=np.full([train_data.shape[0], train_data.shape[1]], np.nan),
                               where=(vector_train_sum[:, np.newaxis] != 0))
    
    rel_freq_test = np.divide(test_data, vector_test_sum[:, np.newaxis], 
                               out=np.full([test_data.shape[0], test_data.shape[1]], np.nan),
                               where=(vector_test_sum[:, np.newaxis] != 0))
    
    diff_rel_freq_squared = np.square(rel_freq_train-rel_freq_test[:, np.newaxis])
    chisqr = np.sqrt(np.sum(all_col_sum_recip * diff_rel_freq_squared, axis=2))
    return chisqr

def chisqr_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = chisqr(train_vect, test_vect)
    return distance_matrix



print("Vectorized Result: \n" + str(chisqr_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(chisqr_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = chisqr_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = chisqr_no_vect(rand_train, rand_test)

Vectorized Result: 
[[0.         0.04237612 0.04432558]
 [0.         0.03401047 0.03679188]]
Non-Vectorized Result: 
[[0.         0.04237612 0.04432558]
 [0.         0.03401047 0.03679188]]
Vectorized Timing:
CPU times: user 3.35 ms, sys: 1.76 ms, total: 5.11 ms
Wall time: 3.08 ms
Non-Vectorized Timing:
CPU times: user 321 ms, sys: 23.6 ms, total: 344 ms
Wall time: 328 ms


## Benchmarking With KNN

In [37]:
rand_train = np.random.randn(10,10)
rand_test = np.random.randn(10,10)

In [38]:
def knn_bf_introselect(train, test , distance, k=1):
    distance_matrix = distance(train, test)
    k_smallest_ind = np.argpartition(distance_matrix, k-1)[:,:k] 
    return k_smallest_ind, np.array([distance_matrix[i, x] for i, x in enumerate(k_smallest_ind)])
    

print("KNN - Introselect k=1: \n" + str(knn_bf_introselect(train_data, test_data, euclidean_vect , 2)))

%timeit -r 15 result_1 = knn_bf_introselect(rand_train, rand_test, euclidean_vect, 1);
%timeit -r 15 result_2 = knn_bf_introselect(rand_train, rand_test, euclidean_vect, 3);
%timeit -r 15 result_3 = knn_bf_introselect(rand_train, rand_test, euclidean_vect, 5);
%timeit -r 15 result_4 = knn_bf_introselect(rand_train, rand_test, euclidean_vect, 7);

KNN - Introselect k=1: 
(array([[2, 1],
       [2, 1]]), array([[ 5.91607978,  8.94427191],
       [22.91287847, 27.38612788]]))
42.1 µs ± 799 ns per loop (mean ± std. dev. of 15 runs, 10000 loops each)
42.3 µs ± 963 ns per loop (mean ± std. dev. of 15 runs, 10000 loops each)
42.7 µs ± 880 ns per loop (mean ± std. dev. of 15 runs, 10000 loops each)
42.7 µs ± 835 ns per loop (mean ± std. dev. of 15 runs, 10000 loops each)


In [39]:
import heapq

def knn_bf_max_heap(train, test, distance, k=1):
    distance_matrix = distance(train, test)
    
    smallest_k_matrix = np.zeros((test.shape[0], k), dtype=[("index",int), ("dist",float)])
    indices_list = []
    distances_list = []
    for index, row in enumerate(distance_matrix):
        heapq.heapify(row.tolist())
        top_k = heapq.nsmallest(k, enumerate(row), key=lambda x: x[1])
        indices_list.append([item[0] for item in top_k])
        distances_list.append([item[1] for item in top_k])
        
    return np.array(indices_list), np.array(distances_list)
    
    
print("KNN - Introselect k=1: \n" + str(knn_bf_max_heap(train_data, test_data, euclidean_vect , 2)))
%timeit -r 15 result_1 = knn_bf_introselect(rand_train, rand_test, euclidean_vect, 1);
%timeit -r 15 result_2 = knn_bf_introselect(rand_train, rand_test, euclidean_vect, 3);
%timeit -r 15 result_3 = knn_bf_introselect(rand_train, rand_test, euclidean_vect, 5);
%timeit -r 15 result_4 = knn_bf_introselect(rand_train, rand_test, euclidean_vect, 7);

KNN - Introselect k=1: 
(array([[2, 1],
       [2, 1]]), array([[ 5.91607978,  8.94427191],
       [22.91287847, 27.38612788]]))
42.1 µs ± 582 ns per loop (mean ± std. dev. of 15 runs, 10000 loops each)
42.4 µs ± 947 ns per loop (mean ± std. dev. of 15 runs, 10000 loops each)
42.6 µs ± 934 ns per loop (mean ± std. dev. of 15 runs, 10000 loops each)
42.9 µs ± 1.16 µs per loop (mean ± std. dev. of 15 runs, 10000 loops each)
