# Vectorized Distance Metrics


In [388]:
# Create Some Basic Testing Data
import numpy as np

train_data = np.array([[1., 2., 3.], [5., 6., 7.], [8., 9., 10.]])
test_data = np.array([[5., 10., 15.], [10., 20., 30.]])

print("Train Data: \n" + str(train_data))
print("Test Data: \n" + str(test_data))

Train Data: 
[[ 1.  2.  3.]
 [ 5.  6.  7.]
 [ 8.  9. 10.]]
Test Data: 
[[ 5. 10. 15.]
 [10. 20. 30.]]


## Euclidean

In [245]:
# Find Difference Between Train and Test Vectors
diff = train_data-test_data[:, np.newaxis]
print("Difference Of Vectors: \n" + str(diff))

Difference Of Vectors: 
[[[ -4  -8 -12]
  [  0  -4  -8]
  [  3  -1  -5]]

 [[ -9 -18 -27]
  [ -5 -14 -23]
  [ -2 -11 -20]]]


In [246]:
squares = np.square(diff)
print("Square Of Differences: \n" + str(squares))

Square Of Differences: 
[[[ 16  64 144]
  [  0  16  64]
  [  9   1  25]]

 [[ 81 324 729]
  [ 25 196 529]
  [  4 121 400]]]


In [247]:
sum_of_squares = np.sum(np.square(test), axis=2)
print("Sum Of Squares: \n" + str(sum_of_squares))

Sum Of Squares: 
[[ 224   80   35]
 [1134  750  525]]


In [248]:
distance_matrix = np.sqrt(np.sum(np.square(test), axis=2))
print("Distance Matrix:\n" + str(distance_matrix))

Distance Matrix:
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]


In [258]:
from knn.distance_metrics import euclidean

rand_train = np.random.randn(100,10)
rand_test = np.random.randn(100,10)

def euclidean_vect(train_data, test_data):
    return np.sqrt(np.sum(np.square(train_data-test_data[:, np.newaxis]), axis=2))

def euclidean_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = euclidean(train_vect, test_vect)
    return distance_matrix


print("Vectorized Result: \n" + str(euclidean_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(euclidean_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = euclidean_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = euclidean_no_vect(rand_train, rand_test)

Vectorized Result: 
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]
Non-Vectorized Result: 
[[14.96662955  8.94427191  5.91607978]
 [33.67491648 27.38612788 22.91287847]]
Vectorized Timing:
CPU times: user 1.02 ms, sys: 353 µs, total: 1.38 ms
Wall time: 904 µs
Non-Vectorized Timing:
CPU times: user 39.2 ms, sys: 1.63 ms, total: 40.8 ms
Wall time: 40 ms


## L1-Norm or Manhattan Distance

In [268]:
# Find Difference Between Train and Test Vectors
diff = train_data-test_data[:, np.newaxis]
print("Difference Of Vectors: \n" + str(diff))

Difference Of Vectors: 
[[[ -4  -8 -12]
  [  0  -4  -8]
  [  3  -1  -5]]

 [[ -9 -18 -27]
  [ -5 -14 -23]
  [ -2 -11 -20]]]


In [269]:
absolute_diff = np.abs(diff)
print("Absolute Differences: \n" + str(absolute_diff))

Absolute Differences: 
[[[ 4  8 12]
  [ 0  4  8]
  [ 3  1  5]]

 [[ 9 18 27]
  [ 5 14 23]
  [ 2 11 20]]]


In [270]:
distance_matrix = np.sum(absolute_diff, axis=2)
print("Distance Matrix:\n" + str(distance_matrix))

Distance Matrix:
[[24 12  9]
 [54 42 33]]


In [274]:
from knn.distance_metrics import manhattan

rand_train = np.random.randn(100,10)
rand_test = np.random.randn(100,10)

def manhattan_vect(train_data, test_data):
    return np.sum(np.abs(train_data-test_data[:, np.newaxis]), axis=2)

def manhattan_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = manhattan(train_vect, test_vect)
    return distance_matrix


print("Vectorized Result: \n" + str(manhattan_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(manhattan_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = euclidean_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = euclidean_no_vect(rand_train, rand_test)

Vectorized Result: 
[[24 12  9]
 [54 42 33]]
Non-Vectorized Result: 
[[24. 12.  9.]
 [54. 42. 33.]]
Vectorized Timing:
CPU times: user 1.23 ms, sys: 294 µs, total: 1.52 ms
Wall time: 866 µs
Non-Vectorized Timing:
CPU times: user 40.8 ms, sys: 1.12 ms, total: 41.9 ms
Wall time: 41.2 ms


## Hamming Distance

In [279]:
differences = train_data != test_data[:, np.newaxis]
print("Logical Differences: \n" + str(differences))

[[[ True  True  True]
  [False  True  True]
  [ True  True  True]]

 [[ True  True  True]
  [ True  True  True]
  [ True  True  True]]]


In [281]:
sum_of_diffs = np.sum(np.abs(train_data != test_data[:, np.newaxis]), axis=2)
print("Sum Of Differences: \n" + str(sum_of_diffs))

Sum Of Differences: 
[[3 2 3]
 [3 3 3]]


In [284]:
from knn.distance_metrics import hamming

rand_train = np.random.randn(100,10)
rand_test = np.random.randn(100,10)

def hamming_vect(train_data, test_data):
    return np.sum(np.abs(train_data != test_data[:, np.newaxis]), axis=2)

def hamming_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = hamming(train_vect, test_vect)
    return distance_matrix


print("Vectorized Result: \n" + str(hamming_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(hamming_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = hamming_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = hamming_no_vect(rand_train, rand_test)

Vectorized Result: 
[[3 2 3]
 [3 3 3]]
Non-Vectorized Result: 
[[3. 2. 3.]
 [3. 3. 3.]]
Vectorized Timing:
CPU times: user 1.83 ms, sys: 1.1 ms, total: 2.93 ms
Wall time: 1.51 ms
Non-Vectorized Timing:
CPU times: user 70.8 ms, sys: 745 µs, total: 71.5 ms
Wall time: 72.7 ms


## Cosine Sim

In [290]:
train_norms = np.sum(np.square(train_data), axis=1)
test_norms = np.sum(np.square(test_data), axis=1)
print("Train Vector Norms:\n" + str(train_norms))
print("Test Vector Norms:\n" + str(test_norms))

Train Vector Norms:
[ 14 110 245]
Test Vector Norms:
[ 350 1400]


In [292]:
prod_of_norms = np.outer(train_norms, test_norms)
print("Product Of Norms:\n" + str(prod_of_norms))

Product Of Norms:
[[  4900  19600]
 [ 38500 154000]
 [ 85750 343000]]


In [293]:
dots = np.dot(train_data, test_data.T)
print("Dot Products:\n" + str(dots))

Dot Products:
[[ 70 140]
 [190 380]
 [280 560]]


In [294]:
similarity_matrix = dots/norm_prods
print("Similarity Matrix:\n" + str(similarity_matrix))

Similarity Matrix:
[[0.01428571 0.00714286]
 [0.00493506 0.00246753]
 [0.00326531 0.00163265]]


In [312]:
from knn.distance_metrics import cosine

def cosine_vect(train_data, test_data):
    train_norms = np.sqrt(np.sum(np.square(train_data), axis=1))
    test_norms = np.sqrt(np.sum(np.square(test_data), axis=1))
    return 1-(np.dot(train_data, test_data.T)/np.outer(train_norms, test_norms)).T

def cosine_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = cosine(train_vect, test_vect)
    return distance_matrix
            
            
print("Vectorized Result: \n" + str(cosine_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(cosine_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = cosine_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = cosine_no_vect(rand_train, rand_test)

Vectorized Result: 
[[0.         0.03167034 0.04381711]
 [0.         0.03167034 0.04381711]]
Non-Vectorized Result: 
[[0.         0.03167034 0.04381711]
 [0.         0.03167034 0.04381711]]
Vectorized Timing:
CPU times: user 713 µs, sys: 323 µs, total: 1.04 ms
Wall time: 727 µs
Non-Vectorized Timing:
CPU times: user 81 ms, sys: 1.31 ms, total: 82.3 ms
Wall time: 82.6 ms


# Correlation

In [298]:
mean_removed_train = train_data-np.mean(train_data, axis=1)[:, np.newaxis]
mean_removed_test = test_data-np.mean(test_data, axis=1)[:, np.newaxis,]

print("Mean Removed Train:\n" + str(mean_removed_train))
print("Mean Removed Test:\n" + str(mean_removed_test))

Mean Removed Train:
[[-1.  0.  1.]
 [-1.  0.  1.]
 [-1.  0.  1.]]
Mean Removed Test:
[[ -5.   0.   5.]
 [-10.   0.  10.]]


In [304]:
std_dev_train = np.sqrt(np.sum(np.square(mean_removed_train), axis=1))
print("Std Dev Train:\n" + str(std_dev_a))

std_dev_test = np.sqrt(np.sum(np.square(mean_removed_test), axis=1))
print("Std Dev Train:\n" + str(std_dev_b))

Std Dev Train:
[1.41421356 1.41421356 1.41421356]
Std Dev Train:
[ 7.07106781 14.14213562]


In [305]:
cov = np.dot(mean_removed_train, mean_removed_test.T)
print("Covariance Matrix:\n" + str(cov))

Covariance Matrix:
[[10. 20.]
 [10. 20.]
 [10. 20.]]


In [307]:
std_dev_dot = np.outer(std_dev_train, std_dev_test)
print("Std Dev Dots:\n" + str(std_dev_dot))

Std Dev Dots:
[[10. 20.]
 [10. 20.]
 [10. 20.]]


In [308]:
correlations = cov/std_dev_dot
print("Correlations:\n" + str(correlations))

Correlations:
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [314]:
from knn.distance_metrics import pearson

def pearson_vect(train_data, test_data):
    mean_removed_train = (train_data-np.mean(train_data, axis=1)[:, np.newaxis])
    mean_removed_test = (test_data-np.mean(test_data, axis=1)[:, np.newaxis])
    std_dev_train = np.sqrt(np.sum(np.square(mean_removed_train), axis=1))
    std_dev_test = np.sqrt(np.sum(np.square(mean_removed_test), axis=1))
    return 1-np.dot(mean_removed_train, mean_removed_test.T)/np.outer(std_dev_train, std_dev_test)

def pearson_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = pearson(train_vect, test_vect)
    return distance_matrix



print("Vectorized Result: \n" + str(pearson_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(pearson_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = pearson_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = pearson_no_vect(rand_train, rand_test)

Vectorized Result: 
[[2.22044605e-16 2.22044605e-16]
 [2.22044605e-16 2.22044605e-16]
 [2.22044605e-16 2.22044605e-16]]
Non-Vectorized Result: 
[[2.22044605e-16 2.22044605e-16 2.22044605e-16]
 [2.22044605e-16 2.22044605e-16 2.22044605e-16]]
Vectorized Timing:
CPU times: user 396 µs, sys: 222 µs, total: 618 µs
Wall time: 405 µs
Non-Vectorized Timing:
CPU times: user 322 ms, sys: 10.5 ms, total: 332 ms
Wall time: 328 ms


## Chi Squared

In [397]:
all_col_sum = train_data + test_data[:, np.newaxis]
print("All Column Sums:\n" + str(all_col_sum))

[[ 1.  2.  3.]
 [ 5.  6.  7.]
 [ 8.  9. 10.]]
[[ 5. 10. 15.]
 [10. 20. 30.]]
All Column Sums:
[[[ 6. 12. 18.]
  [10. 16. 22.]
  [13. 19. 25.]]

 [[11. 22. 33.]
  [15. 26. 37.]
  [18. 29. 40.]]]


In [398]:
all_col_sum_recip = np.reciprocal(all_col_sum, where=(all_col_sum != 0.0))
print("All Column Sums Reciprocal:\n" + str(all_col_sum_recip))

All Column Sums Reciprocal:
[[[0.16666667 0.08333333 0.05555556]
  [0.1        0.0625     0.04545455]
  [0.07692308 0.05263158 0.04      ]]

 [[0.09090909 0.04545455 0.03030303]
  [0.06666667 0.03846154 0.02702703]
  [0.05555556 0.03448276 0.025     ]]]


In [402]:
vector_train_sum = np.sum(train_data, axis=1)
print("Sum Of Train Vectors:\n" + str(vector_train_sum))
vector_test_sum = np.sum(test_data, axis=1)
print("Sum Of Test Vectors:\n" + str(vector_test_sum))

Sum Of Train Vectors:
[ 6. 18. 27.]
Sum Of Test Vectors:
[30. 60.]


In [405]:
rel_freq_train = train_data/vector_train_sum[:, np.newaxis]
print("Relative Freq Train Vectors:\n" + str(rel_freq_train))

rel_freq_test = test_data/vector_test_sum[:, np.newaxis]
print("Relative Freq Test Vectors:\n" + str(rel_freq_test))

Relative Freq Train Vectors:
[[0.16666667 0.33333333 0.5       ]
 [0.27777778 0.33333333 0.38888889]
 [0.2962963  0.33333333 0.37037037]]
Relative Freq Test Vectors:
[[0.16666667 0.33333333 0.5       ]
 [0.16666667 0.33333333 0.5       ]]


In [408]:
diff_rel_freq = rel_freq_train-rel_freq_test[:, np.newaxis]
print("Difference Between Relative Freq of Vectors:\n" + str(diff_rel_freq))

diff_rel_freq_square = np.square(diff_rel_freq)
print("Difference Between Relative Freq of Vectors Squared:\n" + str(diff_rel_freq_square))

Difference Between Relative Freq of Vectors:
[[[ 0.          0.          0.        ]
  [ 0.11111111  0.         -0.11111111]
  [ 0.12962963  0.         -0.12962963]]

 [[ 0.          0.          0.        ]
  [ 0.11111111  0.         -0.11111111]
  [ 0.12962963  0.         -0.12962963]]]
Difference Between Relative Freq of Vectors Squared:
[[[0.         0.         0.        ]
  [0.01234568 0.         0.01234568]
  [0.01680384 0.         0.01680384]]

 [[0.         0.         0.        ]
  [0.01234568 0.         0.01234568]
  [0.01680384 0.         0.01680384]]]


In [423]:
recips_prod_with_diffs_rel_freq = all_col_sum_recip * diff_rel_freq_square
print("Product of Col Sums Recips and Diffs Of Rel Freq Squared:\n" + str(recips_prod_with_diffs_rel_freq))

sums_of_recips_prod_diffs_rel_freq = np.sum(all_col_sum_recip * diff_rel_freq_square, axis=2)
print("Sums of Products of Col Sums Recips and Diffs Of Rel Freq Squared:\n" + str(sums_of_recips_prod_diffs_rel_freq))

chisqr = np.sqrt(np.sum(all_col_sum_recip * diff_rel_freq_square, axis=2))
print("Chi Squared Stats: \n" + str(chisqr))

Product of Col Sums Recips and Diffs Of Rel Freq Squared:
[[[0.         0.         0.        ]
  [0.00123457 0.         0.00056117]
  [0.0012926  0.         0.00067215]]

 [[0.         0.         0.        ]
  [0.00082305 0.         0.00033367]
  [0.00093355 0.         0.0004201 ]]]
Sums of Products of Col Sums Recips and Diffs Of Rel Freq Squared:
[[0.         0.00179574 0.00196476]
 [0.         0.00115671 0.00135364]]
Chi Squared Stats: 
[[0.         0.04237612 0.04432558]
 [0.         0.03401047 0.03679188]]


In [431]:
from knn.distance_metrics import chisqr

rand_train = np.random.randint(11, size=(100, 10))
rand_test = np.random.randint(11, size=(100, 10))


def chisqr_vect(train_data, test_data):
    all_col_sum = train_data + test_data[:, np.newaxis]
    all_col_sum_recip = np.reciprocal(all_col_sum, where=(all_col_sum != 0.0))
    vector_train_sum = np.sum(train_data, axis=1)
    vector_test_sum = np.sum(test_data, axis=1)
    
    rel_freq_train = np.divide(train_data, vector_train_sum[:, np.newaxis], 
                               out=np.full([train_data.shape[0], train_data.shape[1]], np.nan),
                               where=(vector_train_sum[:, np.newaxis] != 0))
    
    rel_freq_test = np.divide(test_data, vector_test_sum[:, np.newaxis], 
                               out=np.full([test_data.shape[0], test_data.shape[1]], np.nan),
                               where=(vector_test_sum[:, np.newaxis] != 0))
    
    diff_rel_freq_squared = np.square(rel_freq_train-rel_freq_test[:, np.newaxis])
    chisqr = np.sqrt(np.sum(all_col_sum_recip * diff_rel_freq_squared, axis=2))
    return chisqr

def chisqr_no_vect(train_data, test_data):
    distance_matrix = np.zeros((test_data.shape[0], train_data.shape[0]))
    for col, train_vect in enumerate(train_data):
        for row, test_vect in enumerate(test_data):
            distance_matrix[row, col] = chisqr(train_vect, test_vect)
    return distance_matrix



print("Vectorized Result: \n" + str(chisqr_vect(train_data, test_data)))
print("Non-Vectorized Result: \n" + str(chisqr_no_vect(train_data, test_data)))

print("Vectorized Timing:")
%time result = chisqr_vect(rand_train, rand_test)
print("Non-Vectorized Timing:")
%time result = chisqr_no_vect(rand_train, rand_test)

Vectorized Result: 
[[0.         0.04237612 0.04432558]
 [0.         0.03401047 0.03679188]]
Non-Vectorized Result: 
[[0.         0.04237612 0.04432558]
 [0.         0.03401047 0.03679188]]
Vectorized Timing:
CPU times: user 2.15 ms, sys: 559 µs, total: 2.71 ms
Wall time: 1.99 ms
Non-Vectorized Timing:
CPU times: user 306 ms, sys: 10.7 ms, total: 317 ms
Wall time: 310 ms


## Benchmarking With KNN

In [434]:
rand_train = np.random.randn(1000,10)
rand_test = np.random.randn(1000,10)

In [435]:
def knn_bf_introselect(train, test , distance, k=1):
    distance_matrix = distance(train, test)
    k_smallest_ind = np.argpartition(distance_matrix, k-1)[:,:k]
    
    smallest_k_matrix = np.zeros((test.shape[0], k), dtype=[("index",int), ("dist",float)])
    for i, ind_set in enumerate(k_smallest_ind):
        for j, element in enumerate(ind_set):
            smallest_k_matrix[i, j] = (element, distance_matrix[i, element])
    
    return smallest_k_matrix
    

print("KNN - Introselect k=1: \n" + str(knn_bf_introselect(train_data, test_data, euclidean_vect , 2)))

%timeit -r 15 result_1 = knn_bf_introselect(rand_train, rand_test, cosine_vect, 1);
%timeit -r 15 result_2 = knn_bf_introselect(rand_train, rand_test, cosine_vect, 3);
%timeit -r 15 result_3 = knn_bf_introselect(rand_train, rand_test, cosine_vect, 5);
%timeit -r 15 result_4 = knn_bf_introselect(rand_train, rand_test, cosine_vect, 7);

KNN - Introselect k=1: 
[[(2,  5.91607978) (1,  8.94427191)]
 [(2, 22.91287847) (1, 27.38612788)]]
16.6 ms ± 1.16 ms per loop (mean ± std. dev. of 15 runs, 100 loops each)
20.1 ms ± 348 µs per loop (mean ± std. dev. of 15 runs, 10 loops each)
30.2 ms ± 134 µs per loop (mean ± std. dev. of 15 runs, 10 loops each)
31.3 ms ± 230 µs per loop (mean ± std. dev. of 15 runs, 10 loops each)


In [436]:
import heapq

def knn_bf_max_heap(train, test, distance, k=1):
    distance_matrix = distance(train, test)
    
    smallest_k_matrix = np.zeros((test.shape[0], k), dtype=[("index",int), ("dist",float)])
    for index, row in enumerate(distance_matrix):
        heapq.heapify(row.tolist())
        smallest_k_matrix[index,:] = heapq.nsmallest(k, enumerate(row), key=lambda x: x[1])
        
    return smallest_k_matrix
    
    
print("KNN - Introselect k=1: \n" + str(knn_bf_max_heap(train_data, test_data, euclidean_vect , 2)))
%timeit -r 15 result_1 = knn_bf_introselect(rand_train, rand_test, cosine_vect, 1);
%timeit -r 15 result_2 = knn_bf_introselect(rand_train, rand_test, cosine_vect, 3);
%timeit -r 15 result_3 = knn_bf_introselect(rand_train, rand_test, cosine_vect, 5);
%timeit -r 15 result_4 = knn_bf_introselect(rand_train, rand_test, cosine_vect, 7);

KNN - Introselect k=1: 
[[(2,  5.91607978) (1,  8.94427191)]
 [(2, 22.91287847) (1, 27.38612788)]]
16.6 ms ± 1.39 ms per loop (mean ± std. dev. of 15 runs, 100 loops each)
20.7 ms ± 790 µs per loop (mean ± std. dev. of 15 runs, 10 loops each)
31.5 ms ± 2.6 ms per loop (mean ± std. dev. of 15 runs, 10 loops each)
32.2 ms ± 771 µs per loop (mean ± std. dev. of 15 runs, 10 loops each)
