In [2]:
import numpy as np
from scipy.cluster.vq import whiten, vq, kmeans, kmeans2

#### dataset train vectors with d = 6

In [46]:
data = np.array([[1, 2, 2, 1, 1, 2],
               [7, 8, 9, 10, 11, 12],
               [1, 4, 1, 2, 1, 3],
               [9, 11, 7, 6, 2, 1], 
               [7, 2, 1, 2, 3, 1], 
               [4, 6, 2, 5, 1, 4], 
               [2, 5, 7, 11, 1, 8],
               [4, 1, 1, 2, 6, 3],
               [3, 10, 2, 6, 1, 1],
               [1, 2, 1, 2, 3, 2],
               [8, 12, 1, 6, 10, 2],
               [12, 11, 8, 10, 11, 12] ])
data.shape

(12, 6)

#### query vector

In [54]:
query_x = np.array([[2, 1, 2, 1, 3, 1]])

#### correct neigbours for comparison

In [50]:
from sklearn.neighbors import NearestNeighbors
k = 5
true_classifier = NearestNeighbors(n_neighbors=k, metric='l2', algorithm='brute').fit(data)
corrects = true_classifier.kneighbors(query_x)

In [53]:
corrects[1] 

array([[9, 0, 7, 2, 4]])

#### 1. Divide each vector into subvector, whiten it and collect all together for clustering
 subvectors - 3 --> each subvector is 2-d 

In [60]:
n_subvectors = 3

In [61]:
all_subvectors = []
for x in xs:
    whiten_x = whiten(x)
    x_subvectors = np.split(whiten_x, n_subvectors)
    all_subvectors += x_subvectors

In [62]:
all_subvectors[:5]

[array([2., 4.]),
 array([4., 2.]),
 array([2., 4.]),
 array([4.09878031, 4.68432035]),
 array([5.26986039, 5.85540044])]

#### 2.Obtain centroids using kmeans
number of centroids - 6

In [65]:
n_centroids = 6

In [66]:
centroids = kmeans(all_subvectors, n_centroids)

In [67]:
centroids 

(array([[4.15480719, 3.19824928],
        [2.70486824, 1.08364405],
        [5.84388529, 6.71931657],
        [0.5977435 , 1.30841654],
        [1.63864775, 3.2278191 ],
        [8.36747906, 8.36747906]]),
 0.714438360568577)

 the second value is mean val of dists from closest point in each cluster

In [68]:
centroids = centroids[0]

#### 3. Assign to subvectors of each vector indices of centrods

In [69]:
quantized_xs = []
for x in xs:
    whiten_x = whiten(x)
    x_subvectors = np.array(np.split(whiten_x, 3))
    quantized_x = vq(x_subvectors, centroids)[0]
    quantized_xs.append(quantized_x)

In [70]:
quantized_xs

[array([4, 0, 4], dtype=int32),
 array([0, 2, 2], dtype=int32),
 array([4, 3, 4], dtype=int32),
 array([4, 1, 3], dtype=int32),
 array([1, 3, 3], dtype=int32),
 array([4, 4, 3], dtype=int32),
 array([3, 4, 3], dtype=int32),
 array([1, 3, 1], dtype=int32),
 array([4, 3, 3], dtype=int32),
 array([4, 4, 0], dtype=int32),
 array([4, 3, 1], dtype=int32),
 array([5, 2, 5], dtype=int32)]

#### 4. Build lookup tables of distances between all centrods
for later fast dist calculation with query vector

In [72]:
lookup_table = np.zeros(shape=(n_centroids,n_centroids))

In [73]:
from scipy.spatial import distance
for i, centroid1 in enumerate(centroids):
    for j, centroid2 in enumerate(centroids):
        lookup_table[i,j] = distance.euclidean(centroid1, centroid2)
        

In [74]:
lookup_table

array([[ 0.        ,  2.56395753,  3.90524003,  4.02792377,  2.51633318,
         6.66839867],
       [ 2.56395753,  0.        ,  6.45090945,  2.11907937,  2.39464252,
         9.22601831],
       [ 3.90524003,  6.45090945,  0.        ,  7.53656705,  5.46576411,
         3.01412759],
       [ 4.02792377,  2.11907937,  7.53656705,  0.        ,  2.18348068,
        10.49757851],
       [ 2.51633318,  2.39464252,  5.46576411,  2.18348068,  0.        ,
         8.46718816],
       [ 6.66839867,  9.22601831,  3.01412759, 10.49757851,  8.46718816,
         0.        ]])

#### 5. Proccess search of k neigbours for query vector

In [84]:
query_x = np.array([2, 1, 2, 1, 3, 1])

##### 5.1 Quantize query similiar to train vectors

In [86]:
whiten_query = whiten(query_x)
query_subvectors = np.split(whiten_query, n_subvectors)
quantized_query = vq(query_subvectors, centroids)[0]

In [87]:
quantized_query

array([1, 1, 1], dtype=int32)

#### Calculate distance beetween quantized query and dataset vectors using lookup table
we know all distances between centroids, so just get 2 indices and see in the table. 
<br>
it's Product Quantization --> all distances of subvectors are multiplied for getting vectors' distance

In [88]:
eps = 0.00001
dists=[]
for ind, quantized_x in enumerate(quantized_xs):
    dist = 1
    for i,j in zip(quantized_query,quantized_x):
        dist*= (lookup_table[i,j]+eps)
    dists.append((dist, ind))

In [89]:
nearest_inds = [ind for _, ind in sorted(dists, key=lambda x: x[0])[:k]]

#### result of PQ:

In [90]:
nearest_inds

[7, 4, 10, 3, 6]

#### result of true scikit learn knn Classifier: 

In [101]:
corrects[1] 

array([[9, 0, 7, 2, 4]])

In [102]:
nearests = [xs[i] for i in nearest_inds]

In [109]:
print(query_x)
print('-'*50)
nearests

[2 1 2 1 3 1]
--------------------------------------------------


[array([4, 1, 1, 2, 6, 3]),
 array([7, 2, 1, 2, 3, 1]),
 array([ 8, 12,  1,  6, 10,  2]),
 array([ 9, 11,  7,  6,  2,  1]),
 array([ 2,  5,  7, 11,  1,  8])]

<hr>

In [112]:
data

array([[ 1,  2,  2,  1,  1,  2],
       [ 7,  8,  9, 10, 11, 12],
       [ 1,  4,  1,  2,  1,  3],
       [ 9, 11,  7,  6,  2,  1],
       [ 7,  2,  1,  2,  3,  1],
       [ 4,  6,  2,  5,  1,  4],
       [ 2,  5,  7, 11,  1,  8],
       [ 4,  1,  1,  2,  6,  3],
       [ 3, 10,  2,  6,  1,  1],
       [ 1,  2,  1,  2,  3,  2],
       [ 8, 12,  1,  6, 10,  2],
       [12, 11,  8, 10, 11, 12]])