In [1]:
import numpy as np
import faiss
from sklearn.neighbors import KDTree #KDTree for fast kNN search
from pytictoc import TicToc

Failed to load GPU Faiss: No module named swigfaiss_gpu
Faiss falling back to CPU-only.


In [2]:
# Stopwatch shared by the timing cells below: tic() starts a measurement and
# toc() prints the elapsed time.
# NOTE: the name `time` would shadow the standard-library `time` module if that
# module were ever imported in this notebook.
time = TicToc() # timer

In [38]:
def _ramped_uniform(n_rows, n_cols):
    """Return an (n_rows, n_cols) float32 array of uniform random samples.

    Row i additionally gets i / 1000 added to its first column, so the first
    coordinate drifts upwards with the row number. Samples are drawn from the
    global NumPy RNG, so the order of calls determines the values.
    """
    samples = np.random.random((n_rows, n_cols)).astype('float32')
    samples[:, 0] += np.arange(n_rows) / 1000.
    return samples


d, nb, nq = 64, 100000, 10000    # vector dimension, database size, number of queries
np.random.seed(1234)             # fixed seed so the data is reproducible
xb = _ramped_uniform(nb, d)      # database vectors (drawn first)
xq = _ramped_uniform(nq, d)      # query vectors (drawn second)

In [39]:
# Inspect the per-row offset that was added to the first column of xq
# (row i is shifted by i / 1000).
np.arange(nq) / 1000.

array([  0.00000000e+00,   1.00000000e-03,   2.00000000e-03, ...,
         9.99700000e+00,   9.99800000e+00,   9.99900000e+00])

In [40]:
# Brute-force (exhaustive) L2 example with multithreading.
# Only the construction of the empty index is timed here; vectors are added
# in the next cell.
time.tic()
index = faiss.IndexFlatL2(d) # build index with dim d
time.toc()
index.is_trained # shown as the cell output; True in the recorded run, i.e. no train() call was needed

Elapsed time is 0.000613 seconds.


True

In [41]:
# Add the database vectors to the flat index and time the insertion.
# NOTE(review): add() presumably appends to the index, so re-running this cell
# without re-creating `index` would likely store xb a second time - confirm.
time.tic()
index.add(xb)
time.toc()
# print(...) with a single argument behaves the same on Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(index.ntotal)  # number of vectors stored; 100000 in the recorded run

Elapsed time is 0.007320 seconds.
100000


In [42]:
k = 4  # number of nearest neighbors per query; also reused by the later query cells
time.tic()
# Exhaustive search: returns, for every query row, k distances and the ids of
# the matching database vectors.
dist, ide = index.search(xq, k)
time.toc()
# IndexFlatL2 reports squared L2 distances: in the recorded run these values are
# the squares of the distances printed by the KDTree query cell.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(dist)
print(ide)

Elapsed time is 1.650822 seconds.
[[ 6.81549835  6.88946533  7.39567947  7.42902565]
 [ 6.60411072  6.67969513  6.72096252  6.82868195]
 [ 6.47038651  6.85786057  7.00437927  7.03656387]
 ..., 
 [ 6.07269287  6.57675171  6.61395264  6.7322998 ]
 [ 6.63751221  6.64874268  6.85787964  7.00964355]
 [ 6.21836853  6.45251465  6.54876709  6.58129883]]
[[  381   207   210   477]
 [  526   911   142    72]
 [  838   527  1290   425]
 ..., 
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]


In [8]:
# Baseline: scikit-learn KDTree built over the same database vectors.
# KD tree is really slow: in the recorded run building it takes far longer
# than creating and filling the Faiss flat index (compare the elapsed times).
time.tic()
kd_tree = KDTree(xb)
time.toc()

Elapsed time is 4.207135 seconds.


In [9]:
# Query the KD tree for the k nearest database vectors of every query.
# Relies on `k` from the Faiss search cell; `dist` and `ide` are overwritten.
time.tic()
dist, ide = kd_tree.query(xq, k)
time.toc()
# In the recorded run these distances are the square roots of the values printed
# by the flat Faiss search, while the neighbor ids are identical.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(dist)
print(ide)

Elapsed time is 6.896273 seconds.
[[ 2.61065333  2.62477918  2.71949963  2.72562302]
 [ 2.56984655  2.58451163  2.59248252  2.61317433]
 [ 2.54369485  2.61875207  2.64657795  2.65265269]
 ..., 
 [ 2.46428394  2.56450564  2.5717739   2.59465104]
 [ 2.57630874  2.57852202  2.6187449   2.6475746 ]
 [ 2.49366122  2.54017325  2.55904871  2.56540661]]
[[  381   207   210   477]
 [  526   911   142    72]
 [  838   527  1290   425]
 ..., 
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]


In [43]:
# Approximate kNN --> search only in the corresponding bin
# The database is partitioned into `nlist` bins with the help of `quantizer`.
# NOTE(review): how many bins are visited per query is governed by the index's
# nprobe setting, which is left at its default here - confirm the default value.
nlist = 10
quantizer = faiss.IndexFlatL2(d)  # the other index
# `index` is rebound here: the flat index from the earlier cells is replaced.
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
# here we specify METRIC_L2 explicitly; the original note says the default is
# inner-product search. NOTE(review): the default metric depends on the Faiss
# version - confirm against the installed release.
time.tic()
index.train(xb)  # unlike the flat index, this index is trained before vectors are added
time.toc()

Elapsed time is 0.028498 seconds.


In [44]:
# First timer: add the database vectors to the trained IVF index.
time.tic()
index.add(xb)                  # add may be a bit slower as well
time.toc()
# Second timer: run the same k-NN query as before, reusing `k` from the flat
# search cell; `dist` and `ide` are overwritten with the IVF results.
time.tic()
dist, ide = index.search(xq, k)
time.toc()

Elapsed time is 0.024559 seconds.
Elapsed time is 0.495172 seconds.


In [45]:
# Neighbor ids returned by the IVF search; in the recorded run the displayed
# rows match those of the exhaustive search.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(ide)

[[  381   207   210   477]
 [  526   911   142    72]
 [  838   527  1290   425]
 ..., 
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]


In [None]:
# lossy compression in the model
