In [None]:
import os
import shutil
import urllib.request as request
from contextlib import closing

SIFT_URL = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'
SIFT_ARCHIVE = 'sift.tar.gz'

# first we download the Sift1M dataset -- exactly once.
# The archive used to be fetched twice (urllib and then `!wget` on the same
# URL); only the urllib download is kept. The download is skipped when the
# archive is already on disk so that "Restart & Run All" stays cheap.
if not os.path.exists(SIFT_ARCHIVE):
    # Stream into a side file first: an interrupted download must never be
    # mistaken for a complete archive on the next run.
    partial = SIFT_ARCHIVE + '.part'
    with closing(request.urlopen(SIFT_URL)) as r, open(partial, 'wb') as f:
        shutil.copyfileobj(r, f)
    os.replace(partial, SIFT_ARCHIVE)

In [2]:
import tarfile

# the download leaves us with a tar.gz file, we unzip it
# The context manager closes the archive handle as soon as extraction is done
# (it was previously left open for the lifetime of the kernel).
# NOTE(review): extractall() trusts the member paths stored in the archive;
# consider passing filter='data' on Python versions that support it.
with tarfile.open('sift.tar.gz', "r:gz") as tar:
    tar.extractall()
import numpy as np

# reader for the fvecs file format used by the Sift1M dataset
def read_fvecs(fp):
    """Load every vector stored in an fvecs file as a float32 matrix.

    The file is read as one flat int32 buffer whose first value is taken as
    the vector dimensionality ``d``. The buffer is then viewed as rows of
    ``d + 1`` values, the leading value of each row is dropped, and the
    remaining columns are reinterpreted bit-for-bit as float32.

    Parameters
    ----------
    fp : file name or open file object, as accepted by ``np.fromfile``.

    Returns
    -------
    numpy.ndarray
        float32 array with ``d`` columns and one row per stored vector.
    """
    raw = np.fromfile(fp, dtype=np.int32)
    dim = raw[0]
    records = raw.reshape(-1, dim + 1)
    # copy() detaches the payload from the raw buffer and makes it contiguous
    payload = records[:, 1:].copy()
    return payload.view(np.float32)

# base set: the vectors that get indexed (1M samples)
wb = read_fvecs('./sift/sift_base.fvecs')
# query set: keep only the first vector of sift_query.fvecs, shaped as a
# single-row 2-D batch (1, dim)
xq = read_fvecs('./sift/sift_query.fvecs')[0].reshape(1, -1)

In [37]:
import faiss

D = xq.shape[-1]  # vector dimensionality, read off the query batch
m, nbits = 8, 8  # second and third arguments handed to faiss.IndexPQ
k_ = 1 << nbits  # same value as 2**nbits
index = faiss.IndexPQ(D, m, nbits)
print(f"{D=}, {k_=}")

D=128, k_=256


In [7]:
# sanity check: (number of base vectors, vector dimensionality)
wb.shape

(1000000, 128)

In [38]:
# train the index on the base vectors, then add those same vectors to it
index.train(wb)
index.add(wb)

In [39]:
k = 100  # number of results requested for the query
# `I` is compared against the flat-L2 result ids further down
dist, I = index.search(xq, k)

In [10]:
%%timeit
# time one search of the single-query batch xq for k results
index.search(xq, k)

374 µs ± 125 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
# flat L2 index over the same base vectors; its results serve as the
# reference set the other indexes are compared against
l2_index = faiss.IndexFlatL2(D)
l2_index.add(wb)

In [14]:
# same query and k against the flat index; l2_I is the reference id set
l2_dist, l2_I = l2_index.search(xq, k)

In [40]:
# how many of the returned ids also appear among the flat-L2 result ids
# (vectorised membership test instead of a Python loop doing `in` on a 2-D array)
int(np.isin(I[0], l2_I).sum())

38

array([[3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38, 3.4028235e

In [None]:
import os

def get_memory(index):
    """Return the size in bytes of `index` once written to disk.

    The index is written with ``faiss.write_index`` to a uniquely named
    temporary file, the file size is read, and the file is removed again.

    A unique temporary file is used instead of a fixed ``./temp.index`` so an
    existing file of that name is never overwritten or deleted, and the
    ``finally`` clause guarantees the temporary file is removed even when
    writing or measuring fails.

    Parameters
    ----------
    index : object accepted by ``faiss.write_index``.

    Returns
    -------
    int
        Size of the written file in bytes.
    """
    import tempfile  # local import: only this helper needs it

    fd, path = tempfile.mkstemp(suffix='.index')
    os.close(fd)  # write_index is given the path and opens the file itself
    try:
        # write index to file
        faiss.write_index(index, path)
        # get file size
        return os.path.getsize(path)
    finally:
        # delete saved index
        os.remove(path)


In [None]:
# on-disk size in bytes of the flat L2 index
get_memory(l2_index)

In [None]:
# on-disk size in bytes of `index` (the IndexPQ built above when cells run top to bottom)
get_memory(index)

In [41]:
# IVF+PQ parameters
# nlist: how many Voronoi cells (must be >= k* which is 2**nbits)
# nbits: when using IVF+PQ, higher nbits values are not supported
nlist, nbits = 2048, 8

# flat L2 index handed to IndexIVFPQ as its first argument
vecs = faiss.IndexFlatL2(D)
index = faiss.IndexIVFPQ(vecs, D, nlist, m, nbits)
print(f"{2**nbits=}")  # k*, the lower bound nlist is checked against (not nlist itself)

2**nbits=256


In [42]:
# train the IVF+PQ index on the base vectors, then add those same vectors
index.train(wb)
index.add(wb)

In [31]:
# search with the index's current nprobe setting; `I` is compared with l2_I below
dist, I = index.search(xq, k)

In [32]:
%%timeit
# time one search of the single-query batch xq for k results
index.search(xq, k)

28.2 µs ± 238 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [33]:
# how many of the returned ids also appear among the flat-L2 result ids
# (vectorised membership test instead of a Python loop doing `in` on a 2-D array)
int(np.isin(I[0], l2_I).sum())

27

In [43]:
# nprobe set to 2048, the same value as nlist
index.nprobe = 2048
dist, I = index.search(xq, k)
# how many of the returned ids also appear among the flat-L2 result ids
# (vectorised membership test instead of a Python loop doing `in` on a 2-D array)
int(np.isin(I[0], l2_I).sum())

39

In [44]:
# nprobe lowered to 2 for comparison with the nprobe = 2048 run
index.nprobe = 2
dist, I = index.search(xq, k)
# how many of the returned ids also appear among the flat-L2 result ids
# (vectorised membership test instead of a Python loop doing `in` on a 2-D array)
int(np.isin(I[0], l2_I).sum())

27