In [1]:
import numpy as np
d = 64                           # dimension
nb = 100000                      # database size
nq = 10000                       # nb of queries
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.

In [2]:
xb

array([[1.91519454e-01, 6.22108757e-01, 4.37727749e-01, ...,
        6.24916732e-01, 4.78093803e-01, 1.95675179e-01],
       [3.83317441e-01, 5.38736843e-02, 4.51648414e-01, ...,
        1.51395261e-01, 3.35174650e-01, 6.57551765e-01],
       [7.53425434e-02, 5.50063960e-02, 3.23194802e-01, ...,
        3.44416976e-01, 6.40880406e-01, 1.26205325e-01],
       ...,
       [1.00811470e+02, 5.90245306e-01, 7.98893511e-01, ...,
        3.39859009e-01, 3.01949501e-01, 8.53854537e-01],
       [1.00669464e+02, 9.16068792e-01, 9.55078781e-01, ...,
        5.95364332e-01, 3.84918079e-02, 1.05637990e-01],
       [1.00855637e+02, 5.91134131e-01, 6.78907931e-01, ...,
        2.18976989e-01, 6.53015897e-02, 2.17538327e-01]], dtype=float32)

In [3]:
d

64

In [4]:
import faiss  

#faiss using product quantization for data compression and HNSW for coarse quantizer
class Faiss:
    def __init__(self):
        self.index = faiss.IndexFlatL2(d)   # build the index
        print(self.index.is_trained)
        self.index.add(xb)                  # add vectors to the index
        print(self.index.ntotal)
    
    def search(self,queries, k=4): # want to see k nearest neighbors
        D, I = self.index.search(xb[:5], k)
        return I,D



    
f = Faiss()
f.search(xb[:5])

True
100000


(array([[  0, 393, 363,  78],
        [  1, 555, 277, 364],
        [  2, 304, 101,  13],
        [  3, 173,  18, 182],
        [  4, 288, 370, 531]]),
 array([[0.       , 7.1751733, 7.2076297, 7.2511625],
        [0.       , 6.323565 , 6.684581 , 6.799946 ],
        [0.       , 5.7964087, 6.3917365, 7.2815123],
        [0.       , 7.2779055, 7.527987 , 7.6628466],
        [0.       , 6.7638035, 7.295121 , 7.368815 ]], dtype=float32))

In [6]:
d = 32  # data dimension
cs = 4  # code size (bytes)

# train set 
nt = 10000
xt = np.random.rand(nt, d).astype('float32')

# dataset to encode (could be same as train)
n = 20000
x = np.random.rand(n, d).astype('float32')

pq = faiss.ProductQuantizer(d, cs, 8)
pq.train(xt)

# encode 
codes = pq.compute_codes(x)

# decode
x2 = pq.decode(codes)

# compute reconstruction error
avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum()

codes

array([[231, 211,  27,  19],
       [  5, 229, 242,   2],
       [177, 187,  52, 246],
       ...,
       [109, 104,  33,  32],
       [202, 115,  58, 236],
       [209, 158, 225, 115]], dtype=uint8)

In [7]:

import faiss

# Creating the index.
index = faiss.IndexHNSWFlat(d, M)            
index.hnsw.efConstruction = 40         # Setting the value for efConstruction.
index.hnsw.efSearch = 16               # Setting the value for efSearch.

# Adding vectors to the index (xb are database vectors that are to be indexed).
index.add(xb)                  

# xq are query vectors, for which we need to search in xb to find the k nearest neighbors.
# The search returns D, the pairwise distances, and I, the indices of the nearest neighbors.
D, I = index.search(xq, k)    

AssertionError: 