In [2]:
import numpy as np
import faiss

下面我们构建了shape为[100000,64]的训练数据xb，和shape为[10000,64]的查询数据xq。
然后创建索引(Index)。faiss创建索引对向量预处理，提高查询效率。
faiss提供多种索引方法，这里选择最简单的暴力检索L2距离的索引：IndexFlatL2。
创建索引时必须指定向量的维度d。大部分索引需要训练的步骤。IndexFlatL2跳过这一步。
当索引创建好并训练(如果需要)之后，我们就可以执行add和search方法了。add方法一般添加训练时的样本，search就是寻找相似相似向量了。
一些索引可以保存整型的ID，每个向量可以指定一个ID，当查询相似向量时，会返回相似向量的ID及相似度(或距离)。如果不指定，将按照添加的顺序从0开始累加。其中IndexFlatL2不支持指定ID。

In [3]:
d = 64                           # dimension
nb = 100000                      # database size
nq = 10000                       # nb of queries
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.

xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000

In [4]:
print(xq.shape)
xq

(10000, 64)


array([[ 0.81432974,  0.7409969 ,  0.8915324 , ...,  0.72459674,
         0.893881  ,  0.6574571 ],
       [ 0.5844774 ,  0.797842  ,  0.74140453, ...,  0.6768835 ,
         0.05907924,  0.6396156 ],
       [ 0.75040764,  0.02659794,  0.5495097 , ...,  0.69562465,
         0.16268532,  0.76653737],
       ...,
       [10.96773   ,  0.05037309,  0.7342035 , ...,  0.89510185,
         0.6490696 ,  0.86151606],
       [10.831193  ,  0.70606154,  0.1922274 , ...,  0.8026039 ,
         0.6854174 ,  0.60209423],
       [10.078484  ,  0.39106598,  0.01359335, ...,  0.63193923,
         0.12561724,  0.78384215]], dtype=float32)

In [5]:
print(xb.shape)

(100000, 64)


In [6]:
index = faiss.IndexFlatL2(d)   # build the index
print(index.is_trained)
index.add(xb)                  # add vectors to the index
print(index.ntotal)

True
100000


In [7]:
k = 4                          # we want to see 4 nearest neighbors
D, I = index.search(xb[:5], k) # sanity check
print(I)
print(D)
D, I = index.search(xq, k)     # actual search
print(I[:5])                   # neighbors of the 5 first queries
print(I[-5:]) 

[[  0 393 363  78]
 [  1 555 277 364]
 [  2 304 101  13]
 [  3 173  18 182]
 [  4 288 370 531]]
[[0.        7.1751733 7.207629  7.2511625]
 [0.        6.3235645 6.684581  6.7999454]
 [0.        5.7964087 6.391736  7.2815123]
 [0.        7.2779055 7.5279865 7.6628466]
 [0.        6.7638035 7.2951202 7.3688145]]
[[ 381  207  210  477]
 [ 526  911  142   72]
 [ 838  527 1290  425]
 [ 196  184  164  359]
 [ 526  377  120  425]]
[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]


In [10]:
ncentroids = 1024
niter = 20
verbose = True
d = xb.shape[1]
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(xb)

493291.4375

In [11]:
D, I = kmeans.index.search(xb, 1)

In [16]:
index = faiss.IndexFlatL2 (d)
index.add (xb)
D, I = index.search (kmeans.centroids, 15)

In [17]:
D, I

(array([[3.4169922, 3.5654297, 3.6054688, ..., 4.1464844, 4.171875 ,
         4.1972656],
        [3.579834 , 3.8916016, 3.909668 , ..., 4.230957 , 4.246582 ,
         4.251953 ],
        [3.6281738, 3.7556152, 3.8186035, ..., 4.159424 , 4.198242 ,
         4.203125 ],
        ...,
        [3.552063 , 3.6102295, 3.7182007, ..., 4.144287 , 4.157837 ,
         4.1624756],
        [3.756836 , 3.8466797, 3.8769531, ..., 4.216797 , 4.2197266,
         4.270508 ],
        [3.4956055, 3.840332 , 3.8774414, ..., 4.119629 , 4.1450195,
         4.1572266]], dtype=float32),
 array([[78305, 79083, 78056, ..., 78392, 78483, 78937],
        [42086, 41925, 41587, ..., 42040, 41297, 41972],
        [36808, 36129, 35861, ..., 36347, 36215, 36148],
        ...,
        [18369, 18180, 18085, ..., 18735, 18391, 18274],
        [66867, 66755, 66855, ..., 66353, 66582, 67782],
        [50521, 51186, 51709, ..., 50421, 50347, 51024]]))

In [19]:
# random training data 
mt = np.random.rand(1000, 40).astype('float32')
mat = faiss.PCAMatrix (40, 10)
mat.train(mt)
assert mat.is_trained
tr = mat.apply_py(mt)
# print this to show that the magnitude of tr's columns is decreasing
print (tr ** 2)

[[1.0908752e-01 1.4267038e-01 9.2699803e-02 ... 5.7040593e-03
  1.1509654e-02 2.1818769e-03]
 [2.1679342e-01 9.7462781e-02 7.4876748e-02 ... 1.7625001e-01
  4.7052973e-03 3.4394869e-01]
 [2.9028225e-01 1.2606172e-02 2.6862552e-02 ... 1.7669318e-02
  4.6422832e-02 2.4922919e-01]
 ...
 [5.6083632e-01 1.6114089e-03 6.9299646e-02 ... 3.6375772e-03
  3.1397530e-01 1.0202462e-02]
 [7.3482081e-02 3.4825212e-01 3.6024296e-01 ... 1.2723800e-02
  1.9583536e-02 1.9091022e-01]
 [6.8052732e-03 9.7751677e-02 4.1428109e-05 ... 2.0175084e-01
  1.3215284e-02 2.9558161e-01]]


In [21]:
d = 32  # data dimension
cs = 4  # code size (bytes)

# train set 
nt = 10000
xt = np.random.rand(nt, d).astype('float32')

In [22]:
# dataset to encode (could be same as train)
n = 20000
x = np.random.rand(n, d).astype('float32')

In [23]:
pq = faiss.ProductQuantizer(d, cs, 8)
pq.train(xt)

In [28]:
# encode 
codes = pq.compute_codes(x)
codes

array([[171, 216, 254, 231],
       [124, 165, 198, 117],
       [242,  97, 142, 129],
       ...,
       [ 27,   8, 236,  47],
       [100,  20,  44, 204],
       [ 53,   2, 240,  91]], dtype=uint8)

In [29]:
# decode
x2 = pq.decode(codes)
x2

array([[0.63194066, 0.8661645 , 0.25449798, ..., 0.26036108, 0.24984792,
        0.20092787],
       [0.41300687, 0.84542674, 0.8701078 , ..., 0.7221007 , 0.33113563,
        0.25003377],
       [0.19998284, 0.2221175 , 0.7560032 , ..., 0.34117806, 0.21012492,
        0.75330764],
       ...,
       [0.14300309, 0.75912726, 0.28484973, ..., 0.32147613, 0.685157  ,
        0.79821724],
       [0.23903379, 0.3476817 , 0.1312695 , ..., 0.6657561 , 0.1298468 ,
        0.67381585],
       [0.2292362 , 0.69820523, 0.1895953 , ..., 0.5753735 , 0.7386811 ,
        0.28762567]], dtype=float32)

In [26]:
# compute reconstruction error
avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum()