# Faiss 示例

参考：

- [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started)
- [Faster search](https://github.com/facebookresearch/faiss/wiki/Faster-search)
- [Lower memory footprint](https://github.com/facebookresearch/faiss/wiki/Lower-memory-footprint)

## 准备数据

In [3]:
import numpy as np

# Problem size.
d = 64        # dimensionality of every vector
nb = 100000   # number of database vectors
nq = 10000    # number of query vectors

np.random.seed(1234)  # fixed seed: every run draws the same data

# Database vectors: uniform samples in [0, 1), stored as float32.
# Row i then gets i / 1000 added to its first component.
xb = np.random.random((nb, d)).astype(np.float32)
xb[:, 0] += np.arange(nb) / 1000.

# Query vectors: built the same way, drawn after the database so the
# random stream is consumed in a fixed order.
xq = np.random.random((nq, d)).astype(np.float32)
xq[:, 0] += np.arange(nq) / 1000.

# For each array print a label, the (numpy-abbreviated) contents, and the
# total element count (rows * columns).
print('xb', xb, xb.size, sep='\n')

print('xq', xq, xq.size, sep='\n')

xb
[[1.91519454e-01 6.22108757e-01 4.37727749e-01 ... 6.24916732e-01
  4.78093803e-01 1.95675179e-01]
 [3.83317441e-01 5.38736843e-02 4.51648414e-01 ... 1.51395261e-01
  3.35174650e-01 6.57551765e-01]
 [7.53425434e-02 5.50063960e-02 3.23194802e-01 ... 3.44416976e-01
  6.40880406e-01 1.26205325e-01]
 ...
 [1.00811470e+02 5.90245306e-01 7.98893511e-01 ... 3.39859009e-01
  3.01949501e-01 8.53854537e-01]
 [1.00669464e+02 9.16068792e-01 9.55078781e-01 ... 5.95364332e-01
  3.84918079e-02 1.05637990e-01]
 [1.00855637e+02 5.91134131e-01 6.78907931e-01 ... 2.18976989e-01
  6.53015897e-02 2.17538327e-01]]
6400000
xq
[[ 0.81432974  0.7409969   0.8915324  ...  0.72459674  0.893881
   0.6574571 ]
 [ 0.5844774   0.797842    0.74140453 ...  0.6768835   0.05907924
   0.6396156 ]
 [ 0.75040764  0.02659794  0.5495097  ...  0.69562465  0.16268532
   0.76653737]
 ...
 [10.96773     0.05037309  0.7342035  ...  0.89510185  0.6490696
   0.86151606]
 [10.831193    0.70606154  0.1922274  ...  0.8026039   0.685

## 构建索引和在索引中加入向量

In [4]:
%%time

import faiss

# Flat L2 index over d-dimensional vectors.
index = faiss.IndexFlatL2(d)
# No train() call is made for this index type; the recorded output shows
# is_trained is already True at this point.
print(index.is_trained)

# Store all database vectors, then report how many the index holds.
index.add(xb)
print(index.ntotal)

True
100000
CPU times: user 10.9 ms, sys: 4.64 ms, total: 15.6 ms
Wall time: 14.4 ms


## 搜索

In [5]:
%%time

k = 4  # number of nearest neighbours requested per query

# Sanity check: query with the first five database vectors. In the recorded
# output each of them finds itself first, at distance 0.
D, I = index.search(xb[:5], k)
print(I, '--', D, '--', sep='\n')

# Actual search over the full query set; show the neighbour ids of the
# first five and of the last five queries.
D, I = index.search(xq, k)
print(I[:5], '--', I[-5:], sep='\n')

[[  0 393 363  78]
 [  1 555 277 364]
 [  2 304 101  13]
 [  3 173  18 182]
 [  4 288 370 531]]
--
[[0.        7.1751738 7.20763   7.2511625]
 [0.        6.3235645 6.684581  6.799946 ]
 [0.        5.7964087 6.391736  7.2815123]
 [0.        7.2779055 7.5279875 7.662846 ]
 [0.        6.7638035 7.2951202 7.3688145]]
--
[[ 381  207  210  477]
 [ 526  911  142   72]
 [ 838  527 1290  425]
 [ 196  184  164  359]
 [ 526  377  120  425]]
--
[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]
CPU times: user 3.22 s, sys: 157 ms, total: 3.37 s
Wall time: 2.47 s


## 加快搜索

In [6]:
%%time

nlist = 100  # number of inverted lists passed to the IVF index

# A second, flat index is handed to the IVF index as its quantizer.
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

# Unlike the flat index, this one must be trained before vectors are added.
assert not index.is_trained
index.train(xb)
assert index.is_trained

index.add(xb)  # add may be a bit slower as well

k = 4  # number of nearest neighbours requested per query

# First pass with the default nprobe (1, per the Faiss docs): in the recorded
# output some of the last five rows differ from the IndexFlatL2 results.
D, I = index.search(xq, k)
print(I[-5:])

# Second pass probing 10 lists per query: the recorded last five rows match
# the IndexFlatL2 results shown earlier in this notebook.
index.nprobe = 10
D, I = index.search(xq, k)
print(I[-5:])

[[ 9900  9309  9810 10048]
 [11055 10895 10812 11321]
 [11353 10164  9787 10719]
 [10571 10664 10632 10203]
 [ 9628  9554  9582 10304]]
[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]
CPU times: user 1.23 s, sys: 42.5 ms, total: 1.27 s
Wall time: 729 ms


## 更低的内存占用

In [7]:
%%time

nlist = 100  # number of inverted lists, same as in the IVFFlat experiment
m = 8        # number of subquantizers
k = 4        # number of nearest neighbours requested per query

quantizer = faiss.IndexFlatL2(d)  # same kind of quantizer as before
# The trailing 8 specifies that each sub-vector is encoded as 8 bits.
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
index.train(xb)
index.add(xb)

# Sanity check with the first five database vectors (default nprobe).
# In the recorded output each vector still finds itself first, but the
# reported self-distance is no longer 0.
D, I = index.search(xb[:5], k)
print(I, D, sep='\n')

# Probe 10 lists so the result is comparable with the IVFFlat experiment,
# then show the neighbour ids of the last five queries.
index.nprobe = 10
D, I = index.search(xq, k)
print(I[-5:])

[[   0   78  424  608]
 [   1  555 1063  380]
 [   2  179  304   33]
 [   3  265  527  139]
 [   4  288  531  827]]
[[1.5948839 6.2905183 6.3059173 6.5697308]
 [1.282814  5.9320083 5.9512424 6.5773916]
 [1.698801  6.1244454 6.2004113 6.401427 ]
 [1.82021   6.721145  6.8056335 6.995647 ]
 [1.4857421 5.8346977 6.303232  6.400938 ]]
[[ 8746  9380 10914  9842]
 [10913 11373  9014 10507]
 [10719 11291 10600 11353]
 [11578  9671 10709 10664]
 [ 9229  9878  9905  9459]]
CPU times: user 3.59 s, sys: 74.6 ms, total: 3.67 s
Wall time: 2.02 s


## 示意性的最简单代码

In [11]:
%%time

import numpy as np
import faiss

# Data preparation: two database vectors and a single query vector.
xb = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]], dtype='float32')
nb, d = xb.shape  # number of database vectors (2) and vector dimension (5)
queries = np.array([[1, 2, 2, 4, 7]], dtype='float32')

# Build the index and add the database vectors.
index = faiss.IndexFlatL2(d)
index.add(xb)

# Run the query, asking for the single best match.
k = 1
D, I = index.search(queries, k)

# Print the result: row 0 of I belongs to the only query, column 0 is its
# best match.
print("查询结果：")
print("最匹配的向量索引：", I[0, 0])
print("最匹配的向量：", xb[I[0, 0]])

查询结果：
最匹配的向量索引： 0
最匹配的向量： [1. 2. 3. 4. 5.]
CPU times: user 485 µs, sys: 0 ns, total: 485 µs
Wall time: 469 µs
