In [2]:
pip install faiss-cpu



In [3]:
import numpy as np
import faiss
import time

In [4]:
# Generate 1 million vectors, each vector has 1000 dimensions
dimension = 1000
n_vectors = 1000000
xb = np.random.random((n_vectors, dimension)).astype('float32')

# Generate the query vectors
xq = np.random.random((10, dimension)).astype('float32')

In [5]:
## ANN search based on IndexIVFFlat
start = time.time()
quantizer = faiss.IndexFlatL2(dimension) ## similarity measurement during K-means
index_ivf = faiss.IndexIVFFlat(quantizer, dimension, 100) # You regard this as K-means parameters: similarity meansurement, vector's dimension, cluster group count
index_ivf.train(xb)
index_ivf.add(xb)
index_ivf.nprobe = 10 # During query, how many closest cluster will be used to search KNN
print("IVF Flat train time:", time.time() - start, "seconds")


IVF Flat train time: 17.957215309143066 seconds


In [6]:
# IVF search
startIVFquery = time.time()
D2, I2 = index_ivf.search(xq, 5)
print("Time (IVF):", time.time() - startIVFquery, "seconds")
avgNearestDistance = np.mean(D2[:, 0])
print("IVF query average similarity:", avgNearestDistance)

Time (IVF): 0.8736310005187988 seconds
IVF query average similarity: 140.00546


In [4]:
# FlatL2 Index
startL2Index = time.time()
index_flatL2 = faiss.IndexFlatL2(dimension)
index_flatL2.add(xb)
print("FlatL2 index time:", time.time() - startL2Index, "seconds")

FlatL2 index time: 3.560131788253784 seconds


In [5]:
# FlatL2 query
startL2Query = time.time()
D,I = index_flatL2.search(xq, k=10)
print("FlatL2 query time:", time.time() - startL2Query, "seconds")
avgNearestDistance = np.mean(D[:, 0])
print("FlatL2 query average similarity:", avgNearestDistance)

FlatL2 query time: 5.048878192901611 seconds
FlatL2 query average similarity: 140.28873
