<a href="https://colab.research.google.com/github/Muntasir2179/vector-database-learning/blob/main/FAISS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FAISS

* An AI library developed by Facebook.
* A library for efficient similarity search.
* It has lots of index types for computing approximate nearest-neighbor vectors.
* It uses Euclidean (L2) distance as its distance measure.

https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/

https://github.com/facebookresearch/faiss/wiki

https://github.com/facebookresearch/faiss/wiki/Faiss-indexes


In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [2]:
# Efficient, high-dimensional indexing, GPU acceleration, versatility - Flat, IVF, PQ
import faiss
import numpy as np

In [7]:
# Generate reproducible random vectors for the demonstration.
dimension = 64       # number of components in every vector
num_vectors = 1000   # number of vectors that will be stored in the index

# A seeded generator makes a fresh "Restart & Run All" produce the same
# vectors, and therefore the same nearest neighbours, on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# Values are uniform in [0, 1); the cast keeps the float32 dtype the rest of
# the notebook works with.
query_vector = rng.random((1, dimension)).astype('float32')            # one query row
data_vectors = rng.random((num_vectors, dimension)).astype('float32')  # one row per stored vector

In [8]:
# Inspect the query: a single row holding `dimension` float32 values.
query_vector

array([[0.9932728 , 0.33787355, 0.33527374, 0.39188364, 0.5995351 ,
        0.28439292, 0.31221855, 0.7154759 , 0.25144863, 0.09005238,
        0.4547749 , 0.59130615, 0.17342636, 0.3503499 , 0.23633611,
        0.7707198 , 0.72179   , 0.19349201, 0.59803814, 0.21339674,
        0.01577327, 0.29388714, 0.94400555, 0.03979702, 0.01720465,
        0.93168986, 0.8199632 , 0.5674565 , 0.30094662, 0.09998444,
        0.8607662 , 0.4509885 , 0.5405522 , 0.4346473 , 0.5113422 ,
        0.9282156 , 0.22377856, 0.4692353 , 0.50481117, 0.78731906,
        0.91413915, 0.90727466, 0.5191103 , 0.2164343 , 0.5680139 ,
        0.47461796, 0.8368694 , 0.35883552, 0.7108311 , 0.27201176,
        0.4496254 , 0.25079197, 0.6736545 , 0.7261447 , 0.3398566 ,
        0.73620725, 0.02766731, 0.50112754, 0.35258806, 0.7725332 ,
        0.89405215, 0.03154451, 0.03621171, 0.54656637]], dtype=float32)

In [9]:
# len() of a 2-D array counts rows (axis 0), so this reports the number of
# query vectors (1), not the number of components in the vector.
len(query_vector)

1

In [10]:
# Preview the data matrix; NumPy abbreviates the middle rows and columns with "...".
data_vectors

array([[0.6334693 , 0.6409039 , 0.6956127 , ..., 0.32730928, 0.8262217 ,
        0.33912712],
       [0.05353097, 0.92386365, 0.65792835, ..., 0.7270941 , 0.04579284,
        0.26877815],
       [0.715858  , 0.5029775 , 0.9510248 , ..., 0.3194666 , 0.01511947,
        0.6320814 ],
       ...,
       [0.18523325, 0.84113055, 0.952441  , ..., 0.76215374, 0.20170404,
        0.3505453 ],
       [0.4456768 , 0.82598585, 0.14953357, ..., 0.70368105, 0.7090703 ,
        0.46619004],
       [0.6111498 , 0.2561166 , 0.4904543 , ..., 0.14536957, 0.75645936,
        0.01847457]], dtype=float32)

In [11]:
# Number of rows in the data matrix, i.e. how many vectors will be indexed.
len(data_vectors)

1000

In [12]:
# Create a simple 'flat' L2 index sized for `dimension`-component vectors.
# An index is the data structure that lets us run similarity searches efficiently.
# NOTE(review): per the FAISS wiki a flat index compares the query against every
# stored vector (exact search) — confirm against the linked documentation.
index = faiss.IndexFlatL2(dimension)
# Display the object's repr to confirm which index type was created.
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7b1e71560780> >

In [13]:
# The index holds the vectors: add all data vectors to it so they become
# searchable.
index.add(data_vectors)

In [15]:
# Perform a vector search for the single query row.
# `distance` and `indices` each hold one row per query with k entries; the
# later cells read them as distance[0][i] / indices[0][i].
# NOTE(review): the FAISS wiki states IndexFlatL2 reports *squared* L2
# distances — the printed magnitudes look consistent with that; confirm.
k = 5  # number of nearest neighbours to retrieve
distance, indices = index.search(query_vector, k)

In [16]:
# Report the query followed by its k closest matches (row 0 = the only query).
print("Query Vector:\n", query_vector)
print("\nNearest Neighbors:")
for neighbor_id, neighbor_distance in zip(indices[0][:k], distance[0][:k]):
    print(f"Index: {neighbor_id}, Distance: {neighbor_distance}")

Query Vector:
 [[0.9932728  0.33787355 0.33527374 0.39188364 0.5995351  0.28439292
  0.31221855 0.7154759  0.25144863 0.09005238 0.4547749  0.59130615
  0.17342636 0.3503499  0.23633611 0.7707198  0.72179    0.19349201
  0.59803814 0.21339674 0.01577327 0.29388714 0.94400555 0.03979702
  0.01720465 0.93168986 0.8199632  0.5674565  0.30094662 0.09998444
  0.8607662  0.4509885  0.5405522  0.4346473  0.5113422  0.9282156
  0.22377856 0.4692353  0.50481117 0.78731906 0.91413915 0.90727466
  0.5191103  0.2164343  0.5680139  0.47461796 0.8368694  0.35883552
  0.7108311  0.27201176 0.4496254  0.25079197 0.6736545  0.7261447
  0.3398566  0.73620725 0.02766731 0.50112754 0.35258806 0.7725332
  0.89405215 0.03154451 0.03621171 0.54656637]]

Nearest Neighbors:
Index: 353, Distance: 6.318154335021973
Index: 454, Distance: 6.662356376647949
Index: 461, Distance: 6.704074859619141
Index: 61, Distance: 6.747608184814453
Index: 521, Distance: 6.861794471740723


In [17]:
# Query vector whose 64 components are all 10.0, stored as one float32 row.
q_vector = np.full((1, 64), 10.0, dtype='float32')
q_vector

array([[10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.]],
      dtype=float32)

In [18]:
# Second experiment: vectors drawn from a normal distribution centred on 10
# (standard deviation 1), so they cluster around the all-10 query vector.
dim = 64          # number of components per vector
n_vectors = 1000  # number of vectors to generate

# Seeded generator so re-running the notebook yields the same data and hence
# the same search results.
normal_rng = np.random.default_rng(42)

# NOTE: this rebinds `data_vectors`, replacing the uniform data created earlier;
# every later cell that reads `data_vectors` sees this normal data.
data_vectors = normal_rng.normal(loc=10, scale=1, size=(n_vectors, dim)).astype('float32')

In [19]:
# Preview the regenerated data; the values now scatter around 10 instead of lying in [0, 1).
data_vectors

array([[ 9.531637 ,  9.962012 , 10.860139 , ..., 10.398864 , 10.066621 ,
        11.794895 ],
       [ 8.520374 ,  9.344363 ,  9.206383 , ..., 10.760792 , 11.253601 ,
         9.964624 ],
       [12.092402 , 10.210333 ,  9.572696 , ...,  9.476865 ,  8.029657 ,
        11.257387 ],
       ...,
       [ 9.516434 ,  9.4802265, 11.256768 , ...,  9.747739 , 10.38346  ,
        12.083225 ],
       [ 9.455781 ,  9.861315 ,  8.030725 , ...,  9.933279 , 12.311387 ,
        11.716488 ],
       [10.636586 , 11.185635 , 10.975823 , ...,  8.539745 , 11.396238 ,
        10.771271 ]], dtype=float32)

In [20]:
# Build a new, empty flat L2 index for the second experiment; this rebinds
# `index`, so the index filled with the uniform data is no longer referenced by that name.
index = faiss.IndexFlatL2(dim)

In [21]:
# Add the normally distributed vectors (the current `data_vectors`) to the new index.
index.add(data_vectors)

In [22]:
# Retrieve the k nearest stored vectors for the all-10 query `q_vector`.
# This overwrites the `distance` / `indices` results of the earlier search.
k = 5
distance, indices = index.search(q_vector, k)

In [25]:
# Show the raw search output as a (distances, indices) tuple; each array has
# one row for the single query and k columns.
distance, indices

(array([[34.110367, 36.836845, 37.05505 , 37.2392  , 38.14089 ]],
       dtype=float32),
 array([[259, 804, 623, 607, 950]]))

In [26]:
# Report the query, then each of the k matches together with the stored
# vector it refers to (row 0 of the results belongs to the only query).
print("Query Vector:\n", q_vector)
print("\nNearest Neighbors:")
for index_number, distance_value in zip(indices[0][:k], distance[0][:k]):
    # look the matched row up in the data that was added to the index
    actual_number = data_vectors[index_number]
    print(f"Index: {index_number}, Actual number: {actual_number}, Distance: {distance_value}")

Query Vector:
 [[10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10.
  10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10.
  10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10.
  10. 10. 10. 10. 10. 10. 10. 10. 10. 10.]]

Nearest Neighbors:
Index: 259, Actual number: [ 9.860562   9.967293   9.264588  11.488277  10.100862   9.557025
 11.406849  10.126077   9.265676   9.865199  10.665492   9.877952
 10.700755  10.887103  10.0175705 10.160958  10.267502  10.763585
  9.029133  10.228721  10.113755  11.155575  10.384308   9.777743
 10.366804  10.378546  10.190891  10.898963   8.5469475 10.171102
 10.404936  10.501387  10.605814   8.530529  11.547781   9.827949
 10.65582    9.446324   9.425369  10.021056   9.50651   10.529587
  9.399088   9.283451   9.479576  10.85793    9.699737   9.900868
 10.820981  10.116371  11.003421  10.439315  10.589415  10.387228
 11.567235   8.838104  10.179745  10.380811  11.911008   9.628653
  8.9770565 