Implement a FAISS Search based on https://towardsdatascience.com/understanding-faiss-619bb6db2d1a & https://www.pinecone.io/learn/faiss-tutorial/ 

In [1]:
import numpy as np 
import pandas as pd
import faiss , sys

# Load the image records; each row's 'embedding' entry is converted to a vector below.
newImageList = pd.read_json("embeddings.json")

# Turn every embedding into a float32 row, then pack the rows into a single
# array (the FAISS index built from db_vectors works on float32 data).
db_vectors = np.array(
    [np.array(embedding, dtype="float32") for embedding in newImageList['embedding']]
)

# Sanity checks: container type, row type, and the first row's values.
first_vector = db_vectors[0]
print(type(db_vectors))
print(type(first_vector))
print(first_vector)

n = len(db_vectors)            # number of vectors
dimension = len(first_vector)  # dimensions of each vector

print(dimension, n)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.41342562 0.39609057 0.29425067 ... 0.10239422 0.22096485 0.5322781 ]
2048 12604


OK, so we've got a 2-D float32 array in memory (12604 vectors, 2048 dimensions each). Cool, let's index it.

In [2]:
nlist = 13  # number of clusters

# The flat L2 index is handed to the IVF index as its quantiser.
quantiser = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantiser, dimension, nlist, faiss.METRIC_L2)

# Train on the database vectors first, then add those same vectors to the index.
index.train(db_vectors)
index.add(db_vectors)

OK, that seems to have worked. Let's save the index to disk.

In [3]:
# Persist the populated index to disk.
faiss.write_index(index, "images_faiss.index")

Let's also try an example search:

In [15]:
import image_processor
import torch

# NOTE(review): `device` is not used anywhere in this cell. If the weights were
# saved from a GPU, torch.load may need map_location=device to work on a
# CPU-only machine — confirm before relying on this on other hardware.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = torch.load('model_evaluation/feature/best_weights.pt')

# Embed the query image, then shape it as a (1, d) float32 batch: FAISS
# searches take a 2-D float32 array of query vectors, hence ndmin=2.
searchEmbedding  = image_processor.getEmbedding(image='images/clean_image_data/0a1baaa8-4556-4e07-a486-599c05cce76c.jpg',model=model)
searchEmbedding = np.array(searchEmbedding,dtype="float32",ndmin=2)

# nprobe is a property of the IVF index itself (number of clusters visited per
# query, default 1). Assigning it only to a local variable has no effect on the
# search, so it must be written to the index before searching.
nprobe = 5
index.nprobe = nprobe

# Retrieve the 3 nearest neighbours: `distances` and `indices` each have one
# row per query vector and k columns.
distances, indices = index.search(x=searchEmbedding,k=3)


In [18]:

# FAISS ids are row positions (the order in which the vectors were added), so
# look them up positionally with .iloc rather than by index label. An id of -1
# means "no neighbour found" and is dropped before the lookup.
found_positions = indices[0][indices[0] >= 0]
print(newImageList["id"].iloc[found_positions] == "0a1baaa8-4556-4e07-a486-599c05cce76c")

6185     True
5682    False
5680    False
Name: id, dtype: bool
