In [11]:
# Disable the Jupyter widgets notebook extension (widgetsnbextension) via the shell.
# NOTE(review): presumably done to avoid widget-rendering issues (e.g. progress bars
# shown while models download) — confirm this is still needed in your environment.
!jupyter nbextension disable --py widgetsnbextension

Disabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK


# What is FAISS?
FAISS (Facebook AI Similarity Search) is a library developed by Meta for efficient similarity search and clustering of dense vectors. It is widely used in NLP to quickly find the items that are most similar to a query.

In [12]:
# Install the dependencies
!pip install sentence-transformers faiss-cpu



In [13]:
# Let's import the libraries
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [14]:
# Load the sentence-embedding model once; the same instance is reused below to
# encode both the corpus and every search query.
embed_model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [15]:
# Example corpus: five short English sentences that act as the searchable documents.
# List positions double as document ids: embeddings are added to the FAISS index in
# this order, and search results are mapped back through documents[idx], so the
# list must not be reordered without rebuilding the index.
documents = [
    "Generative AI can create images and text from prompts.",
    "Natural Language Processing helps computers understand human language.",
    "Transformers are a type of model architecture used in NLP.",
    "Energy companies leverage AI to optimize energy assets and predict failures.",
    "AI can help automate repetitive tasks and analyze large datasets."
]

In [16]:
# Encode the whole corpus in a single call; the result is a NumPy matrix with one
# row per document.
doc_embeddings = embed_model.encode(documents, convert_to_numpy=True)

# Show the (NumPy-abbreviated) matrix followed by its shape, one per line.
print(doc_embeddings, doc_embeddings.shape, sep="\n")

[[-0.02315177 -0.01696142 -0.00582933 ...  0.11804847  0.05939246
  -0.01970866]
 [ 0.02748729  0.01165652  0.06819661 ...  0.10725056  0.03855791
  -0.04503505]
 [-0.09566518  0.00192362 -0.01839649 ...  0.07424638  0.08372862
   0.00289675]
 [-0.03533467  0.05540608  0.05584577 ...  0.03470255 -0.00045385
  -0.03001654]
 [-0.05195835 -0.0279589   0.01073991 ...  0.05778465 -0.02580435
  -0.06548047]]
(5, 384)


In [17]:
# Build the FAISS index.
# The index has to be created with the embedding width, i.e. the number of columns
# of the embedding matrix.
dimension = doc_embeddings.shape[1]
print('Dimension = ', dimension)

# IndexFlatL2 is a brute-force index: it performs exact nearest-neighbour search
# and ranks by L2 distance. The distances it reports are *squared* Euclidean
# distances (FAISS skips the square root).
index = faiss.IndexFlatL2(dimension)

# Vectors receive sequential ids in insertion order, so id i corresponds to documents[i].
index.add(doc_embeddings)

Dimension =  384


In [18]:
def semantic_search(query, top_k=3):
    """Return the documents closest to ``query`` in embedding space.

    The query is embedded with the module-level ``embed_model`` and looked up in
    the module-level FAISS ``index``; hits are mapped back to ``documents`` by
    position.

    Args:
        query: Text to search for.
        top_k: Maximum number of results to return. Values larger than the
            number of indexed vectors are clamped to that number.

    Returns:
        A list of ``{"document": ..., "distance": ...}`` dicts in the order
        FAISS returns them (nearest first). ``distance`` is the value reported
        by the index. The list is empty when the index holds no vectors or
        ``top_k`` is not positive.
    """
    # When asked for more neighbours than it holds, FAISS pads the result with
    # index -1. Python would silently read documents[-1] (the last document),
    # so never request more hits than the index contains.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # encode() takes a batch, so wrap the single query in a list; the result
    # has one row, which is why row 0 is read below.
    query_emb = embed_model.encode([query], convert_to_numpy=True)

    distances, indices = index.search(query_emb, k)

    return [
        {"document": documents[idx], "distance": dist}
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0  # defensive: skip any "no result" padding slot
    ]

In [19]:
# Run one example query against the index (default top_k).
query = "How AI is used in energy optimization?"
results = semantic_search(query)

# Assemble the report first, then emit it in one print call; each entry ends
# with a newline so that joining on "\n" leaves a blank line between hits.
report = ["Top results:"]
for r in results:
    report.append(f"Document: {r['document']}\nDistance: {r['distance']:.4f}\n")
print("\n".join(report))

Top results:
Document: Energy companies leverage AI to optimize energy assets and predict failures.
Distance: 0.5971

Document: AI can help automate repetitive tasks and analyze large datasets.
Distance: 1.0559

Document: Natural Language Processing helps computers understand human language.
Distance: 1.3535

