# What is FAISS?
FAISS (Facebook AI Similarity Search) is a library developed by Meta for efficient similarity search and clustering of dense vectors. It is widely used in NLP to quickly find the items that are most similar to a query.

In [1]:
# Install the dependencies
!pip install sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.4/31.4 MB 37.4 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
# Let's import the libraries
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [3]:
# Name of the pretrained sentence-embedding checkpoint to load
EMBED_MODEL_NAME = 'all-MiniLM-L6-v2'
# Instantiate the encoder used for both the documents and the queries
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Toy corpus: five short sentences that will be embedded and indexed.
# The list position of each sentence is the id FAISS reports for it.
documents = [
    "Generative AI can create images and text from prompts.",
    "Natural Language Processing helps computers understand human language.",
    "Transformers are a type of model architecture used in NLP.",
    "Energy companies leverage AI to optimize energy assets and predict failures.",
    "AI can help automate repetitive tasks and analyze large datasets.",
]

In [7]:
# Encode the whole corpus in one call; the result is a NumPy matrix
# with one row per document
doc_embeddings = embed_model.encode(documents, convert_to_numpy=True)
# Show the vectors first, then the matrix shape on its own line
print(doc_embeddings, doc_embeddings.shape, sep="\n")

[[-0.02315177 -0.01696142 -0.00582933 ...  0.11804847  0.05939246
  -0.01970866]
 [ 0.02748729  0.01165652  0.06819661 ...  0.10725056  0.03855791
  -0.04503505]
 [-0.09566518  0.00192362 -0.01839649 ...  0.07424638  0.08372862
   0.00289675]
 [ 0.02176006  0.09638759  0.01433729 ...  0.01874093  0.01154707
  -0.02024935]
 [-0.05195835 -0.0279589   0.01073991 ...  0.05778465 -0.02580435
  -0.06548047]]
(5, 384)


In [8]:
# Build the FAISS index over the document embeddings
# The second axis of the embedding matrix is the vector dimension
dimension = doc_embeddings.shape[1]
print('Dimension = ', dimension)
# IndexFlatL2 is a flat index: exact nearest-neighbour search ranked by L2 (Euclidean) distance
index = faiss.IndexFlatL2(dimension)
# Store every document vector in the index; row i of the matrix becomes id i
index.add(doc_embeddings)

Dimension =  384


In [9]:
def semantic_search(query, top_k=3):
    """Return the indexed documents closest to ``query`` in embedding space.

    The query is embedded with the notebook-level ``embed_model`` and looked
    up in the notebook-level FAISS ``index``; hits are mapped back to text
    through ``documents``.

    Args:
        query: The search text.
        top_k: Maximum number of results to return. Values larger than the
            number of indexed vectors are clamped; values <= 0 yield ``[]``.

    Returns:
        A list of ``{"document": str, "distance": float}`` dicts in the order
        FAISS returns them (nearest first). Per the FAISS documentation,
        ``IndexFlatL2`` reports *squared* L2 distances.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the
    # missing slots with id -1, and documents[-1] would silently return the
    # last document instead of failing.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # Convert query to embedding (a batch of one)
    query_emb = embed_model.encode([query], convert_to_numpy=True)

    # Search in FAISS index
    distances, indices = index.search(query_emb, k)

    # Pair each hit with its distance; skip any -1 padding defensively
    return [
        {"document": documents[idx], "distance": dist}
        for dist, idx in zip(distances[0], indices[0])
        if idx >= 0
    ]

In [10]:
# Run one example query through the search helper
query = "How AI is used in energy optimization?"
results = semantic_search(query)

# Report each hit as its text followed by its distance (4 decimal places)
print("Top results:")
for hit in results:
    text, dist = hit['document'], hit['distance']
    print(f"Document: {text}\nDistance: {dist:.4f}\n")

Top results:
Document: Shell uses AI to optimize energy assets and predict failures.
Distance: 0.8300

Document: AI can help automate repetitive tasks and analyze large datasets.
Distance: 1.0559

Document: Natural Language Processing helps computers understand human language.
Distance: 1.3535

