In [2]:
import os
from pathlib import Path

from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# os.environ only accepts str values, so assigning os.getenv(...) directly
# raises TypeError when HF_TOKEN is not defined anywhere. Re-export the token
# only when it is actually available.
hf_token = os.getenv('HF_TOKEN')
if hf_token is not None:
    os.environ['HF_TOKEN'] = hf_token

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model wrapper that the later cells reuse for queries and documents.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed one sample sentence and report the length of the resulting vector.
text = "The quick brown fox jumps over the lazy dog."
embedding = hf_embeddings.embed_query(text)
print(f"Embedding for the text '{text}':", len(embedding), sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


Embedding for the text 'The quick brown fox jumps over the lazy dog.':
384


In [6]:
from sklearn.metrics.pairwise import cosine_similarity


# Candidate sentences to compare against the query.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast brown fox leaps over a lazy canine.",
    "An entirely different sentence that does not match.",
]
my_query = "A quick brown fox jumps over a lazy dog."

# Embed the query and the candidates with the same model so the vectors are comparable.
query_embedding = hf_embeddings.embed_query(my_query)
document_embeddings = hf_embeddings.embed_documents(documents)

# One query row against all documents; row 0 holds that query's scores.
cs = cosine_similarity([query_embedding], document_embeddings)
similarities = list(zip(documents, cs[0]))

print("\nSimilarities between the query and documents:")
for doc, sim in similarities:
    print(f"Document: {doc} | Similarity: {sim:.4f}")



Similarities between the query and documents:
Document: The quick brown fox jumps over the lazy dog. | Similarity: 0.9658
Document: A fast brown fox leaps over a lazy canine. | Similarity: 0.9579
Document: An entirely different sentence that does not match. | Similarity: -0.0021


In [7]:
from sklearn.metrics.pairwise import euclidean_distances

# Distance from the single query vector to every document vector (smaller = closer).
ed = euclidean_distances([query_embedding], document_embeddings)

print("\nEuclidean distances between the query and documents:")
for doc, dist in zip(documents, ed[0]):
    print(f"Document: {doc} | Distance: {dist:.4f}")


Euclidean distances between the query and documents:
Document: The quick brown fox jumps over the lazy dog. | Distance: 0.2617
Document: A fast brown fox leaps over a lazy canine. | Distance: 0.2902
Document: An entirely different sentence that does not match. | Distance: 1.4157


#### FAISS

In [19]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore


In [20]:
# Size the exact (brute-force) L2 index from the embedding model itself instead
# of hard-coding the width, so this cell keeps working if the model is swapped.
embedding_dim = len(hf_embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)  # 384 for all-MiniLM-L6-v2

In [21]:
# Wrap the still-empty FAISS index in a LangChain vector store.
FAISS_store = FAISS(
    embedding_function=hf_embeddings,  # turns raw texts and queries into vectors
    index=index,                       # FAISS index created in the previous cell
    docstore=InMemoryDocstore(),       # keeps the stored documents
    index_to_docstore_id={},           # starts empty; populated as entries are added
)

In [22]:
# Embed and store three raw strings; the call returns the ids generated for them.
FAISS_store.add_texts(
    [
        "AI is transforming the world.",
        "Machine learning is a subset of AI.",
        "This is a completely different sentence.",
    ]
)

['de23fe99-1091-47dd-abde-f665bc7b506b',
 'a22e72a7-9eef-4e8e-bbe1-06f8503bf319',
 'c89ac125-bd0e-4fb6-89b5-2c6ebcf13e5a']

In [23]:
# Inspect the mapping from FAISS row position to docstore id built by add_texts.
FAISS_store.index_to_docstore_id

{0: 'de23fe99-1091-47dd-abde-f665bc7b506b',
 1: 'a22e72a7-9eef-4e8e-bbe1-06f8503bf319',
 2: 'c89ac125-bd0e-4fb6-89b5-2c6ebcf13e5a'}

In [24]:
# Retrieve the two stored texts closest to the query.
FAISS_store.similarity_search("What is AI?", k=2)

[Document(id='de23fe99-1091-47dd-abde-f665bc7b506b', metadata={}, page_content='AI is transforming the world.'),
 Document(id='a22e72a7-9eef-4e8e-bbe1-06f8503bf319', metadata={}, page_content='Machine learning is a subset of AI.')]

# 🔍 Vector Index Types: Flat vs IVF vs HNSW

Vector databases use different indexing methods for similarity search. Here's a detailed comparison:

| Feature                     | Flat (Brute Force)                        | IVF (Inverted File Index)                             | HNSW (Hierarchical Navigable Small World)             |
|----------------------------|-------------------------------------------|--------------------------------------------------------|--------------------------------------------------------|
| 🔧 Index Type              | Exhaustive Search                         | Quantization + Clustering                             | Graph-Based Search                                     |
| ⚡ Speed                   | Slowest (linear scan)                     | Faster (search within clusters)                       | Fastest (logarithmic-like via graph traversal)        |
| 🎯 Accuracy               | 100% (Exact)                              | Approximate (depends on `nlist` & `nprobe`)           | Approximate but high-quality                          |
| 📦 Memory Usage           | High (stores all vectors)                 | Moderate (stores centroids + data)                   | Highest (stores all vectors + graph links)            |
| 📊 Data Size Range        | ✅ Small (≤ 10K vectors)                  | ✅ Medium (10K–1M vectors)                            | ✅ Large (100K–100M+ vectors)                         |
| 🧠 Use Case Suitability    | Small datasets, highest accuracy needed   | Medium datasets, balance of speed and accuracy        | Large-scale datasets, low-latency requirements        |
| 🏗️ Build Time             | None (no preprocessing)                   | Requires clustering (e.g., K-means)                   | Slower (graph construction)                           |
| 🔄 Insert/Delete Support   | Easy (no structure to update)             | Moderate (needs re-clustering or partial rebuild)     | Hard (graph needs rebalancing)                        |
| 🛠️ Tunable Parameters     | None                                      | `nlist`, `nprobe`                                     | `ef_construction`, `M`, `ef_search`                   |

## 🧪 Summary

- **Flat**: Best for exact search on small datasets.
- **IVF**: Good balance of speed vs accuracy using clustering.
- **HNSW**: Optimal for large-scale ANN with low latency.

> ✅ Tip: Start with Flat for prototyping, switch to IVF or HNSW for production-scale workloads.

# 🧠 FAISS Index Selection Guide by Dataset Size

Choosing the right FAISS index depends on the number of vectors, latency requirements, and available compute/memory. Below is a practical guide:

| Dataset Size       | Recommended Index Type(s)         | Description                                                                 | Accuracy      | Speed     | Tunable Params                  |
|--------------------|-----------------------------------|------------------------------------------------------------------------------|---------------|-----------|----------------------------------|
| 🔹 ≤ 10K vectors    | `IndexFlatL2`, `IndexFlatIP`      | Exact search. No indexing overhead. Ideal for small datasets or prototyping. | ✅ 100% exact | ❌ Slow    | None                             |
| 🔸 10K – 100K       | `IndexIVFFlat`, `IndexIVFPQ`      | IVF with flat or product quantization. Needs training (KMeans).              | ⚠️ Approx.    | ✅ Fast    | `nlist`, `nprobe`, `m`, `nbits` |
| 🟠 100K – 1M        | `IndexIVFPQ`, `IndexIVFSQ8`       | IVF + Product/SQ quantization. Great tradeoff between memory and speed.      | ⚠️ Approx.    | ✅ Fast    | `nlist`, `nprobe`, `m`          |
| 🔴 1M – 10M         | `IndexIVFPQ`, `IndexHNSWFlat`     | IVF-PQ for memory efficiency; HNSW for fast recall in high dimension.        | ⚠️ Approx.    | ⚡ Very fast | `efConstruction`, `M`           |
| 🔵 10M – 100M+      | `IndexHNSWFlat`, `IndexIVF_HNSW`  | HNSW variants scale well, retain high accuracy.                              | ⚠️ Approx.    | ⚡⚡ Ultra-fast | `efConstruction`, `efSearch`  |

## 🔧 Notes

- **IndexFlatL2**: Exact search with L2 distance; stores all vectors as-is.
- **IndexIVFFlat**: Clusters vectors into `nlist` partitions. Searches `nprobe` clusters.
- **IndexIVFPQ**: IVF + product quantization for memory savings.
- **IndexIVFSQ8**: IVF + 8-bit scalar quantization (FAISS class `IndexIVFScalarQuantizer`; factory string `IVF<nlist>,SQ8`).
- **IndexHNSWFlat**: Graph-based, excellent for low-latency ANN.
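- **IndexIVF_HNSW**: Combines IVF clustering with HNSW graph traversal. This is shorthand rather than a FAISS class name: it is an IVF index whose coarse quantizer is an HNSW graph, built with e.g. `faiss.index_factory(d, "IVF65536_HNSW32,Flat")`.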

> 🚀 Tip: Always train IVF-based indexes with representative training data using `.train()`.

## 🧪 Prototyping vs Production

- Start with `IndexFlat*` to evaluate performance.
- Scale to `IVFPQ` or `HNSWFlat` for millions of vectors.

In [25]:
from langchain_core.documents import Document

# (page_content, source, author) for each sample document, in doc1..doc8 order.
_doc_specs = [
    (
        "AI is the simulation of human intelligence in machines.",
        "doc1",
        "John Doe",
    ),
    (
        "Machine learning is a branch of AI that focuses on the development of algorithms that allow computers to learn from and make predictions based on data.",
        "doc2",
        "Jane Smith",
    ),
    (
        "Natural language processing is a field of AI that enables machines to understand and interpret human language.",
        "doc3",
        "Alice Johnson",
    ),
    (
        "Deep learning is a subset of machine learning that uses neural networks to model complex patterns in data.",
        "doc4",
        "Bob Brown",
    ),
    (
        "Reinforcement learning is an area of machine learning where an agent learns to make decisions by taking actions in an environment to maximize cumulative reward.",
        "doc5",
        "Charlie Davis",
    ),
    (
        "Computer vision is a field of AI that enables machines to interpret and understand visual information from the world.",
        "doc6",
        "Eve White",
    ),
    (
        "Robotics is an interdisciplinary field that integrates AI, machine learning, and engineering to design and build intelligent machines.",
        "doc7",
        "Frank Green",
    ),
    (
        "AI ethics is a field that examines the ethical implications and societal impact of AI technologies.",
        "doc8",
        "Grace Black",
    ),
]

# Build the Document objects; metadata carries the source tag and the author name.
docs = [
    Document(page_content=content, metadata={"source": source, "author": author})
    for content, source, author in _doc_specs
]

# Keep the individual names available alongside the list.
doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8 = docs

In [26]:
# Start a fresh, empty store for the metadata-carrying documents.
# The index width is read from the embedding model instead of being hard-coded,
# so the cell keeps working if a model with a different output size is used.
embedding_dim = len(hf_embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)  # 384 for all-MiniLM-L6-v2

FAISS_store = FAISS(
    index=index,
    embedding_function=hf_embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [27]:
# Embed and insert the eight sample documents; the call returns their generated ids.
FAISS_store.add_documents(docs)

['4291a834-63a6-453e-9ef3-16496b604d63',
 '225dcd1c-da2b-4ef8-a4f3-a6ef55b44e47',
 '5dff80a6-72d7-4576-817b-d51986661e75',
 'f198f259-563d-4eda-a60c-5d64137a7167',
 'af92574d-d68c-493e-b3ec-9c5c87d9b0be',
 'c2421d57-1676-411b-a691-da6e3fbc1d04',
 'e3e3b92d-b20f-449c-b037-35e97bd3a871',
 'cb1a5da9-f1ff-44a0-a9c3-6c8386aa74eb']

In [28]:
# Fetch the three nearest documents for the query; each hit carries its source/author metadata.
FAISS_store.similarity_search("What is AI?", k=3)

[Document(id='4291a834-63a6-453e-9ef3-16496b604d63', metadata={'source': 'doc1', 'author': 'John Doe'}, page_content='AI is the simulation of human intelligence in machines.'),
 Document(id='cb1a5da9-f1ff-44a0-a9c3-6c8386aa74eb', metadata={'source': 'doc8', 'author': 'Grace Black'}, page_content='AI ethics is a field that examines the ethical implications and societal impact of AI technologies.'),
 Document(id='225dcd1c-da2b-4ef8-a4f3-a6ef55b44e47', metadata={'source': 'doc2', 'author': 'Jane Smith'}, page_content='Machine learning is a branch of AI that focuses on the development of algorithms that allow computers to learn from and make predictions based on data.')]

In [34]:
# Same query, restricted to documents whose "author" metadata equals "John Doe".
FAISS_store.similarity_search("What is AI?", k=3, filter={"author": {"$eq": "John Doe"}})

[Document(id='4291a834-63a6-453e-9ef3-16496b604d63', metadata={'source': 'doc1', 'author': 'John Doe'}, page_content='AI is the simulation of human intelligence in machines.')]

In [36]:
# Expose the store through the retriever interface, returning the top 3 matches per query.
retriever = FAISS_store.as_retriever(search_kwargs={"k": 3})

In [38]:
# Run the query through the retriever; with k=3 it returns the three closest documents.
retriever.invoke("What is AI?")

[Document(id='4291a834-63a6-453e-9ef3-16496b604d63', metadata={'source': 'doc1', 'author': 'John Doe'}, page_content='AI is the simulation of human intelligence in machines.'),
 Document(id='cb1a5da9-f1ff-44a0-a9c3-6c8386aa74eb', metadata={'source': 'doc8', 'author': 'Grace Black'}, page_content='AI ethics is a field that examines the ethical implications and societal impact of AI technologies.'),
 Document(id='225dcd1c-da2b-4ef8-a4f3-a6ef55b44e47', metadata={'source': 'doc2', 'author': 'Jane Smith'}, page_content='Machine learning is a branch of AI that focuses on the development of algorithms that allow computers to learn from and make predictions based on data.')]

In [39]:
# Store the FAISS index to disk.
# The data folder defaults to the original location but can be redirected by
# setting the GENAI_DATA_DIR environment variable, so the notebook is not tied
# to one user's home directory.
DATA_DIR = Path(
    os.environ.get(
        "GENAI_DATA_DIR",
        "/Users/amarmandal/Documents/coding/GenAI_Course/data",
    )
)
FAISS_INDEX_DIR = DATA_DIR / "faiss_index"

FAISS_store.save_local(str(FAISS_INDEX_DIR))

In [41]:
# Load the FAISS index from disk.
# The data folder defaults to the original location but can be redirected by
# setting the GENAI_DATA_DIR environment variable.
DATA_DIR = Path(
    os.environ.get(
        "GENAI_DATA_DIR",
        "/Users/amarmandal/Documents/coding/GenAI_Course/data",
    )
)
FAISS_INDEX_DIR = DATA_DIR / "faiss_index"

# SECURITY: allow_dangerous_deserialization=True permits unpickling the saved
# docstore, and pickle can execute arbitrary code. Only load index folders that
# this notebook (or another trusted source) wrote.
FAISS_store_loaded = FAISS.load_local(
    str(FAISS_INDEX_DIR),
    hf_embeddings,
    allow_dangerous_deserialization=True,
)

# Query the reloaded store to confirm it returns the same top-3 results.
retriever_loaded = FAISS_store_loaded.as_retriever(search_kwargs={"k": 3})
retriever_loaded.invoke("What is AI?")

[Document(id='4291a834-63a6-453e-9ef3-16496b604d63', metadata={'source': 'doc1', 'author': 'John Doe'}, page_content='AI is the simulation of human intelligence in machines.'),
 Document(id='cb1a5da9-f1ff-44a0-a9c3-6c8386aa74eb', metadata={'source': 'doc8', 'author': 'Grace Black'}, page_content='AI ethics is a field that examines the ethical implications and societal impact of AI technologies.'),
 Document(id='225dcd1c-da2b-4ef8-a4f3-a6ef55b44e47', metadata={'source': 'doc2', 'author': 'Jane Smith'}, page_content='Machine learning is a branch of AI that focuses on the development of algorithms that allow computers to learn from and make predictions based on data.')]

### Exercise

In [42]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# The data folder defaults to the original location but can be redirected by
# setting the GENAI_DATA_DIR environment variable, so the notebook is not tied
# to one user's home directory.
DATA_DIR = Path(
    os.environ.get(
        "GENAI_DATA_DIR",
        "/Users/amarmandal/Documents/coding/GenAI_Course/data",
    )
)

# Read the course PDF; load() returns the parsed documents for the file.
loader = PyPDFLoader(str(DATA_DIR / "AgenticAI-v2.0.pdf"))
data = loader.load()

# Split into chunks of at most 500 characters, overlapping by 50, for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
final_documents = text_splitter.split_documents(data)
len(final_documents)

53

In [43]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Re-create the embedding model for the exercise (same model name as earlier cells).
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Read the index width from the model instead of hard-coding it, so the cell
# keeps working if a model with a different output size is used.
embedding_dim = len(hf_embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)  # 384 for all-MiniLM-L6-v2

# Fresh, empty store that will hold the PDF chunks.
FAISS_store = FAISS(
    index=index,
    embedding_function=hf_embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [44]:
# Embed and index every PDF chunk; the returned list holds the generated document ids.
FAISS_store.add_documents(final_documents)

['81922eef-b11d-4dc5-a986-3c6af54e5475',
 '0b53c416-bde5-44f5-9bec-221b1b359b06',
 '8d9d27e1-faae-4546-81c9-ef8fc52b0452',
 '1ad4b570-ef0a-4555-bd38-b9fd7f699c8f',
 'e83a9d70-0a86-407a-a03b-e41239aa6de6',
 'f7e1bfe4-1765-4100-b79f-681913f4ee4e',
 'a45d1a03-cc12-4ac2-9424-1d4123f924b1',
 'bd0b44ce-07ba-4fa2-a727-f948c4d8e5e2',
 'e1b971b9-806e-4c4c-9989-20402ba4ba6a',
 'f392ff94-d4fd-4a8b-94d3-8083c7607adb',
 '1063b4e8-6a0d-4080-9dce-081254abb2c2',
 '72c774f1-6214-433a-95d4-7759bc49ccaf',
 'd71707d3-663f-4660-9484-4ff7d066be40',
 '5945cec1-c042-4fc8-8286-731555456e1d',
 '793e5e84-7d63-4a7a-8d21-02b363a7ebd6',
 'ebf5c500-ff52-4213-a00e-27761998c716',
 'eb540a53-a353-45d1-9793-8ccfd8d18a18',
 '675bdf6d-c266-469e-bc9e-d73e4fb212cb',
 'c50e8a33-e370-4c4e-b7b3-f25cffdea54f',
 'a5b21de6-2620-495c-a47c-cd0f5072fdb2',
 'a2759068-b97c-488c-b72d-539fee49dccc',
 '57756842-352a-4b93-bae0-449214e1944e',
 '088da7cb-d03c-43bd-aeef-7ed17584d906',
 '32ed9559-4d56-4b8e-8390-b557d1e7d8d0',
 '1ede539a-784b-

In [46]:
# Build a top-3 retriever over the PDF chunks and ask a course-level question.
retriever = FAISS_store.as_retriever(search_kwargs={"k": 3})
retriever.invoke("What are we going to learn in this course?")

[Document(id='0b53c416-bde5-44f5-9bec-221b1b359b06', metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-04-03T18:36:52+00:00', 'title': 'AgenticAI-v2.0', 'moddate': '2025-04-03T18:36:40+00:00', 'keywords': 'DAGjmPTBGs4,BAEmsmap8Lg,0', 'author': 'monal singh', 'source': '/Users/amarmandal/Documents/coding/GenAI_Course/data/AgenticAI-v2.0.pdf', 'total_pages': 24, 'page': 1, 'page_label': '2'}, page_content='This course is designed for AI developers, machine learning engineers, data scientists, and\nsoftware engineers looking to build expertise in agentic AI, multi-agent systems, and AI-powered\nautomation. Whether you are new to AI agents or have experience in NLP and GenAI, this course\nwill equip you with the knowledge and hands-on skills required to develop, deploy, and manage AI\nagents at scale. By the end of the course, you will have a strong foundation in agentic AI'),
 Document(id='e83a9d70-0a86-407a-a03b-e41239aa6de6', metadata={'producer': 'Canva', 'creato