## FAISS Vector Storage

In [1]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

print(f"FAISS version: {faiss.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


FAISS version: 1.13.1


In [2]:
# Sample documents
documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks.",
    "Data visualization helps communicate insights from complex datasets.",
    "Cloud computing provides on-demand access to computing resources.",
    "Cybersecurity protects systems and networks from digital attacks.",
    "Blockchain technology enables secure, decentralized transactions.",
    "Quantum computing uses quantum mechanics to solve complex problems."
]

In [3]:
# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(documents)

print(f"Generated {len(embeddings)} embeddings")

Generated 10 embeddings


### FAISS Index

In [4]:
dimension = embeddings.shape[1]
#Create FAISS index (IndexFlatL2 = exact search with L2 distance)
index = faiss.IndexFlatL2(dimension)

index.add(embeddings)
print(f"Total vectors in index: {index.ntotal}")

Total vectors in index: 10


### Search with FAISS

In [5]:
query = "What is artificial intelligence and machine learning?"
query_embedding = model.encode([query])

k=3
distances, indices = index.search(query_embedding, k)

print(f"Query: {query}\n")
print(f"Top {k} results:\n")

for i, (idx, distance) in enumerate(zip(indices[0], distances[0]),1):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"    {documents[idx]}")
    print()



Query: What is artificial intelligence and machine learning?

Top 3 results:

1. (Distance: 0.9079)
    Deep learning is a subset of machine learning using multi-layered neural networks.

2. (Distance: 1.2202)
    Machine learning models require large amounts of training data to perform well.

3. (Distance: 1.2355)
    Natural language processing enables computers to understand human language.



### Using Cosine Similarity with FAISS

In [6]:
# Normalize embeddings for cosine similarity
embeddings_normalized = embeddings/np.linalg.norm(embeddings, axis=1, keepdims=True)

# Create index with inner product (equivalent to cosine for normalized vectors)
index_cosine = faiss.IndexFlatIP(dimension)
index_cosine.add(embeddings_normalized)

#Search with normalized query
query_embedding_normalized = query_embedding/np.linalg.norm(query_embedding)
scores, indices = index_cosine.search(query_embedding_normalized, k=3)

print(f"Query: {query}\n")
print(f"Top {k} results with cosine similarity:\n")

for i, (idx, score) in enumerate(zip(indices[0], scores[0]), 1):
    print(f"{i}. (Similarity: {score:.4f})")
    print(f"   {documents[idx]}")
    print()

Query: What is artificial intelligence and machine learning?

Top 3 results with cosine similarity:

1. (Similarity: 0.5460)
   Deep learning is a subset of machine learning using multi-layered neural networks.

2. (Similarity: 0.3899)
   Machine learning models require large amounts of training data to perform well.

3. (Similarity: 0.3823)
   Natural language processing enables computers to understand human language.



### Saving and Loading FAISS index

In [8]:
#Save index to disk
faiss.write_index(index_cosine, "my_faiss_index.bin")
print("Index saved to disk")

# Save documents separately (FAISS only stores vectors not text)
import pickle
with open("documents.pkl", "wb") as f:
    pickle.dump(documents, f)
print("Documents saved")

Index saved to disk
Documents saved


In [10]:
# Load index from disk
loaded_index = faiss.read_index("my_faiss_index.bin")
print(f"Index loaded: {loaded_index.ntotal} vectors")

# Load documents 
with open("documents.pkl", "rb") as f:
    loaded_documents = pickle.load(f)
print(f"Documents loaded: {len(loaded_documents)} documents")

Index loaded: 10 vectors
Documents loaded: 10 documents


## Chroma Vector Database

In [1]:
import chromadb
print(f"Chromadb version: {chromadb.__version__}")

Chromadb version: 1.3.7


In [2]:
#Create Chroma client
client = chromadb.PersistentClient(path="./chroma_db")

# Create or get collection
collection = client.get_or_create_collection(
    name="my_documents",
    metadata={"description": "Sample document collection"}

)
print(collection.name)
print(f"Current count: {collection.count()} documents")

my_documents
Current count: 0 documents


### Add Documents to Chroma

In [4]:

documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks."
]

# metadata for each document
metadatas = [
    {"category": "programming", "topic": "python"},
    {"category": "AI", "topic": "machine learning"},
    {"category": "AI", "topic": "neural networks"},
    {"category": "AI", "topic": "NLP"},
    {"category": "AI", "topic": "deep learning"}
]

#IDs for each document
ids = [f"doc_{i}" for i in range(len(documents))]

# Add to collection (Chroma handles embedding automatically!)
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids = ids
)

print(f"Total documents: {collection.count()}")

C:\Users\owner\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [23:16<00:00, 59.6kiB/s]  


Total documents: 5


### Query Chroma


In [6]:
results = collection.query(
    query_texts = ['What is artificial intelligence?'],
    n_results =3
)

print("Query: What is artificial intelligence?\n")
print("Top 3 results:\n")

for i, (doc, metadata, distance) in enumerate(zip(
    results['documents'][0], # we used [0] to extract the response for the first query because chromadb gives its response as a list of list, in order to be able to isolate the response for each query.
    results['metadatas'][0],
    results['distances'][0]
,
), 1):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"   Document: {doc}")
    print(f"   Metadata: {metadata}")
    print()

Query: What is artificial intelligence?

Top 3 results:

1. (Distance: 1.1505)
   Document: Deep learning is a subset of machine learning using multi-layered neural networks.
   Metadata: {'topic': 'deep learning', 'category': 'AI'}

2. (Distance: 1.2408)
   Document: Natural language processing enables computers to understand human language.
   Metadata: {'topic': 'NLP', 'category': 'AI'}

3. (Distance: 1.2560)
   Document: Neural networks are inspired by the structure of the human brain.
   Metadata: {'category': 'AI', 'topic': 'neural networks'}



In [7]:
results['documents']

[['Deep learning is a subset of machine learning using multi-layered neural networks.',
  'Natural language processing enables computers to understand human language.',
  'Neural networks are inspired by the structure of the human brain.']]

### Filtering with Metadata

In [8]:
results = collection.query(
    query_texts=["Tell me about AI"],
    n_results=3,
    where={"category": "AI"} # only returns documents in the AI category
)

print("Query: Tell me about AI (filtered by category='AI')\n")
print("Results:\n")

for i, (doc, metadata) in enumerate(zip(
    results['documents'][0],
    results['metadatas'][0]
),1):
    print(f"{i}. {doc}")
    print(f"    Categpry: {metadata["category"]}, Topic: {metadata["topic"]}")
    print()

Query: Tell me about AI (filtered by category='AI')

Results:

1. Deep learning is a subset of machine learning using multi-layered neural networks.
    Categpry: AI, Topic: deep learning

2. Natural language processing enables computers to understand human language.
    Categpry: AI, Topic: NLP

3. Neural networks are inspired by the structure of the human brain.
    Categpry: AI, Topic: neural networks



### Using Custom Embedding Function

In [9]:
from chromadb.utils import embedding_functions

#Use sentence-transformers embedding functions
sentence_trans_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name= "all-MiniLM-L6-v2"
)

# Create new collection with custom embedding function
collection_custom = client.get_or_create_collection(
    name = "custom_embeddings",
    embedding_function=sentence_trans_ef
)

# Add documents
collection_custom.add(
    documents = documents,
    metadatas=metadatas,
    id=ids
)

print(f"Collection with custom embeddings created")
print(f"Documnts: {collection_custom.count()}")

TypeError: Collection.add() got an unexpected keyword argument 'id'. Did you mean 'ids'?

In [None]:
#Query the collection
results = collection_custom.query(
    query_texts=["What is aritifical intelligenc?"],
    n_results=3,
    include=["embeddings", "documents", "metadatas", "distances"]
)

NameError: name 'collection_custom' is not defined

In [None]:
results

In [None]:
# Query the collection
results = collection_custom.query(
    query_texts=["What is artificial intelligence?"],
    n_results=3,
    include=["embeddings", "documents", "metadatas", "distances"]
)

print("Query: What is artificial intelligence?\n")
print("Top 3 results:\n")

for i, (doc, metadata, distance) in enumerate(zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0]
), 1):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"   Document: {doc}")
    print(f"   Metadata: {metadata}")
    print()

### Update and delete documents

In [None]:
# Update a document
collection.update(
    ids=["doc_0"],
    documents=["Python is an amazing programming language for AI and data science!"],
    metadatas=[{"category": "programming", "topic": "python", "updated": True}]
)
print("Document updated")

# #Delete a document
# collection.delete(ids=["doc_4"])
# print("Document deleted")

print(f"Total documents after updatae {collection.count()}")


## Building  a Complete RAG Retriever

### RAG Retriever with Chroma

In [10]:
import re

class RAGRetriever:
    def __init__(self, collection_name="rag_collection", persist_dir="./rag_db"):
        """
        Initialize RAG retriever with Chroma.
        """
        
        #Create chroma client
        self.client = chromadb.PersistentClient(path=persist_dir)
        #Create collection with sentence-transformers
        embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )

        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=embedding_fn
        )

        print(f"RAG Retriever initialized")
        print(f"Collection: {collection_name}")
        print(f"Current documents: {self.collection.count()}")
        print(f"Data persisted to: {persist_dir}/")

    def chunk_text(self, text, chunk_size=500):
        """
        Simple sentence-based chunking from Module 2.
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) > chunk_size and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk += " " + sentence if current_chunk else sentence

        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks
    
    def add_document(self, text, metadata=None, source_name="unknown"):
        """
        Add a document (chunks it automatically).
        """
        #Chunk the document
        chunks = self.chunk_text(text)
        # Prepare data for Chroma
        ids = [f"{source_name}_chunk_{i}" for i in range(len(chunks))]
        metadatas =[
            {
                "source": source_name,
                "chunk_index": i,
                "total_chunks": len(chunks),
                **(metadata or {}) # if metadat is None, it creates an empty dictionary instead of crashing
            }
            for i in range(len(chunks))
        ]

        #Add to collection
        self.collection.add(
            documents=chunks,
            metadatas=metadatas,
            ids=ids
        )
        print(f"Added document '{source_name}': {len(chunks)} chunks")
        return len(chunks)
    
    def retrieve(self, query, top_k=3, filter_metadata=None):
        """
        Retrieve relevant chunks for a query.
        """
        results= self.collection.query(
            query_texts=[query],
            n_results=top_k,
            where=filter_metadata
        )

        return{
            'documents': results['documents'][0],
            'metadatas': results['metadatas'][0],
            'distance': results['distances'][0]
        }
    
    def format_context(self, retrieved_results):
        """
        Format retrieved chunks for LLM prompt.
        """
        context = "Context from retrieved documents: \n\n"

        for i, (doc, metadata, distance) in enumerate(zip(
            retrieved_results['documents'],
            retrieved_results['metadatas'],
            retrieved_results['distances']
        ),1):
            source = metadata.get('source', 'unknown')
            context += f"[{i} ]From {source} (Relevance: {1/(1+distance):.3f}):\n"
            context += f"{doc}\n\n"
        return context
    
    print("RAG Retriever class defined")

RAG Retriever class defined
