In [10]:
### RAG pipeline- Data Ingestion to Vector DB Pipeline

In [11]:
import os
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## read all the pdfs inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents=[]
    pdf_dir=Path(pdf_directory)

    pdf_files=list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessig: {pdf_file.name}")
        try:
            loader=PyPDFLoader(str(pdf_file))
            documents=loader.load()

            #ADD SOURCE INFO TO METADATA{optional}
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = "pdf"

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        
        except Exception as e:
            print(f"Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents=process_all_pdfs("../data")

Found 4 PDF files to process

Processig: random_text_1.pdf
Loaded 2 pages

Processig: random_text_2.pdf
Loaded 2 pages

Processig: random_text_3.pdf
Loaded 5 pages

Processig: random_text_4.pdf
Loaded 2 pages

Total documents loaded: 11


In [None]:
all_pdf_documents

[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20260128104922', 'source': '..\\data\\pdfs\\random_text_1.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'random_text_1.pdf', 'file_type': 'pdf'}, page_content='The detective lit his last cigarette as the rain poured down... detective lit the last down... rain\ncigarette detective lit down... rain rain detective his as poured his last lit down... as last rain last\ndetective cigarette poured The detective last The the detective the his rain lit poured as the lit\npoured poured poured detective poured poured The poured poured the the cigarette last poured his\nthe his cigarette cigarette\nMagic had returned to the world, and with it, chaos... returned world, it, with the and to with to had\nwith returned the and with with it, world, Magic with it, had had Magic world, and with chaos... had it,\nworld, world, had to returned world, Magic returned and 

In [None]:
#Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG  performance"""
    textt_splitter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len, #tells it to measure text length using Python len.
        separators=["\n\n","\n"," ",""]
    )
    split_docs=textt_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks") #how many documents into how many chunks

    #show example of a chunk
    if split_docs:
        print(f"\nExample chunks:")
        print(f"Content: {split_docs[0].page_content[:200]}...") #first 200 character of the chunks
        print(f"MetaData: {split_docs[0].metadata}") #meta data attached to the chunk

    return split_docs

In [None]:
chunks=split_documents(all_pdf_documents)
chunks

Split 11 documents into 31 chunks

Example chunks:
Content: The detective lit his last cigarette as the rain poured down... detective lit the last down... rain
cigarette detective lit down... rain rain detective his as poured his last lit down... as last rain ...
MetaData: {'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20260128104922', 'source': '..\\data\\pdfs\\random_text_1.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'random_text_1.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20260128104922', 'source': '..\\data\\pdfs\\random_text_1.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'random_text_1.pdf', 'file_type': 'pdf'}, page_content='The detective lit his last cigarette as the rain poured down... detective lit the last down... rain\ncigarette detective lit down... rain rain detective his as poured his last lit down... as last rain last\ndetective cigarette poured The detective last The the detective the his rain lit poured as the lit\npoured poured poured detective poured poured The poured poured the the cigarette last poured his\nthe his cigarette cigarette\nMagic had returned to the world, and with it, chaos... returned world, it, with the and to with to had\nwith returned the and with with it, world, Magic with it, had had Magic world, and with chaos... had it,\nworld, world, had to returned world, Magic returned and 

In [None]:
## Embedding and vectostore db

In [None]:
import os
import sys

# Add PyTorch DLL directory to PATH
torch_lib_path = os.path.join(sys.prefix, 'Lib', 'site-packages', 'torch', 'lib')
if os.path.exists(torch_lib_path):
    os.add_dll_directory(torch_lib_path)
    # Also add to PATH for older Python versions
    os.environ['PATH'] = torch_lib_path + os.pathsep + os.environ['PATH']

# Now import
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class EmbeddingManager:
    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager

        Args:
        model_name:HuggingFace model name for sentence embedding
        """
        self.model_name=model_name
        self.model=None
        self._load_model()

    def _load_model(self): #_load_model is a protect function menation only accesible inside the class
        """Load the sentencetansformer model"""
        try:
            print(f"Loading Embedding model: {self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"Model loaded succesfully. Mebedding DImension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading the model {self.model_name}: {e}")
            raise

    def generate_embedding(self,texts:List[str]) -> np.ndarray:
        """
        generate embedding for a list of texts

        args:
            texsts: list of text strings to embed

        returns:
            numpy array of embedding with shape (len(texts), embedding_dim) 
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embedding for {len(texts)} texts....")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
#Initialize the embeddings manager
embedding_manager=EmbeddingManager()
embedding_manager

Loading Embedding model: all-MiniLM-L6-v2




Model loaded succesfully. Mebedding DImension: 384


<__main__.EmbeddingManager at 0x1aec3d10d90>

In [None]:
## VectorStore

In [None]:
class VectorStore:
    """Manages document embeddings in a chromaDB vector Store"""

    def __init__(self,collection_name:str="pdf_documents" ,persist_directory:str="../data/vector_store"): 
        #foldeer path
        #name for your storage box
        self.collection_name = collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            #Create Persistent ChromaDB client
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)#creating a clent which will have a refers to chromadb vector store 
            
            #Get or create collection
            self.collection=self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"Description": "PDF document embeddings for RAG"}
            )
            """
            1. self.client.get_or_create_collection(...)

            Asks ChromaDB: "Do you have a collection with this name?"
            If YES → Use the existing one
            If NO → Create a new one

            2. name=self.collection_name

            The name of the collection (e.g., "pdf_documents")

            3. metadata={"description": "PDF document embeddings for RAG"}

            Extra info about what this collection stores
            Like a label: "This box contains PDF embeddings for RAG"

            4. self.collection = ...

            Store the collection so you can use it later
            """
            print(f"Vector store initialized . Collection: {self.collection_name}")
            print(f"Exisiting documents in collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Existing initializing vector store: {e}")
            raise

    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        """
        Add documents to their embeddings to the vecotr sotre

        args:
            documents:kist of langchain documents
            embeddings:corresponding embeddings fpr thr documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match nunmber of embeddings")
        #Becoz documents or chunks should be equal to number of embeddings same as chunks
        print(f"Adding {len(documents)} to the vector store")

        #Prepare data for chromaDB
        ids=[]
        metadatas=[]
        documents_text=[]
        embeddings_list=[]

        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):
            # enumerate counts while you loop and zip make the embedding numer and chunk together in a zip file and 
            # i is the count and (doc,embedding) doc- one documents and embedding- the documents number

            """Generate Uniquie uiversal ID (uuid)"""
            doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}" #looks like doc_a3f5b2c1_0 ,,hex will converts(uuid) it to letters and numbers
            ids.append(doc_id)

            """Prepare metadata"""
            metadata=dict(doc.metadata) #the doc has laready some info so we copy it first
            metadata['doc_index'] = i # doc number like 0,1,2
            metadata['content_length']=len(doc.page_content)#len of text
            metadatas.append(metadata)

            """Document content"""
            documents_text.append(doc.page_content)

            """Embedding"""
            embeddings_list.append(embedding.tolist()) #chromadb wants regular list


        """ Add to collection"""
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")  # ← Indented!
            print(f"Total documents in collection: {self.collection.count()}")        # ← Indented!

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized . Collection: pdf_documents
Exisiting documents in collection: 0


<__main__.VectorStore at 0x1aec4285690>

In [None]:
chunks

[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20260128104922', 'source': '..\\data\\pdfs\\random_text_1.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'random_text_1.pdf', 'file_type': 'pdf'}, page_content='The detective lit his last cigarette as the rain poured down... detective lit the last down... rain\ncigarette detective lit down... rain rain detective his as poured his last lit down... as last rain last\ndetective cigarette poured The detective last The the detective the his rain lit poured as the lit\npoured poured poured detective poured poured The poured poured the the cigarette last poured his\nthe his cigarette cigarette\nMagic had returned to the world, and with it, chaos... returned world, it, with the and to with to had\nwith returned the and with with it, world, Magic with it, had had Magic world, and with chaos... had it,\nworld, world, had to returned world, Magic returned and 

In [None]:
#Converts text to embedding
texts=[doc.page_content for doc in chunks]
texts

#generate the embeddings
embeddings=embedding_manager.generate_embedding(texts)

#store into the vectorDB
vectorstore.add_documents(chunks,embeddings)

Generating embedding for 31 texts....


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]

Generated embeddings with shape: (31, 384)
Adding 31 to the vector store
Successfully added 31 documents to vector store
Total documents in collection: 93





In [None]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)