### RAG Pipeline - Data Ingestion to vector DB piplline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


## Read all the pdf's inside the directory
def process_all_pdfs(pdf_direcotry):
    """Process all PDF files in the specified directory."""
    all_documents = []
    pdf_dir = Path(pdf_direcotry)

    #Find all PDF files in the directory
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files in the directory: {pdf_direcotry} to process.")

    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file.name}")

        try:
            # Load the PDF document
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            #Add source information to metadata
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = "pdf"

            
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages.")

        except Exception as e:
            print(f"Error processing file {pdf_file}: {e}")
    
    print(f"Total documents loaded from all PDFs: {len(all_documents)}")
    return all_documents

#Process all pdfs in data directory
all_pdf_documents = process_all_pdfs("../data")

  from .autonotebook import tqdm as notebook_tqdm


Found 1 PDF files in the directory: ../data to process.
Processing file: Project Proposal A-M-S.pdf
Loaded 16 pages.
Total documents loaded from all PDFs: 16


In [2]:
all_pdf_documents

[Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ashutosh Adhikari', 'moddate': '2026-01-31T21:58:09+05:45', 'source': '..\\data\\pdf\\Project Proposal A-M-S.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1', 'source_file': 'Project Proposal A-M-S.pdf', 'file_type': 'pdf'}, page_content='TRIBHUV AN UNIVERSITY \nInstitute of Science and Technology \n \n \nA Project Proposal \nOn \n"E-Voting System" \n \nSubmitted to \nDepartment of Statistics and Computer Science \nPatan Multiple Campus \n \nIn partial fulfillement of the requriments for Bachelor Degree in Computer \nscience and Information Technology \n \n \nSubmitted By: \nAshutosh Adhikari (79010020) \nManish Basnet (79010054) \nSnehal Sigdel (79010119) \n \nDate: \n1st Feb 2026'),
 Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ash

In [3]:
### Text splitting

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks.")

    #show example of a chunk
    if split_docs:
        print("\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:500]}.....")  # Print the first 500 characters of the first chunk
        print(f"Metadata: {split_docs[0].metadata}")
        
    print(f"Total chunks created: {len(split_docs)}")
    return split_docs

In [4]:
chunks = split_documents(all_pdf_documents)
chunks

Split 16 documents into 31 chunks.

Example chunk:
Content: TRIBHUV AN UNIVERSITY 
Institute of Science and Technology 
 
 
A Project Proposal 
On 
"E-Voting System" 
 
Submitted to 
Department of Statistics and Computer Science 
Patan Multiple Campus 
 
In partial fulfillement of the requriments for Bachelor Degree in Computer 
science and Information Technology 
 
 
Submitted By: 
Ashutosh Adhikari (79010020) 
Manish Basnet (79010054) 
Snehal Sigdel (79010119) 
 
Date: 
1st Feb 2026.....
Metadata: {'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ashutosh Adhikari', 'moddate': '2026-01-31T21:58:09+05:45', 'source': '..\\data\\pdf\\Project Proposal A-M-S.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1', 'source_file': 'Project Proposal A-M-S.pdf', 'file_type': 'pdf'}
Total chunks created: 31


[Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ashutosh Adhikari', 'moddate': '2026-01-31T21:58:09+05:45', 'source': '..\\data\\pdf\\Project Proposal A-M-S.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1', 'source_file': 'Project Proposal A-M-S.pdf', 'file_type': 'pdf'}, page_content='TRIBHUV AN UNIVERSITY \nInstitute of Science and Technology \n \n \nA Project Proposal \nOn \n"E-Voting System" \n \nSubmitted to \nDepartment of Statistics and Computer Science \nPatan Multiple Campus \n \nIn partial fulfillement of the requriments for Bachelor Degree in Computer \nscience and Information Technology \n \n \nSubmitted By: \nAshutosh Adhikari (79010020) \nManish Basnet (79010054) \nSnehal Sigdel (79010119) \n \nDate: \n1st Feb 2026'),
 Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ash

## Embeddings and VectorstoreDB

In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity



In [6]:
class EmbeddingManager:

    """Handles document embedding generation using SentenceTransformer models."""
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the embedding manager 
        Args:
            model_name: Hugging face model name for sentence embedding generation."""
        self.model_name = model_name
        self.model = None
        self.load_model()
        print(f"Initialized embedding model: {model_name}")

    def load_model(self):
        """Load the sentence transformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}...")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.
        Args:
            texts: List of strings to generate embeddings for.
        Returns:
            Numpy array of embeddings with shape (len(texts), embedding_dimension)."""
        
        if not self:
            raise ValueError("Model not loaded. Call load_model() before generating embeddings.")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print("Embeddings generated.")
        return embeddings
    
    # def get_embedding_dimension(self) -> int:
    #     """Get the dimension of the embeddings generated by the model."""
    #     if not self.model:
    #         raise ValueError("Model not loaded. Call load_model() before getting embedding dimension.")
    #     return self.model.get_sentence_embedding_dimension()

## Inirilizing the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 185.84it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension: 384
Initialized embedding model: all-MiniLM-L6-v2


<__main__.EmbeddingManager at 0x21420bfb050>

## Vector Store

In [7]:
import os
class vector_store:

    """Manages the ChromaDB vector store for document embeddings."""
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Initialize the vector store.
        Args:
            collection_name: Name of the ChromaDB collection to use.
            persist_directory: Directory where the vector store is persisted."""
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.initialize_vector_store()

    def initialize_vector_store(self):
        """Initialize the ChromaDB client and collection."""
        try:
            # Create the ChromaDB client
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create the collection
            self.collection = self.client.get_or_create_collection (
                name=self.collection_name,
                metadata = {
                    "description": "PDF documents embeddings for RAG"})
            
            print(f"Vector store initialized with collection: {self.collection_name} at {self.persist_directory}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:

            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any],embeddings: np.ndarray):
        """Add documents to the vector store.
        Args:
            documents: List of langchain documens
            embedings: Corresponding embeddings for the documents."""
        
        if len(documents) != len(embeddings):
            raise ValueError("The number of documents and embeddings must be the same.")
        
        print(f"Adding {len(documents)} documents to the vector store...")

        # Prepare data for chromadb
        ids=[]
        metdatas=[]
        documents_text =[]
        embedding_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"{uuid.uuid4().hex[:8]}_{i}"  # Generate a unique ID for each document
            ids.append(doc_id)

            #prepare metadata
            metadata= dict(doc.metadata)  # Copy existing metadata
            metadata['doc_index'] = i  # Add document index to metadata
            metadata['content_length'] = len(doc.page_content)  # Add content length to metadata
            metdatas.append(metadata)

            #Document content
            documents_text.append(doc.page_content)

            # Embeddings
            embedding_list.append(embedding.tolist())  # Convert numpy array to list for chromadb   
        #Add to collection

        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metdatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to the vector store.\nTotal documents in collection: {self.collection.count()}")   

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=vector_store()
vectorstore

Vector store initialized with collection: pdf_documents at ../data/vector_store
Existing documents in collection: 31


<__main__.vector_store at 0x21420ce78d0>

In [8]:
chunks

[Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ashutosh Adhikari', 'moddate': '2026-01-31T21:58:09+05:45', 'source': '..\\data\\pdf\\Project Proposal A-M-S.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1', 'source_file': 'Project Proposal A-M-S.pdf', 'file_type': 'pdf'}, page_content='TRIBHUV AN UNIVERSITY \nInstitute of Science and Technology \n \n \nA Project Proposal \nOn \n"E-Voting System" \n \nSubmitted to \nDepartment of Statistics and Computer Science \nPatan Multiple Campus \n \nIn partial fulfillement of the requriments for Bachelor Degree in Computer \nscience and Information Technology \n \n \nSubmitted By: \nAshutosh Adhikari (79010020) \nManish Basnet (79010054) \nSnehal Sigdel (79010119) \n \nDate: \n1st Feb 2026'),
 Document(metadata={'producer': 'Microsoft¬Æ Word 2019', 'creator': 'Microsoft¬Æ Word 2019', 'creationdate': '2026-01-31T21:58:09+05:45', 'author': 'Ash

In [9]:
# convert the text to embedddings
texts= [doc.page_content for doc in chunks]
texts    


['TRIBHUV AN UNIVERSITY \nInstitute of Science and Technology \n \n \nA Project Proposal \nOn \n"E-Voting System" \n \nSubmitted to \nDepartment of Statistics and Computer Science \nPatan Multiple Campus \n \nIn partial fulfillement of the requriments for Bachelor Degree in Computer \nscience and Information Technology \n \n \nSubmitted By: \nAshutosh Adhikari (79010020) \nManish Basnet (79010054) \nSnehal Sigdel (79010119) \n \nDate: \n1st Feb 2026',
 'ii \n \nTable of Contents \n1. Introduction .................................................................................................................. 1 \n2. Problem Statement ....................................................................................................... 1 \n3. Objectives .................................................................................................................... 1 \n4. Methodology ....................................................................................................

In [10]:

# convert the text to embedddings
texts= [doc.page_content for doc in chunks]

# Generate embedding for the chunks
embedding = embedding_manager.generate_embeddings(texts)

# Store in the vectore database
vectorstore.add_documents(chunks, embedding)



Generating embeddings for 31 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:06<00:00,  6.84s/it]


Embeddings generated.
Adding 31 documents to the vector store...
Successfully added 31 documents to the vector store.
Total documents in collection: 62


## Retriever pipeline from VectorStore

In [11]:
class RAGRetriever:

    """Retrieves relevant documents from the vector store based on a query."""
    def __init__(self, vector_store: vector_store, embedding_manager: EmbeddingManager):
        """Initialize the retriever with the vector store and embedding manager.
        Args:
            vector_store: Instance of the vector store to retrieve documents from.
            embedding_manager: Instance of the embedding manager to generate query embeddings."""
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5,score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """Retrieve relevant documents based on the query.
        Args:
            query: The input query string to search for relevant documents.
            top_k: The number of top relevant documents to retrieve.
            score_threshold: Minimum cosine similarity score to consider a document relevant.
        Returns:
            A list of dictionaries containing the retrieved documents and their metadata."""
        
        print(f"Retrieving documents for query: '{query}' with top_k={top_k}...")
        # print(f"Score threshold: {score_threshold}  ")

        # Generate embedding for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]  # Get the embedding vector

        # Retrieve all documents and their embeddings from the vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings =[query_embedding.tolist()],  # Convert numpy array to list for chromadb
                n_results=top_k  # Retrieve more results to apply score thresholding
            )
            #process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents= results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i,(doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                        
                    # Convert distance to similarity score (assuming distance is cosine distance)
                    similarity_score = 1 - distance  # Convert cosine distance to similarity score

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "content": document,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            "distance": distance,
                            "rank": i + 1  # Rank starts from 1
                        })
                print(f"Retrieved {len(retrieved_docs)} relevant documents after applying score threshold.")
            else:
                print("No documents retrieved from vector store.")
            return retrieved_docs
        except Exception as e:
            print(f"Error retrieving documents: {e}")
            return []

rag_retriever = RAGRetriever(vectorstore, embedding_manager)



        

In [12]:
rag_retriever

<__main__.RAGRetriever at 0x21420d44710>

In [13]:
rag_retriever.retrieve("Functional requirements")

Retrieving documents for query: 'Functional requirements' with top_k=5...
Generating embeddings for 1 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 22.78it/s]

Embeddings generated.
Retrieved 0 relevant documents after applying score threshold.





[]

## Integration Vectordb context pipeline with LLM Output

In [None]:
## Simple RAG pipeline with Groq LLM
from dotenv import load_dotenv
from langchain_groq import GroqChat
import os
load_dotenv()

## Initialize the Groq LLM (set your API key in the .env file)
groq_api_key = os.getenv("GROQ_API_KEY")

groq_llm = GroqChat(api_key=groq_api_key, model="gemma2-9b-it",temperature=0.1,max_tokens=1024)

#2. Create a simple RAG function
def rag_simple(query,retriever,llm,top_k=3):
    #Retrieve the documents
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant information found in the documents."
    
    #Generate the answer using groq llm
    prompt= f"""Use the following contxt to answer the question concisely.
            Context:
            {context}
            
            Question: {query}
            Answer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    return response.content


SyntaxError: incomplete input (2665914346.py, line 13)