In [32]:
from langchain_community.document_loaders import PyMuPDFLoader , DirectoryLoader
dir_load = DirectoryLoader(
    "../data/research_data",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader
)
pdf_docs = dir_load.load()
print(pdf_docs)


[Document(metadata={'producer': 'GPL Ghostscript 9.05', 'creator': '', 'creationdate': '2015-01-18T06:20:31-08:00', 'source': '..\\data\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'file_path': '..\\data\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'total_pages': 8, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2015-01-18T06:20:31-08:00', 'trapped': '', 'modDate': "D:20150118062031-08'00'", 'creationDate': "D:20150118062031-08'00'", 'page': 0}, page_content='A New Generation of the IMAGIC Image Processing System\nMARIN VAN HEEL, GEORGE HARAUZ,1 AND ELENA V. ORLOVA\nFritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany\nRALF SCHMIDT\nFritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany; and Image Science Software GmbH,\nMecklenburgische Strasse 27, D-14197 Berlin, Germany\nAND\nMICHAEL SCHATZ\nImage Science Software GmbH, Mecklenburgisch

### Splitting Into Chunks

In [2]:
# Text splitting get into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [3]:
chunks=split_documents(pdf_docs)
chunks

Split 459 documents into 1272 chunks

Example chunk:
Content: A New Generation of the IMAGIC Image Processing System
MARIN VAN HEEL, GEORGE HARAUZ,1 AND ELENA V. ORLOVA
Fritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany
RALF...
Metadata: {'producer': 'GPL Ghostscript 9.05', 'creator': '', 'creationdate': '2015-01-18T06:20:31-08:00', 'source': '..\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'file_path': '..\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'total_pages': 8, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2015-01-18T06:20:31-08:00', 'trapped': '', 'modDate': "D:20150118062031-08'00'", 'creationDate': "D:20150118062031-08'00'", 'page': 0}


[Document(metadata={'producer': 'GPL Ghostscript 9.05', 'creator': '', 'creationdate': '2015-01-18T06:20:31-08:00', 'source': '..\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'file_path': '..\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'total_pages': 8, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2015-01-18T06:20:31-08:00', 'trapped': '', 'modDate': "D:20150118062031-08'00'", 'creationDate': "D:20150118062031-08'00'", 'page': 0}, page_content='A New Generation of the IMAGIC Image Processing System\nMARIN VAN HEEL, GEORGE HARAUZ,1 AND ELENA V. ORLOVA\nFritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany\nRALF SCHMIDT\nFritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany; and Image Science Software GmbH,\nMecklenburgische Strasse 27, D-14197 Berlin, Germany\nAND\nMICHAEL SCHATZ\nImage Science Software GmbH, Mecklenburgische Strasse 27

### Embedding And VectorStore DB

In [4]:
import numpy as np
from sentence_transformers import SentenceTransformer # Embedding model inside this
import chromadb
from chromadb.config import Settings
import uuid # Every record in chromadb have some uuid generated from here
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity # Used when retrieval from chromadb

###

In [5]:
class EmbeddingManager:
    "handles Document embegging generation using SentenceTransformer"
    def __init__(self,model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
        model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading Embedding Model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")     
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def generate_embeddings(self , texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model Not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

# Initializing embedding Manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading Embedding Model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x2781598b020>

### VectorStore

In [6]:
import os
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/research_verctorstore"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x27817d7f1d0>

In [7]:
chunks

[Document(metadata={'producer': 'GPL Ghostscript 9.05', 'creator': '', 'creationdate': '2015-01-18T06:20:31-08:00', 'source': '..\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'file_path': '..\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'total_pages': 8, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2015-01-18T06:20:31-08:00', 'trapped': '', 'modDate': "D:20150118062031-08'00'", 'creationDate': "D:20150118062031-08'00'", 'page': 0}, page_content='A New Generation of the IMAGIC Image Processing System\nMARIN VAN HEEL, GEORGE HARAUZ,1 AND ELENA V. ORLOVA\nFritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany\nRALF SCHMIDT\nFritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany; and Image Science Software GmbH,\nMecklenburgische Strasse 27, D-14197 Berlin, Germany\nAND\nMICHAEL SCHATZ\nImage Science Software GmbH, Mecklenburgische Strasse 27

In [8]:
# Convert Chunks(text) to embeddings
texts = [doc.page_content for doc in chunks]
texts

['A New Generation of the IMAGIC Image Processing System\nMARIN VAN HEEL, GEORGE HARAUZ,1 AND ELENA V. ORLOVA\nFritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany\nRALF SCHMIDT\nFritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany; and Image Science Software GmbH,\nMecklenburgische Strasse 27, D-14197 Berlin, Germany\nAND\nMICHAEL SCHATZ\nImage Science Software GmbH, Mecklenburgische Strasse 27, D-14197 Berlin, Germany\nReceived May 15, 1995, and in revised form July 6, 1995\nOne of the aims of modern microscopy is to quan-\ntify two-, three-, or even four-dimensional phenom-\nena in biology, medicine, and material sciences. The\nrequirements imposed on software by such data\nprocessing are exempliﬁed by the design consider-\nations of the IMAGIC-5 software system. This sys-\ntem includes facilities for multivariate statistical\nanalysis of large data sets, for correlation averaging\nof two-dimensional crystals, a

In [9]:
embeddings = embedding_manager.generate_embeddings(texts)
embeddings

Generating embeddings for 1272 texts...


Batches: 100%|██████████| 40/40 [00:40<00:00,  1.02s/it]

Generated embeddings with shape: (1272, 384)





array([[-5.4576334e-02, -7.3611058e-02,  1.1846072e-02, ...,
        -9.1028526e-02, -6.5808319e-02, -4.3442845e-02],
       [-6.9167435e-02, -8.8813022e-02, -4.7093339e-02, ...,
         6.7880251e-03, -9.4867438e-02, -5.9643246e-02],
       [-1.1239682e-01, -5.7776168e-02, -6.6124171e-02, ...,
         9.7417782e-05, -3.1140307e-02, -1.1788783e-02],
       ...,
       [-7.6219484e-02, -2.7538482e-03, -8.9150801e-02, ...,
        -4.3679263e-02, -5.3289114e-03, -3.6201995e-02],
       [-2.5439484e-02, -2.1795679e-02, -3.4542613e-02, ...,
        -1.0117769e-02, -3.0124035e-02, -1.9246209e-02],
       [-7.3664434e-02,  1.9939264e-02, -5.6513309e-02, ...,
         1.2294224e-02, -3.7730470e-02, -7.9982188e-03]],
      shape=(1272, 384), dtype=float32)

In [10]:
print(embeddings)

[[-5.4576334e-02 -7.3611058e-02  1.1846072e-02 ... -9.1028526e-02
  -6.5808319e-02 -4.3442845e-02]
 [-6.9167435e-02 -8.8813022e-02 -4.7093339e-02 ...  6.7880251e-03
  -9.4867438e-02 -5.9643246e-02]
 [-1.1239682e-01 -5.7776168e-02 -6.6124171e-02 ...  9.7417782e-05
  -3.1140307e-02 -1.1788783e-02]
 ...
 [-7.6219484e-02 -2.7538482e-03 -8.9150801e-02 ... -4.3679263e-02
  -5.3289114e-03 -3.6201995e-02]
 [-2.5439484e-02 -2.1795679e-02 -3.4542613e-02 ... -1.0117769e-02
  -3.0124035e-02 -1.9246209e-02]
 [-7.3664434e-02  1.9939264e-02 -5.6513309e-02 ...  1.2294224e-02
  -3.7730470e-02 -7.9982188e-03]]


In [11]:
# Store embeddings to vectordb
vectorstore.add_documents(chunks,embeddings)

Adding 1272 documents to vector store...
Successfully added 1272 documents to vector store
Total documents in collection: 1272


### RAG Retrieval Pipeline

In [12]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [15]:
rag_retriever.retrieve("Formulla for Density Gradient Estimation")

Retrieving documents for query: 'Formulla for Density Gradient Estimation'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 66.69it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_e1ea67d5_863',
  'content': '278 \nBIOMETRIC AND BIOMEDICAL IMAGE PROCESSING \nkernel q5 and window radius (bandwidth) h, is defined as \n(12.8) \nAfter density estimation we identify candidate-clusters by using gradient \nascent (hill-climbing) to pinpoint local maxima of the density pn(x). Specif- \nically, the k-nearest neighbors of every point is determined, whereupon each \npoint is linked to the point of highest density among these neighbors (possi- \nbly itself). Upon iteration, this procedure ends up assigning each point to a \nnearby density-maximum, thus carving up the data set in compact and dense \nclumps. \n12.1 1.5.2 Density Gradient Estimation and Mean Shift Method Application of \nthe mean shift leads to the steepest ascent with a varying step size according to \nthe magnitude of the gradient 134, 351 Assuming that the probability density \nfunction p(x) of the pdimensional feature vectors x is multimodal and also \nassuming that a small sphere S, of radius

### Integrate VectorDB Context pipeline with LLM output

In [17]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")

llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.1-8b-instant",temperature=0.1,max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    ## retriever the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answwer using GROQ LLM
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response=llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [28]:
answer=rag_simple("What is particle filter algorithm in detail",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'What is particle filter algorithm in detail'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 58.83it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





The particle filter algorithm is a widely used technique for tracking objects in dynamic scenes. It is based on the concept of Monte Carlo methods and is particularly useful for tracking human objects. The algorithm involves the following steps:

**Step 0: Initialization Phase**

1. Choose the centroids of the human blobs obtained from the output of morphological transforms as the seed particles.
2. Assign initial attributes to each particle, such as position, velocity, size, chromatic and achromatic values, transparency, shape, and lifetime.

**Step 1: Resampling Phase**

1. Estimate the scale (s) from the blob and the weights.
2. Determine the new particle positions (xirn) using the scale (s).
3. Assign weights (wLm) to each particle based on the importance function.
4. Calculate the posterior density function (P(zk/Zk)).

**Step 2: Dynamics Phase**

1. Update the particle positions using the dynamics model, which describes how the object moves over time.

**Step 3: Weight Update Pha

### Enhanced RAG Pipeline

In [25]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output



In [31]:
# Example usage:
result = rag_advanced("What is Discrete Kalman Filter Algorithm", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'What is Discrete Kalman Filter Algorithm'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 111.11it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Answer: The Discrete Kalman Filter Algorithm is a method used to estimate the state of a process by using a form of feedback control, combining a priori estimates with noisy measurements. It involves two groups of equations: prediction and update. The algorithm recursively updates the estimate of the state and its error covariance, using the previous estimate and new input data.
Sources: [{'source': '..\\research_data\\Image_processing_principles_and_applicat.pdf', 'page': 338, 'score': 0.2808499336242676, 'preview': '316 \nDYNAMIC SCENE ANALYSIS; MOVING OBJECT DETECTION AND TRACKING \nIn formulating the Kalman filter equation, the objective is to determine \nthe equation relating a posteriori state estimate x k  as a h e a r  combination of \na priori estimate xi as \nThe term [ z k  - H X ~ / , Z ~ - ~ ]  \nis called...'}, {'source': '..\\research_data\\Image_processing_principles_and_applicat.pdf', 'page': 337, 'score': 0.23909306526184082, 'preview': 'DlSCRETE KALMAN FILTERING \n32