RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [1]:
import os 
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter  
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read all the PDFs inside a directory
def process_all_pdfs(pdf_directory):
    all_documents = []
    pdf_dir = Path(pdf_directory)

    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()
            
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages from {pdf_file.name}")
        except Exception as e:
            print(f"Error processing {pdf_file.name}: {e}")
    print(f"Total documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs('../data')

Found 3 PDF files to process.
Processing file: attention.pdf
Loaded 11 pages from attention.pdf
Processing file: embeddings.pdf
Loaded 83 pages from embeddings.pdf
Processing file: object_detection.pdf
Loaded 88 pages from object_detection.pdf
Total documents loaded: 182


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': '..\\data\\PDFs\\attention.pdf', 'file_path': '..\\data\\PDFs\\attention.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0, 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kai

In [4]:
# Text splitting into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks.")

    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

In [5]:
chunks = split_documents(all_pdf_documents)
chunks

Split 182 documents into 371 chunks.

Example chunk:
Content: Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz...
Metadata: {'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': '..\\data\\PDFs\\attention.pdf', 'file_path': '..\\data\\PDFs\\attention.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0, 'source_file': 'attention.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': '..\\data\\PDFs\\attention.pdf', 'file_path': '..\\data\\PDFs\\attention.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0, 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kai

Embeddings and Vectorstore DB

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
class EmbeddingStore:
    def __init__(self, embedding_model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the embedding store with a specified embedding model and ChromaDB client.
        
        Args:
            model_name: HuggingFace model name for sentence embeddings.
        """
        self.model_name = embedding_model_name
        self.model = None
        self._load_model() 
    
    def _load_model(self):
        """Load the sentence transformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e
        
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.
        
        Args:
            texts: List of strings to embed.

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dimension).
        """
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        
        print(f"Generating embeddings for {len(texts)} texts.")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    def get_embedding_dimension(self) -> int:
        """Get the dimension of the embeddings produced by the model."""
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        return self.model.get_sentence_embedding_dimension()
    
# Initialize the embedding store
embedding_store = EmbeddingStore()
embedding_store
        

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingStore at 0x1feb203c830>

VectoreStore

In [9]:
class VectorStore:
    """Manages storage and retrieval of document embeddings using ChromaDB."""
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store with ChromaDB client and collection.
        
        Args:
            collection_name: Name of the ChromaDB collection.
            persist_directory: Directory to persist the ChromaDB data.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize the ChromaDB client and collection."""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Collection of PDF document embeddings"}
            )
            print(f"ChromaDB client initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing ChromaDB client: {e}")
            raise e
        
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.
        
        Args:
            documents: List of document objects with 'page_content' and 'metadata'.
            embeddings: numpy array of embeddings corresponding to the documents.
        """
        if len(documents) != embeddings.shape[0]:
            raise ValueError("Number of documents and embeddings must match.")
        
        print(f"Adding {len(documents)} documents to the vector store.")

        # Prepare data for insertion in ChromaDB
        ids = []
        metadatas = []
        document_texts = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate a unique ID for each document
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            document_texts.append(doc.page_content)
            embeddings_list.append(embedding.tolist())
        
        # Add to ChromaDB collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=document_texts
            )
            print(f"Successfully added {len(documents)} documents to the vector store.")
            print(f"Total documents in collection after addition: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise e
        
# Initialize the vector store
vector_store = VectorStore()
vector_store

ChromaDB client initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x1feb39baa50>

In [10]:
chunks

[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': '..\\data\\PDFs\\attention.pdf', 'file_path': '..\\data\\PDFs\\attention.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0, 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kai

In [13]:
# converts the chunks to texts for embedding generation
texts = [chunk.page_content for chunk in chunks]

# Generate embeddings for the document chunks
embeddings = embedding_store.generate_embeddings(texts)

# Add the chunks and their embeddings to the vector store
vector_store.add_documents(chunks, embeddings)

Generating embeddings for 371 texts.


Batches: 100%|██████████| 12/12 [00:32<00:00,  2.73s/it]


Generated embeddings with shape: (371, 384)
Adding 371 documents to the vector store.
Successfully added 371 documents to the vector store.
Total documents in collection after addition: 371


Retriever Pipeline From VectorStore

In [24]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store."""

    def __init__(self, vector_store: VectorStore, embedding_store: EmbeddingStore):
        """
        Initialize the retriever

        Args:
            vector_store: vector store containing document embeddings
            embedding_store: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_store = embedding_store

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_store.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
    

rag_retriever = RAGRetriever(vector_store=vector_store, embedding_store=embedding_store)

In [25]:
rag_retriever

<__main__.RAGRetriever at 0x1feb3d0e660>

In [29]:
rag_retriever.retrieve("What is embeddings ?")

Retrieving documents for query: 'What is embeddings ?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 27.09it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_8fca8bb3_56',
  'content': 'What do embeddings actually look like? Here is one single embedding,\nalso called a vector, in three dimensions. We can think of this as a repre-\nsentation of a single element in our dataset. For example, this hypothetical\nembedding represents a single word "fly", in three dimensions. Generally, we\nrepresent individual embeddings as row vectors.\n\x02\n1\n4\n9\n\x03\n(1)\nAnd here is a tensor, also known as a matrix3, which is a multidimensional\ncombination of vector representations of multiple elements. For example, this\ncould be the representation of "fly", and "bird."\n\x141\n4\n9\n4\n5\n6\n\x15\n(2)\nThese embeddings are the output of the process of learning embeddings,\nwhich we do by passing raw input data into a machine learning model. We\ntransform that multidimensional input data by compressing it, through the\nalgorithms we discuss in this paper, into a lower-dimensional space. The\nresult is a set of vectors in an embedding space

In [30]:
rag_retriever.retrieve("Position-wise Feed-Forward Networks")

Retrieving documents for query: 'Position-wise Feed-Forward Networks'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 26.80it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_3261b91a_16',
  'content': 'position in the decoder to attend over all positions in the input sequence. This mimics the\ntypical encoder-decoder attention mechanisms in sequence-to-sequence models such as\n[31, 2, 8].\n• The encoder contains self-attention layers. In a self-attention layer all of the keys, values\nand queries come from the same place, in this case, the output of the previous layer in the\nencoder. Each position in the encoder can attend to all positions in the previous layer of the\nencoder.\n• Similarly, self-attention layers in the decoder allow each position in the decoder to attend to\nall positions in the decoder up to and including that position. We need to prevent leftward\ninformation ﬂow in the decoder to preserve the auto-regressive property. We implement this\ninside of scaled dot-product attention by masking out (setting to −∞) all values in the input\nof the softmax which correspond to illegal connections. See Figure 2.\n3.3\nPosition-wise Fee

Integration VectorDB Context Pipeline with LLM Output

In [None]:
# Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv

groq_api_key = os.getenv("GROQ_API_KEY")
llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.3-70b-versatile",temperature=0.1,max_tokens=1024)

def rag_simple(query,retriever,llm,top_k=3):
    results = retriever.retrieve(query,top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question"
    
    # generate answer using Groq LLM
    prompt=f"""Use the following context to answer the question concisely
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response = llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [39]:
answer = rag_simple("Explain Position-wise Feed-Forward Networks",rag_retriever,llm,top_k=3)
print(answer)

Retrieving documents for query: 'Explain Position-wise Feed-Forward Networks'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 28.36it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Position-wise Feed-Forward Networks are not explicitly explained in the given context, but it is mentioned in section 3.3. However, based on the Transformer paper, Position-wise Feed-Forward Networks typically refer to a fully connected feed-forward neural network applied to each position in the sequence separately and identically. This consists of two linear layers with a ReLU activation function in between.


Enhanced RAG Pipeline Features

In [40]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("Hard Negative Mining Technqiues", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'Hard Negative Mining Technqiues'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.41it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





Answer: Selecting negative examples with higher confidence scores to improve model training, focusing on hardest-to-classify negatives.
Sources: [{'source': 'object_detection.pdf', 'page': 7, 'score': 0.18127715587615967, 'preview': 'Hard Negative Mining\nImbalance between positive and negative examples.\nUse negative examples with higher confidence score.\nNon Maximum Suppression\nIf output boxes overlap, only consider the most confident....'}]
Confidence: 0.18127715587615967
Context Preview: Hard Negative Mining
Imbalance between positive and negative examples.
Use negative examples with higher confidence score.
Non Maximum Suppression
If output boxes overlap, only consider the most confident.


In [44]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("How embeddings are created ?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'How embeddings are created ?'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 32.32it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Answer: Embeddings are created by passing raw input data into a machine learning model, which transforms the multidimensional input data into a lower-dimensional space through algorithms, resulting in a set of vectors in an embedding space.
Sources: [{'source': 'embeddings.pdf', 'page': 5, 'score': 0.42968595027923584, 'preview': 'What do embeddings actually look like? Here is one single embedding,\nalso called a vector, in three dimensions. We can think of this as a repre-\nsentation of a single element in our dataset. For example, this hypothetical\nembedding represents a single word "fly", in three dimensions. Generally, we\nr...'}, {'source': 'embeddings.pdf', 'page': 4, 'score': 0.26875442266464233, 'preview': 'Figure 2: Embeddings papers in arXiv by month. It’s interesting to note the decline in\nfrequency of embeddings-specific papers, possibly in tandem with the rise of deep learning\narchitectures like GPT source\nBuilding and expanding on the concepts in Word2Vec, the Transfo

In [42]:
# --- Advanced RAG Pipeline: Streaming, Citations, History, Summarization ---
from typing import List, Dict, Any
import time

class AdvancedRAGPipeline:
    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.history = []  # Store query history

    def query(self, question: str, top_k: int = 5, min_score: float = 0.2, stream: bool = False, summarize: bool = False) -> Dict[str, Any]:
        # Retrieve relevant documents
        results = self.retriever.retrieve(question, top_k=top_k, score_threshold=min_score)
        if not results:
            answer = "No relevant context found."
            sources = []
            context = ""
        else:
            context = "\n\n".join([doc['content'] for doc in results])
            sources = [{
                'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
                'page': doc['metadata'].get('page', 'unknown'),
                'score': doc['similarity_score'],
                'preview': doc['content'][:300] + '...'
            } for doc in results]
            # Streaming answer simulation
            prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end='', flush=True)
                    time.sleep(0.05)
                print()
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

        # Add citations to answer
        citations = [f"[{i+1}] {src['source']} (page {src['page']})" for i, src in enumerate(sources)]
        answer_with_citations = answer + "\n\nCitations:\n" + "\n".join(citations) if citations else answer

        # Optionally summarize answer
        summary = None
        if summarize and answer:
            summary_prompt = f"Summarize the following answer in 2 sentences:\n{answer}"
            summary_resp = self.llm.invoke([summary_prompt])
            summary = summary_resp.content

        # Store query history
        self.history.append({
            'question': question,
            'answer': answer,
            'sources': sources,
            'summary': summary
        })

        return {
            'question': question,
            'answer': answer_with_citations,
            'sources': sources,
            'summary': summary,
            'history': self.history
        }

# Example usage:
adv_rag = AdvancedRAGPipeline(rag_retriever, llm)
result = adv_rag.query("what is object detection ?", top_k=3, min_score=0.1, stream=True, summarize=True)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
print("History:", result['history'][-1])

Retrieving documents for query: 'what is object detection ?'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 27.47it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Streaming answer:
Use the following context to answer the question concisely.
Context:
What  is ob

ject detection
http://cs231n.stanford.edu/slides/winter1516_lecture8.pdf

Object Detection
Sihao Liang
Jiajun Lu
Kevin Perkins

Overview
Intro
Part I: Two Stage




 Detection
Part II: Unified Detection
Part III: Others
Summary and comparison

Question: what is object detection ?

Answer:

Final Answer: Object detection is the process of locating and classifying objects within an image or video.

Citations:
[1] object_detection.pdf (page 2)
[2] object_detection.pdf (page 0)
[3] object_detection.pdf (page 1)
Summary: Object detection is a process used to identify and locate objects within visual data, such as images or videos. This process involves both locating the objects and classifying them into specific categories, allowing for a deeper understanding of the visual content.
History: {'question': 'what is object detection ?', 'answer': 'Object detection is the process of locating and classifying objects within an image or video.', 'sources': [{'source': 'object_detection.pdf', 'page': 2, 'score': 0.7389722466468811, 'preview': 'What  is object detection\nhttp://cs231n.stanford.edu/slides/winter1516_lecture8.pdf...'}, {'source': 'object_detection