In [2]:
from langchain.tools.retriever import create_retriever_tool
from langchain_core.retrievers import BaseRetriever
from pydantic import BaseModel, Field
import os
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Relative path of the persisted Chroma vector store.
# Built with os.path.join so the separator matches the host OS; the previous
# hard-coded "\\" only formed a nested path on Windows.
PERSIST_DIRECTORY = os.path.join("VectorDBs", "BAAIbgeLargeEn3BooksVectorDB")


In [10]:
class LoggingRetriever(BaseRetriever):
    """
    Retriever wrapper that filters already-seen documents and logs the rest.

    Each call delegates the search to ``base_retriever``, drops every document
    whose content+metadata hash is already in ``seen_hashes``, publishes the
    remaining documents (with source file name and page) through the
    module-level ``finalContext`` global, and prints that value.

    Attributes:
        base_retriever: The base retriever to delegate retrieval to
        seen_hashes: Hashes of every document returned so far. The set lives
            on the instance and is never cleared by this class, so a document
            is returned at most once per instance, not once per query.
    """
    base_retriever: BaseRetriever = Field(...)
    seen_hashes: set = Field(default_factory=set)

    def _get_relevant_documents(self, query, *, run_manager=None):
        """
        Retrieve, de-duplicate and log the documents relevant to a query.

        Side effects:
            - adds the hash of every returned document to ``self.seen_hashes``
            - rebinds the module-level ``finalContext`` to a list of
              ``{"doc", "source", "page"}`` dicts for the returned documents
            - prints ``finalContext`` to stdout

        Args:
            query: User input query
            run_manager: Optional callback manager for this run; when given,
                its child callbacks are forwarded to the base retriever

        Returns:
            List of retrieved documents that this instance has not returned before
        """
        global finalContext

        # Use the public entry point instead of the base retriever's private
        # hook: it accepts a missing run manager and lets the base retriever
        # run with callbacks derived from ours.
        config = {"callbacks": run_manager.get_child()} if run_manager is not None else None
        docs = self.base_retriever.invoke(query, config=config)

        unique_docs = []
        docs_with_metadata = []
        for doc in docs:
            # Identity of a document = its text plus its metadata.
            doc_hash = hash(f"{doc.page_content}-{doc.metadata}")
            if doc_hash in self.seen_hashes:
                continue
            # NOTE(review): seen_hashes is never reset, so repeating a query
            # on the same instance yields no documents - confirm this
            # cross-query filtering is intended.
            self.seen_hashes.add(doc_hash)
            unique_docs.append(doc)

            # `or` also covers a source that is present but None/empty, which
            # os.path.basename would otherwise reject or turn into ''.
            source = os.path.basename(doc.metadata.get('source') or 'unknown')
            page = doc.metadata.get('page', 'unknown')
            docs_with_metadata.append({"doc": doc, "source": source, "page": page})

        finalContext = docs_with_metadata
        print(f"{finalContext=}")
        return unique_docs

class RAGAgent:
    """
    Retrieval side of a RAG (Retrieval-Augmented Generation) agent.

    On construction it opens the persisted Chroma vector store and exposes an
    MMR retriever, wrapped in a LoggingRetriever, as a LangChain tool.

    Attributes set by the constructor:
        verbose: Whether status messages are printed
        numOfContext: Number of documents requested per retrieval
        context, context_failure: Initialised to ``[]`` and ``0``; not used by
            the code in this class (presumably consumed elsewhere - verify)
        vectorstore: The loaded Chroma store
        logging_retriever: LoggingRetriever around the MMR retriever
        retriever_tool: Tool named "retrieve_relevant_section"
        tools: List containing ``retriever_tool``
    """

    # Embedding model and collection the persisted store is opened with.
    EMBEDDING_MODEL_NAME = "BAAI/bge-large-en"
    COLLECTION_NAME = "rag-chroma"

    # MMR search settings forwarded as search_kwargs (see _mmr_search_kwargs).
    MMR_FETCH_K = 20
    MMR_LAMBDA_MULT = 0.8

    def __init__(self, verbose: bool = True, numOfContext=3):
        """
        Initialize the agent and set up its components.

        Args:
            verbose: Whether to print debug information
            numOfContext: Number of documents to retrieve for context
        """
        self.verbose = verbose
        self.numOfContext = numOfContext
        self.context = []
        self.context_failure = 0

        self._load_vector_db()
        self._setup_retriever()

    @classmethod
    def _mmr_search_kwargs(cls, k):
        """
        Build the ``search_kwargs`` for the MMR retriever.

        MMR selects its ``k`` results out of the ``fetch_k`` fetched
        candidates, so ``fetch_k`` is raised to ``k`` whenever the caller asks
        for more documents than the default candidate pool holds.

        Args:
            k: Number of documents to return

        Returns:
            Dict with the ``k``, ``fetch_k`` and ``lambda_mult`` settings
        """
        return {
            "k": k,
            "fetch_k": max(cls.MMR_FETCH_K, k),
            "lambda_mult": cls.MMR_LAMBDA_MULT,
        }

    def _load_vector_db(self):
        """Open the Chroma store persisted at PERSIST_DIRECTORY into ``self.vectorstore``."""
        if self.verbose:
            print(f"Loading vector database from {PERSIST_DIRECTORY}")

        embedding_function = HuggingFaceEmbeddings(model_name=self.EMBEDDING_MODEL_NAME)

        self.vectorstore = Chroma(
            collection_name=self.COLLECTION_NAME,
            embedding_function=embedding_function,
            persist_directory=PERSIST_DIRECTORY
        )

        if self.verbose:
            print("Vector database loaded successfully")

    def _setup_retriever(self):
        """Configure retriever with Maximal Marginal Relevance and logging."""
        retriever = self.vectorstore.as_retriever(
            search_type="mmr",
            search_kwargs=self._mmr_search_kwargs(self.numOfContext)
        )

        self.logging_retriever = LoggingRetriever(base_retriever=retriever)

        self.retriever_tool = create_retriever_tool(
            self.logging_retriever,
            "retrieve_relevant_section",
            "Search and return information from the documents"
        )

        self.tools = [self.retriever_tool]

# Build the shared agent instance; the constructor loads the vector store and
# sets up the retriever tool, printing status lines because verbose defaults to True.
rag_agent = RAGAgent()

Loading vector database from VectorDBs\BAAIbgeLargeEn3BooksVectorDB
Vector database loaded successfully


In [11]:
# Smoke-test retrieval through the retriever's public `invoke` entry point
# rather than calling the private `_get_relevant_documents` hook directly.
rag_agent.logging_retriever.invoke("feature importance")

finalContext=[{'doc': Document(id='d5086f00-d3ae-44cc-81c5-1833e419ab19', metadata={'creationdate': '2016-07-22T19:38:38+05:30', 'creator': 'Adobe InDesign CS6 (Windows)', 'moddate': '2016-07-23T18:53:00+05:30', 'page': 17, 'page_label': 'vii', 'producer': 'Adobe PDF Library 10.0.1', 'source': '3books\\ADVANCED_MACHINE_LEARNING_WITH_PYTHON.pdf', 'total_pages': 278, 'trapped': '/False'}, page_content="Preface\n[  vii ]\nAt times, this book won't be able to give a subject the attention that it deserves. \nWe cover a lot of ground in this book and the pace is fairly brisk as a result! At \nthe end of each chapter, I refer you to further reading, in a book or online article, \nso that you can build a broader base of relevant knowledge. I'd suggest that it's \nworth doing additional reading around any unfamiliar concept that comes up as \nyou work through this book, as machine learning knowledge tends to tie together \nsynergistically; the more you have, the more readily you'll understand n

[Document(id='d5086f00-d3ae-44cc-81c5-1833e419ab19', metadata={'creationdate': '2016-07-22T19:38:38+05:30', 'creator': 'Adobe InDesign CS6 (Windows)', 'moddate': '2016-07-23T18:53:00+05:30', 'page': 17, 'page_label': 'vii', 'producer': 'Adobe PDF Library 10.0.1', 'source': '3books\\ADVANCED_MACHINE_LEARNING_WITH_PYTHON.pdf', 'total_pages': 278, 'trapped': '/False'}, page_content="Preface\n[  vii ]\nAt times, this book won't be able to give a subject the attention that it deserves. \nWe cover a lot of ground in this book and the pace is fairly brisk as a result! At \nthe end of each chapter, I refer you to further reading, in a book or online article, \nso that you can build a broader base of relevant knowledge. I'd suggest that it's \nworth doing additional reading around any unfamiliar concept that comes up as \nyou work through this book, as machine learning knowledge tends to tie together \nsynergistically; the more you have, the more readily you'll understand new concepts \nas you 

In [13]:
# List the distinct `source` values stored in the collection's metadata.
# NOTE(review): `_collection` is a private attribute of the vector store
# wrapper - confirm it is stable across the installed library version.
client = rag_agent.vectorstore._collection

results = client.get(include=['metadatas'])

# Entries without metadata, or without a 'source' key, are skipped.
sources = {
    metadata['source']
    for metadata in (results['metadatas'] or [])
    if metadata and 'source' in metadata
}

# A set has no stable iteration order, so sort before printing to keep the
# cell output identical across kernel restarts.
for source in sorted(sources, key=str):
    print(source)

3books\Fundamentals of Deep Learning, 2nd Edition.pdf
3books\ADVANCED_MACHINE_LEARNING_WITH_PYTHON.pdf
3books\AI_and_Machine_Learning_for_Coders_A_Programmers_Guide_to_Artificial.pdf
