In [11]:
import os 
import chromadb 
from langchain.vectorstores import Chroma 
from langchain.document_transformers import LongContextReorder 
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings 
from langchain.retrievers.merger_retriever import MergerRetriever 
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import  RecursiveCharacterTextSplitter

LOTR (Lord of the Retrievers), also known as the Merger Retriever, is an intriguing technique that uses a round-robin approach to merge results from multiple vector databases, ensuring a robust and diverse set of results.
Long Context Reorder: This is all about reordering the retrieved documents. Once you've retrieved your documents using multiple models, how do you optimally order them to ensure relevance and precision?
For those dabbling with Retrieval Augmented Generation (RAG), implementing these techniques is pivotal. A more effective retrieval process directly enhances the quality and relevance of the generated content in RAG models.

In [3]:

# Embedding configuration consumed by the cell that builds the embedding model.
model_name = "BAAI/bge-large-en"
# Run the model on CPU.
model_kwargs = dict(device="cpu")
# Embeddings are left un-normalized.
# NOTE(review): the vector stores in this notebook are created with a cosine
# space; confirm whether normalization should be enabled for this model.
encode_kwargs = dict(normalize_embeddings=False)

In [4]:
# Build the shared embedding function from the configuration variables
# (model_name, model_kwargs, encode_kwargs). The same `hf` object is passed to
# every Chroma store in this notebook, both when indexing and when reloading.
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [13]:
# Data preprocessing: UN SDG report.
# Step 1: load the PDF into LangChain documents.
loader_un_sdg = PyPDFLoader("data/UN SDG.pdf")
documents_un_sdg = loader_un_sdg.load()

# Step 2: split the loaded documents into overlapping chunks for embedding.
text_splitter_un_sdg = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
text_un_sdg = text_splitter_un_sdg.split_documents(documents_un_sdg)



In [14]:
# Data preprocessing: Paris Agreement.
# Step 1: load the PDF into LangChain documents.
loader_paris_agreement = PyPDFLoader("data/english_paris_agreement.pdf")
documents_paris_agreement = loader_paris_agreement.load()

# Step 2: split with the same chunk_size / chunk_overlap used for the UN SDG
# report so that both stores hold comparably sized chunks.
text_splitter_paris_agreement = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
texts_paris_agreement = text_splitter_paris_agreement.split_documents(
    documents_paris_agreement
)

In [16]:
# Build the UN SDG vector store once and reuse it on later runs.
#
# Chroma.from_documents adds the given chunks to whatever collection already
# lives in persist_directory. Re-running this cell unguarded would therefore
# embed and store every chunk again, and similarity search would start
# returning duplicate passages. Delete the directory to force a rebuild.
_un_sdg_dir = "store/un_sdg_chroma_cosine"
if os.path.isdir(_un_sdg_dir) and os.listdir(_un_sdg_dir):
    # A persisted index already exists: reopen it instead of re-embedding.
    un_sdg_store = Chroma(persist_directory=_un_sdg_dir, embedding_function=hf)
else:
    # First run: embed the chunks and persist them with a cosine distance space.
    un_sdg_store = Chroma.from_documents(
        text_un_sdg,
        hf,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory=_un_sdg_dir,
    )

In [17]:
# Build the Paris Agreement vector store once and reuse it on later runs.
#
# Chroma.from_documents adds the given chunks to whatever collection already
# lives in persist_directory. Re-running this cell unguarded would therefore
# embed and store every chunk again, and similarity search would start
# returning duplicate passages. Delete the directory to force a rebuild.
_paris_dir = "store/paris_chroma_cosine"
if os.path.isdir(_paris_dir) and os.listdir(_paris_dir):
    # A persisted index already exists: reopen it instead of re-embedding.
    paris_agreement_store = Chroma(persist_directory=_paris_dir, embedding_function=hf)
else:
    # First run: embed the chunks and persist them with a cosine distance space.
    paris_agreement_store = Chroma.from_documents(
        texts_paris_agreement,
        hf,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory=_paris_dir,
    )

In [18]:
# Reopen the persisted UN SDG collection from disk, using the same embedding
# function that was used when the collection was built.
load_un_sdg_store = Chroma(
    embedding_function=hf,
    persist_directory="store/un_sdg_chroma_cosine",
)
print("First Vector Store Loaded.........")

First Vector Store Loaded.........


In [19]:
# Reopen the persisted Paris Agreement collection from disk, using the same
# embedding function that was used when the collection was built.
load_paris_agreement_store = Chroma(
    embedding_function=hf,
    persist_directory="store/paris_chroma_cosine",
)
print("Second Vector Store Loaded........")

Second Vector Store Loaded........


# Init Merger Retriever

In [23]:
# One similarity retriever per reloaded store, each configured with k=3.
retriever_un_sdg = load_un_sdg_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

retriever_paris_agreement = load_paris_agreement_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [24]:

# Combine both store retrievers into a single MergerRetriever ("LOTR"), so that
# one query is answered from the UN SDG and the Paris Agreement stores together.
lotr = MergerRetriever(
    retrievers=[
        retriever_un_sdg,
        retriever_paris_agreement,
    ]
)

In [25]:

# Smoke test: run one query through the merged retriever and print the text of
# every document it returns.
for chunks in lotr.get_relevant_documents(
    "Is there any framework available to tackle the climate change?"
):
    print(chunks.page_content)

the+ commitment+ undertaken+ by+ developedJcountry+ parties+ to+ the+ United+Nations+Framework+Convention+on+Climate+Change+to+a+goal+of+mobilizing+jointly+$100+billion+annually+by+2020+from+all+sources+to+address+the+needs+of+developing+countries+in+the+context+of+meaningful+mitigation+actions+and+transparency+on+implementation+and+fully+operationalize+the+Green+Climate+Fund+through+its+capitalization+as+soon+as+possible+++ *+Acknowledging+that+the+United+Nations+Framework+Convention+on+Climate+Change+is+the+primary+international,+intergovernmental+forum+for+negotiating+the+global+response+to+climate+change.+
finance should  represent a progression beyond previous efforts. 
4. The provision of scaled-up financial resources should aim to achieve a 
balance between adaptation and mitigation, taking into account country-driven 
strategies, and the priorities and needs of developing country Parties, especially 
those that are particularly vulnerable to the adverse effects of climate chang

In [27]:
# Run the query through the merged retriever and keep the results in `docs`,
# which the reordering step consumes.
query = "Is there any framework available to tackle the climate change?"
docs = lotr.get_relevant_documents(query)
# Bare last expression: the notebook displays the retrieved documents.
docs

[Document(metadata={'page': 26, 'source': 'data/UN SDG.pdf'}, page_content='the+ commitment+ undertaken+ by+ developedJcountry+ parties+ to+ the+ United+Nations+Framework+Convention+on+Climate+Change+to+a+goal+of+mobilizing+jointly+$100+billion+annually+by+2020+from+all+sources+to+address+the+needs+of+developing+countries+in+the+context+of+meaningful+mitigation+actions+and+transparency+on+implementation+and+fully+operationalize+the+Green+Climate+Fund+through+its+capitalization+as+soon+as+possible+++ *+Acknowledging+that+the+United+Nations+Framework+Convention+on+Climate+Change+is+the+primary+international,+intergovernmental+forum+for+negotiating+the+global+response+to+climate+change.+'),
 Document(metadata={'page': 14, 'source': 'data/english_paris_agreement.pdf'}, page_content='finance should  represent a progression beyond previous efforts. \n4. The provision of scaled-up financial resources should aim to achieve a \nbalance between adaptation and mitigation, taking into account coun

# Reordered Docs (Tackling Lost in the Middle)

https://python.langchain.com/docs/integrations/retrievers/merger_retriever/

In [28]:
# Pass the merged results through LongContextReorder.
# NOTE(review): this is expected to change only the order of the documents,
# not their content — confirm against the LangChain documentation.
reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)


In [29]:
# Display the reordered documents so their order can be compared with `docs`.
reordered_docs

[Document(metadata={'page': 14, 'source': 'data/english_paris_agreement.pdf'}, page_content='finance should  represent a progression beyond previous efforts. \n4. The provision of scaled-up financial resources should aim to achieve a \nbalance between adaptation and mitigation, taking into account country-driven \nstrategies, and the priorities and needs of developing country Parties, especially \nthose that are particularly vulnerable to the adverse effects of climate change and \nhave significant capacity constraints, such as the least developed countries and \nsmall island developing States, considering the need for public and grant-based \nresources for adaptation. \n5. Developed country Parties shall biennially communicate indicative \nquantitative and qualitative information related to paragraphs 1 and 3 of this \nArticle, as applicable, including, as available, projected levels of public financial \nresources to be provided to developing country Parties. Other Parties providing 