In [2]:
from langchain.document_loaders import PyPDFLoader
import os 
import chromadb 
from langchain.vectorstores import Chroma 
from langchain.document_transformers import LongContextReorder 
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings 
from langchain.retrievers.merger_retriever import MergerRetriever 
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import  RecursiveCharacterTextSplitter

In [3]:

# Embedding backend shared by the rest of the notebook: the BGE large English
# model, run on CPU, with normalize_embeddings left at False.
model_name = "BAAI/bge-large-en"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

hf = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

  from tqdm.autonotebook import tqdm, trange


In [5]:
#Data Preprocessing
def load_pdf_chunks(pdf_path, chunk_size=1000, chunk_overlap=100):
    """Load one PDF and split it into overlapping text chunks.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.

    Returns:
        A ``(documents, chunks)`` tuple: the value returned by the loader's
        ``load()`` and the value returned by ``split_documents`` on it.
    """
    documents = PyPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return documents, splitter.split_documents(documents)


# Both source PDFs go through the same load -> split path with the same
# chunking settings, so they share one helper instead of two pasted copies.
# The result names are kept as-is because later cells reference them.
documents_un_sdg, text_un_sdg = load_pdf_chunks("data/UN SDG.pdf")
documents_paris_agreement, texts_paris_agreement = load_pdf_chunks(
    "data/english_paris_agreement.pdf"
)

In [6]:
def build_or_load_store(chunks, persist_directory):
    """Create a persisted cosine-space Chroma store, or reopen it if present.

    Calling ``Chroma.from_documents`` against a directory that already holds
    the collection adds the chunks again, so every re-run of this cell would
    duplicate the stored vectors. When ``persist_directory`` already exists
    and is non-empty the store is reopened instead of rebuilt.
    Delete the directory to force a rebuild (e.g. after the PDFs change).

    Args:
        chunks: Split documents to embed when the store has to be built.
        persist_directory: Directory in which Chroma keeps the collection.

    Returns:
        The ``Chroma`` vector store backed by ``persist_directory``.
    """
    if os.path.isdir(persist_directory) and os.listdir(persist_directory):
        # Already built on a previous run: reopen without re-embedding.
        return Chroma(persist_directory=persist_directory, embedding_function=hf)
    return Chroma.from_documents(
        chunks,
        hf,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory=persist_directory,
    )


un_sdg_store = build_or_load_store(text_un_sdg, "store/un_sdg_chroma_cosine")

paris_agreement_store = build_or_load_store(texts_paris_agreement, "store/paris_chroma_cosine")

In [None]:
# Reopen the UN SDG collection from its persist directory, using the same
# embedding function for queries.
load_un_sdg_store = Chroma(
    embedding_function=hf,
    persist_directory="store/un_sdg_chroma_cosine",
)
print("First Vector Store Loaded.........")

In [None]:
# Reopen the Paris Agreement collection from its persist directory, using the
# same embedding function for queries.
load_paris_agreement_store = Chroma(
    embedding_function=hf,
    persist_directory="store/paris_chroma_cosine",
)
print("Second Vector Store Loaded........")

In [None]:
# One similarity-search retriever per loaded store, each configured with k=3.
retriever_un_sdg = load_un_sdg_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

retriever_paris_agreement = load_paris_agreement_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

In [None]:

# Combine the two per-document retrievers behind a single retriever object.
source_retrievers = [retriever_un_sdg, retriever_paris_agreement]
lotr = MergerRetriever(retrievers=source_retrievers)

In [None]:

# Print the text of every document the merged retriever returns for a sample question.
sample_hits = lotr.get_relevant_documents(
    "Is there any framework available to tackle the climate change?"
)
for chunks in sample_hits:
    print(chunks.page_content)

In [None]:
# Run the same question again, this time keeping the returned documents and
# displaying them as the cell output.
query = "Is there any framework available to tackle the climate change?"
docs = lotr.get_relevant_documents(query=query)
docs


The result above is quite messy. Let's refine it with respect to the question and mitigate the "lost in the middle" problem.
Now that each step is understood, let's assemble them into a pipeline for the LLM.

In [7]:
from langchain.document_transformers import (
    EmbeddingsClusteringFilter,
    EmbeddingsRedundantFilter,
)
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain.document_transformers import LongContextReorder

In [None]:
from re import search
# Post-retrieval pipeline applied to the merged retriever's results: an
# embeddings-based redundancy filter followed by long-context reordering.
# The filter is named `redundant_filter` so it no longer shadows Python's
# built-in `filter`.
redundant_filter = EmbeddingsRedundantFilter(embeddings=hf)
reordering = LongContextReorder()
pipeline = DocumentCompressorPipeline(transformers=[redundant_filter, reordering])

# ContextualCompressionRetriever only takes a compressor and a base retriever.
# The previous `search_kwargs` argument is not one of its fields and had no
# effect; the number of hits is set on the underlying store retrievers.
compression_retriever_reordered = ContextualCompressionRetriever(
    base_compressor=pipeline,
    base_retriever=lotr,
)


In [None]:

!pip install llama-cpp-python

In [None]:
from langchain.llms import LlamaCpp
# Use half of the available cores, but never fewer than one thread.
# os.cpu_count() may return None, and halving a single core would give 0.
n_threads = max(1, (os.cpu_count() or 2) // 2)

# Local GGUF model served through llama.cpp.
# - `n_gpu_layers` is the LlamaCpp field name (the previous `gpu_layers` is
#   not a LlamaCpp field); 0 is the same value as before.
# - Token streaming is enabled by `streaming=True`; the extra `stream=True`
#   was not a constructor field and has been dropped.
llms = LlamaCpp(
    streaming=True,
    model_path="/content/drive/MyDrive/zephyr-7b-beta.Q4_K_M.gguf",
    max_tokens=1500,
    temperature=0.75,
    top_p=1,
    n_gpu_layers=0,
    verbose=True,
    n_threads=n_threads,
    n_ctx=4096,
)

In [None]:

from langchain.chains import RetrievalQA
     

# Question-answering chain of type "stuff" that draws its context from the
# compression retriever and also returns the source documents.
qa_chain_options = dict(
    llm=llms,
    chain_type="stuff",
    retriever=compression_retriever_reordered,
    return_source_documents=True,
)
qa = RetrievalQA.from_chain_type(**qa_chain_options)
     

In [None]:

# Ask the chain a question, then print the generated answer followed by the
# source documents returned with it.
query = "who is jon snow?"
results = qa(query)

answer_text = results['result']
print(answer_text)
#
cited_documents = results["source_documents"]
print(cited_documents)