# HyDE (Hypothetical Document Embeddings)

In [3]:
import os
import warnings

from dotenv import load_dotenv
load_dotenv()

# Only export the key when it is actually available: os.getenv() returns None for a
# missing variable, and assigning None into os.environ raises an opaque TypeError.
groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key:
    os.environ["GROQ_API_KEY"] = groq_api_key
else:
    warnings.warn(
        "GROQ_API_KEY is not set; add it to the environment or the .env file "
        "before creating the Groq client."
    )

# LLM

In [4]:
from langchain_groq import ChatGroq

# Shared chat model used later to write the hypothetical documents.
# NOTE(review): confirm this model id is still served by the Groq account in use.
llm = ChatGroq(
    model="llama3-8b-8192",
    max_tokens=1000,
)

# Vector DB

In [5]:
def replace_t_with_space(list_of_documents):
    """
    Swap every tab character in each document's text for a single space.

    The documents are edited in place and the same list object is handed back.

    Args:
        list_of_documents: Iterable of document objects exposing a writable
            'page_content' string attribute.
    Return:
        The object that was passed in, with tabs in every 'page_content'
        replaced by spaces.
    """
    tab, space = "\t", " "
    for document in list_of_documents:
        document.page_content = document.page_content.replace(tab, space)
    return list_of_documents

In [36]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
#from helper_functions import Helpers

class Data_Ingestion_Pipe:
    """
    Pipeline that ingests a PDF document into a FAISS vector store.

    The PDF is loaded, split into overlapping chunks, cleaned of tab characters,
    embedded with a HuggingFace sentence-transformers model and indexed in FAISS.
    """
    def __init__(self, file_path: str = r"D:\My Files\RAG-Techniques\RAG.pdf"):
        """
        Args:
            file_path: Location of the PDF to ingest.
                NOTE(review): the default is a machine-specific absolute path;
                pass an explicit path when running anywhere else.
        """
        self.file_path = file_path

    async def encode_pdf(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Load the PDF at self.file_path and store its chunks in a vector store as embeddings.

        Args:
            chunk_size: Maximum chunk size handed to RecursiveCharacterTextSplitter.
            chunk_overlap: Overlap between consecutive chunks handed to the splitter.

        Return:
            A FAISS vector store built from the cleaned document chunks.

        Raises:
            FileNotFoundError: If loading the PDF fails because the file is missing.
        """
        # Load the PDF pages.
        try:
            loader = PyPDFLoader(self.file_path)
            docs = await loader.aload()
        except FileNotFoundError as e:
            # `raise` needs an exception instance (a bare string is itself a TypeError),
            # so wrap the message and keep the original error as the cause.
            raise FileNotFoundError(f"Error occured: {e}") from e

        # Split the pages into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        doc_chunks = text_splitter.split_documents(documents=docs)

        # Replace tab characters with spaces (mutates the chunks in place).
        cleaned_texts = replace_t_with_space(doc_chunks)

        # Embed the chunks and index them in FAISS.
        embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        faiss_vstore = await FAISS.afrom_documents(cleaned_texts, embedding=embedding)
        return faiss_vstore

In [37]:
import nest_asyncio
# Patch asyncio so asyncio.run() can be called while an event loop is already
# running (presumably needed because the notebook kernel runs its own loop).
nest_asyncio.apply()

import asyncio

In [43]:
async def doc_retriever(chunk_size=500, chunk_overlap=100):
    """
    Build the vector store for the default PDF of Data_Ingestion_Pipe.

    Args:
        chunk_size: Chunk size forwarded to Data_Ingestion_Pipe.encode_pdf.
        chunk_overlap: Chunk overlap forwarded to Data_Ingestion_Pipe.encode_pdf.

    Return:
        The vector store produced by encode_pdf. This is the store itself, not a
        retriever; callers run similarity searches on it directly.
    """
    pipeline = Data_Ingestion_Pipe()
    # Forward the caller's chunking settings instead of hard-coded values.
    return await pipeline.encode_pdf(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# HyDE Retriever

In [50]:
from langchain_core.prompts import PromptTemplate

class HyDERetriever:
    """
    Retriever that searches the vector store with an LLM-written hypothetical
    document instead of the raw query.
    """
    def __init__(self, chunk_size=500, chunk_overlap=100):
        """
        Build the vector store and the prompt chain used to write hypothetical documents.

        Args:
            chunk_size: Chunk size used for indexing; also the target length (in
                characters) requested from the LLM for the hypothetical document.
            chunk_overlap: Chunk overlap used for indexing.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.llm = llm  # module-level chat model
        # Forward the configured chunking so the index matches the length requested
        # in the prompt. asyncio.run() blocks until the store is built; inside an
        # already-running loop this relies on nest_asyncio.apply() having been called.
        self.vectorestore = asyncio.run(
            doc_retriever(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
        )

        self.hyde_prompt = PromptTemplate(
            input_variables=["query", "chunk_size"],
            template="""Given the question '{query}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
            the document size has be exactly {chunk_size} characters.""",
        )
        self.hyde_chain = self.hyde_prompt | self.llm

    async def generate_hypothetical_document(self, query):
        """
        Ask the LLM for a hypothetical document answering `query`.

        Return:
            The raw result of the prompt chain; callers read its `.content`.
        """
        input_variables = {"query": query, "chunk_size": self.chunk_size}
        return await self.hyde_chain.ainvoke(input_variables)

    async def retriever(self, query, k=3):
        """
        Retrieve context from the vector store using a hypothetical document.

        Args:
            query: The user question.
            k: Number of similar documents to return.

        Return:
            A tuple (similar_docs, hypothetical_docs): the documents returned by the
            similarity search and the generated text that was used as the search query.
        """
        # Await the coroutine directly: asyncio.run() cannot start a new loop from
        # inside a coroutine that is already running on one.
        hypothetical_message = await self.generate_hypothetical_document(query)
        hypothetical_docs = hypothetical_message.content
        similar_docs = await self.vectorestore.asimilarity_search(hypothetical_docs, k=k)
        return similar_docs, hypothetical_docs

In [51]:
# Build the HyDE retriever with its default chunking; construction also builds
# the vector store (HyDERetriever.__init__ runs doc_retriever()).
retriever = HyDERetriever()

In [52]:
# Run one HyDE retrieval end to end: `results` holds the retrieved documents and
# `hypothetical_doc` the generated text that was used as the search query.
test_query = "What is HyDE in RAG?"
results, hypothetical_doc = asyncio.run(retriever.retriever(test_query))

In [53]:
# Pull the text out of each retrieved document for display.
docs_content = [retrieved.page_content for retrieved in results]

# Show the generated hypothetical document first, then the retrieved chunk texts.
print("hypothetical_doc:\n")
print(hypothetical_doc + "\n")
print(docs_content)

hypothetical_doc:

**HyDE in RAG: A Comprehensive Overview**

HyDE, short for Hydrological Data Extraction, is a Raster Application Framework (RAG) tool designed to extract and process large-scale hydrological data from gridded datasets. Developed by the European Space Agency (ESA), HyDE enables users to retrieve and analyze hydrological parameters such as evapotranspiration, soil moisture, and runoff at a high spatial and temporal resolution.

**Key Features:**

1. **Data Retrieval:** HyDE allows users to download and process gridded hydrological data from various sources, including the ESA's Climate Change Initiative (CCI) and the Copernicus Climate Data Store (CDS).
2. **Data Processing:** The tool performs complex calculations, such as downscaling, aggregating, and interpolating data to create high-resolution hydrological products.
3. **Customization:** Users can tailor their output by selecting specific data products, temporal resolutions, and spatial extent.
4. **Visualization:**