<a href="https://colab.research.google.com/github/HarshSonaiya/RAGs/blob/main/HyDE_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install qdrant_client
!pip install sentence-transformers
!pip install langchain langchain_community
!pip install fastembed
from fastembed import SparseTextEmbedding
from qdrant_client import QdrantClient, models
from typing import List
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client.http.models import SparseVector
from tqdm import tqdm
from langchain.schema import Document



In [5]:
import os
from getpass import getpass

# Connection settings are read from the environment so that no credential is
# stored in the notebook. If QDRANT_API_KEY is unset, the key is requested
# interactively instead.
# NOTE(review): an API key was previously committed in this cell; it should be
# treated as compromised and revoked/rotated in the Qdrant Cloud console.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://ba2a3306-6fa2-4e59-b44c-ba41ca6d0844.europe-west3-0.gcp.cloud.qdrant.io/",
)
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or getpass("Qdrant API key: ")
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Embedding models: a dense sentence-transformers model and a sparse
# fastembed model, both identified by their hub names.
DENSE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
SPARSE_MODEL = "Qdrant/bm42-all-minilm-l6-v2-attentions"
dense_embedding_model = SentenceTransformer(DENSE_MODEL)
sparse_embedding_model = SparseTextEmbedding(SPARSE_MODEL)



Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
# Create the "HyDE" collection only when it is missing, so re-running this
# cell leaves an existing collection untouched.
# The collection holds two named vectors per point:
#   - 'dense'  : 384-dimensional vectors compared with cosine distance
#   - "sparse" : sparse vectors with default parameters
if not client.collection_exists(collection_name="HyDE"):
    client.create_collection(
        collection_name="HyDE",
        vectors_config={
            'dense': models.VectorParams(size=384, distance=models.Distance.COSINE),
        },
        sparse_vectors_config={"sparse": models.SparseVectorParams()},
    )

In [9]:
!pip install pypdf
loader = PyPDFLoader("/content/annual-report-2019-2020.pdf")
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(docs)


Collecting pypdf
  Downloading pypdf-5.0.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-5.0.0-py3-none-any.whl (292 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/292.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.0.0


In [10]:
def create_dense_vector(docs: Document, model: SentenceTransformer) -> List[float]:
    """
    Encode a single Document into a dense embedding.

    Args:
        docs (Document): One Document object; its 'page_content' is the text
            that gets encoded. (The parameter name is plural for backward
            compatibility, but exactly one document is expected.)
        model (SentenceTransformer): Model whose ``encode`` method produces
            the embedding for the text.

    Returns:
        List[float]: The embedding of the document, converted to a plain
        Python list via ``tolist()``.
    """
    # encode() is called on one string, so its result is a single vector;
    # no intermediate list is needed.
    return model.encode(docs.page_content).tolist()


In [11]:
def create_sparse_vector(docs: Document, sparse_text_embedding_model) -> SparseVector:
    """
    Build a Qdrant sparse vector for one document.

    Args:
        docs (Document): A Document object; its 'page_content' is embedded.
        sparse_text_embedding_model: Object whose ``embed`` method accepts a
            list of texts (here, a SparseTextEmbedding instance).

    Returns:
        SparseVector: Sparse vector built from the embedding's 'indices' and
        'values'.

    Raises:
        ValueError: If the embedding lacks an 'indices' or 'values' attribute.
    """
    # A one-element list goes in, so the first (only) result is the one we want.
    sparse_embedding = list(sparse_text_embedding_model.embed([docs.page_content]))[0]

    # Guard clause: reject anything that does not look like a sparse embedding.
    if not (hasattr(sparse_embedding, 'indices') and hasattr(sparse_embedding, 'values')):
        raise ValueError("The embeddings object does not have 'indices' and 'values' attributes.")

    return SparseVector(
        indices=sparse_embedding.indices.tolist(),
        values=sparse_embedding.values.tolist(),
    )


In [16]:
    # Embed every chunk and store it in the "HyDE" collection.
    # Points are buffered and sent in batches: a separate upsert request per
    # chunk costs one network round-trip for every point.
    UPSERT_BATCH_SIZE = 64
    pending_points = []

    for i, doc in enumerate(tqdm(chunks, total=len(chunks))):
        dense_embedding = create_dense_vector(doc, dense_embedding_model)
        sparse_embedding = create_sparse_vector(doc, sparse_embedding_model)

        pending_points.append(
            models.PointStruct(
                id=i,  # position of the chunk in `chunks` is used as the point id
                vector={
                    "dense": dense_embedding,
                    "sparse": sparse_embedding
                },
                payload={
                    "content": doc.page_content,
                    "metadata": doc.metadata
                }
            )
        )

        # Flush a full batch and start a fresh buffer.
        if len(pending_points) >= UPSERT_BATCH_SIZE:
            client.upsert(collection_name="HyDE", points=pending_points)
            pending_points = []

    # Flush whatever is left after the loop (the last, partially filled batch).
    if pending_points:
        client.upsert(collection_name="HyDE", points=pending_points)

100%|██████████| 78/78 [00:18<00:00,  4.15it/s]


In [40]:
from langchain.prompts.prompt import PromptTemplate
from langchain import LLMChain

def create_llm_chain(llm):
    """
    Build the question-answering chain used by this notebook.

    Args:
        llm: The language model the chain will call.

    Returns:
        LLMChain: A chain whose prompt takes two inputs, "question" and
        "context".
    """
    # The prompt text is kept exactly as written (including the indentation of
    # the continuation lines), because it is sent to the model verbatim.
    prompt_text = """You are an AI assistant for answering questions about the various documents from the user.
        You are given the following extracted parts of a long document and a question.
        If you don't know the answer, just say "Hmm, I'm not sure. And copy all the question as it is" Don't try to make up an answer.
        Question: {question}
        =========
        {context}
        =========
        Answer in Markdown:"""

    return LLMChain(
        llm=llm,
        prompt=PromptTemplate(
            template=prompt_text,
            input_variables=["question","context"],
        ),
    )


In [23]:
!pip install langchain_groq
groq_api_key = "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"
from langchain_groq import ChatGroq



In [35]:
# Hybrid (dense + sparse) retrieval for a HyDE hypothetical document.
def hyde_retrieve(hypo_doc: str, prefetch_limit: int = 5, limit: int = 10) -> list:
    """
    Retrieve stored chunks that are close to a hypothetical document.

    The text is embedded with both the dense and the sparse model, each
    embedding is used for its own prefetch query against the "HyDE"
    collection, and the two candidate lists are fused with Reciprocal Rank
    Fusion (RRF).

    Args:
        hypo_doc (str): The hypothetical document text (in this notebook, the
            LLM's answer to the user question).
        prefetch_limit (int): Number of candidates fetched by each of the two
            prefetch queries. Defaults to 5, the value previously hardcoded.
        limit (int): Maximum number of fused results returned. Defaults to 10,
            which is the client's own default when no limit is passed.

    Returns:
        list: The points returned by the fused query (``results.points``).
    """
    # tolist() yields plain Python floats, matching how vectors are built at
    # ingestion time; list(ndarray) would yield numpy scalars instead.
    dense_query = dense_embedding_model.encode(hypo_doc).tolist()

    # Pass a one-element list, exactly as the ingestion helper does, and take
    # the single resulting embedding.
    sparse_embedding = list(sparse_embedding_model.embed([hypo_doc]))[0]
    sparse_query = models.SparseVector(
        indices=sparse_embedding.indices.tolist(),
        values=sparse_embedding.values.tolist()
    )

    results = client.query_points(
        collection_name="HyDE",
        prefetch=[
            models.Prefetch(
                query=sparse_query,
                using="sparse",
                limit=prefetch_limit
            ),
            models.Prefetch(
                query=dense_query,
                using="dense",
                limit=prefetch_limit
            )
        ],
        # Fuse the two candidate lists by rank rather than by raw score.
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit
    )

    return list(results.points)


In [42]:
# End-to-end HyDE example: question -> hypothetical answer -> retrieval.
query = "How many employees were there in the company in 2020?"

llm = ChatGroq(
    temperature=0.3,
    model_name="llama3-8b-8192",
    api_key=groq_api_key,
)
llm_chain = create_llm_chain(llm)

# Step 1: call the chain with an empty context; the model's text output serves
# as the hypothetical document.
response = llm_chain.invoke({"question": query, "context":""})['text']

# Step 2: retrieve stored chunks using that text and merge their "content"
# payloads into one newline-separated context string.
retrieved_docs = hyde_retrieve(response)
combined_context = "\n".join(
    doc.payload.get("content", "") for doc in retrieved_docs
)

# Step 3 (currently disabled): invoke llm_chain again with the same question
# and `combined_context` as the context to produce the final answer.

print("Retrieved Documents:", combined_context)

Retrieved Documents: well as TCS. 
The company offers a variety of beneﬁts to full time 
employees including parental leave12. In FY 2020, a total 
of 8,331 employees availed of parental leave. Of these, 92 
were men and 8,239 were women. Of the 4,693 employees 
whose parental leave ended during the year, 92 were 
men and 4,601 were women. Of these, 89 men and 4,502 women employees rejoined work, amounting to a retention 
rate of 97% and 98% respectively.
At TCS, three months’ notice is required from either side
reassured employees that the company values them for 
the contextual knowledge they possess, and is prepared to 
invest in equipping them with new-age technology skills 
that they do not have. This has made TCS the employer 
of choice, and its employee retention record an industry 
benchmark. In FY 2020, TCS’ IT services attrition rate was 
12.1%.
to induction training, and mandatory annual refreshers, a 
variety of employee engagement activities were conducted 
round the year.