
##### This notebook implements a Retrieval-Augmented Generation (RAG) pipeline following four key steps: **indexing**, where documents are preprocessed and stored for efficient search; **retrieval**, using a hybrid approach combining vector embeddings and BM25 to fetch relevant documents; **augmentation**, which reranks retrieved documents with a Cross-Encoder model to improve relevance and builds the context; and finally, **generation**, where a language model generates answers grounded on the augmented context. This modular design enables accurate, context-aware responses for question answering tasks.


In [85]:
import os
import tempfile
import warnings
from typing import Callable

import chromadb
import ollama
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import PyMuPDFLoader
from langchain.retrievers import EnsembleRetriever
from langchain.schema import BaseRetriever
from langchain_chroma import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel
from sentence_transformers import CrossEncoder


In [86]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# libraries imported above are hidden as well.
warnings.filterwarnings("ignore")

#### Model

In [87]:
# Chat model handle for the local "llama3.2" Ollama model.
# NOTE(review): `model` is not referenced by any later cell visible here
# (callModel calls ollama.chat directly) — confirm whether it is still needed.
model = ChatOllama(model ="llama3.2")

## Data Acquisition and Preprocessing

#### Load and parse a PDF file using ‘PyMuPDF’ library

In [88]:
# Path of the PDF to index, relative to the notebook's working directory.
pdf_path = "22067317_Hridaya_Giri_Proposal.pdf"
loader = PyMuPDFLoader(pdf_path)
# Load the PDF into LangChain documents (presumably one per page — confirm).
docs = loader.load()

#### Splitting the document into different chunks

In [89]:
# Splitter for the loaded documents: chunk_size=400 with chunk_overlap=100
# (measured by the splitter's default length function — presumably characters,
# confirm). Separators are listed from coarsest (paragraph break) to finest
# (empty string).
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    )

In [90]:
# Split the loaded documents into chunks; these chunks feed both the vector
# store and the BM25 retriever in later cells.
splits = text_splitter.split_documents(docs)

## Embedding + Vector Store 

In [91]:
# Embedding model served by Ollama; also used by the Chroma vector store below.
ollama_embeddings = OllamaEmbeddings(model = "nomic-embed-text:latest")
# embed_documents expects a list of strings, so pass the text of each chunk
# from `splits` rather than the page-level Document objects in `docs`.
embeded_chunks = ollama_embeddings.embed_documents([chunk.page_content for chunk in splits])

In [92]:
# Chroma vector store that embeds text with the Ollama embedding model above.
# NOTE(review): no persist_directory is passed, so the index presumably lives
# in memory only — confirm.
vector_store = Chroma(embedding_function=ollama_embeddings)
# Index the chunks; the call returns the ids of the stored documents, which
# are shown as this cell's output.
vector_store.add_documents(documents=splits)

['c8fdd5d3-a00b-4391-9537-51e5196710f0',
 '3033a081-b4ce-4b2d-85a6-c22c8db65893',
 '2a7ddf16-8fda-4628-a80b-a3d119ba5901',
 '1fe77444-dd16-41d1-bf79-3f4d3296f750',
 'f1f2c136-730b-4824-9774-b90c98f9b785',
 '3ac50f62-d8f5-4287-9b90-28d930d73b04',
 'c8864da6-f9ca-4cf3-ba5d-487c5abfb38d',
 'cfe43851-2125-47af-82f8-b99e01795201',
 '9684e547-82aa-44fe-b9a1-e8fd06a16055',
 '9f8b6e14-d8db-43d2-8528-d13fae77291b',
 'db37b701-2bd6-4899-90e8-42c99e5677e6',
 '1ab127ea-ebfc-4538-921d-03a678d876a9',
 'e5f18b49-a276-4fa3-920a-0d541ee1f8cf',
 'c71cfbb6-2cf3-45b0-88bb-1ebbfdddffd3',
 '0b6f7446-0373-41c1-a2a5-e7183da01e65',
 '477ffc9f-5237-4361-8239-8123ea81173e',
 'f927b26b-56b5-46eb-aded-48bff04389f5',
 '3607a38b-9f6c-4c78-a11b-6e4ea02a8a20',
 'e4e05ea1-8d9a-459e-91ad-0c4778240dec',
 'd2816e81-fb5c-425f-b4ec-6138e37e6ee5',
 'b9054d0a-4d9e-4d80-a639-24fe69a1dad5',
 '0f15c5e8-060c-4901-82b0-2a498f4367a3',
 '82ebcfef-eb78-494e-9aaa-24b84d00912f',
 'c42e9462-68ab-4549-af2c-82f5de9a5ad5',
 '7a07eba7-ce7e-

#### Semantic similarity search in the vector store 

In [93]:
# Plain similarity search returning the 5 nearest chunks (k=5). No minimum
# score is applied here.
# NOTE(review): enforcing a score threshold (e.g. 0.5) would need a different
# search_type plus a threshold in search_kwargs — confirm against the retriever docs.

semantic_retreive = vector_store.as_retriever(search_type = "similarity",search_kwargs ={"k":5})

#### BM25 Retrieval for general keyword searching

In [94]:
# Keyword (BM25) retriever built over the same chunks as the vector store.
BM25_retrive = BM25Retriever.from_documents(splits)
# Return at most 5 keyword matches, mirroring k=5 on the semantic retriever.
BM25_retrive.k = 5

#### Combining the two retrievers into one, giving higher weight to the similarity-based (semantic) retriever

In [95]:
# Hybrid retriever: weight 0.7 for the semantic retriever and 0.3 for BM25
# (weights are given in the same order as `retrievers`).
combined_retriever = EnsembleRetriever(retrievers=[semantic_retreive,BM25_retrive], weights =[0.7,0.3])

In [96]:
# Smoke-test the hybrid retriever with a sample query; the returned documents
# are displayed as the cell output.
combined_retriever.get_relevant_documents("scrum methodology")

[Document(id='fb392de4-7508-4d6d-8756-395d0afb6932', metadata={'trapped': '', 'moddate': '2024-12-04T10:26:31+05:45', 'producer': 'Microsoft® Word for Microsoft 365', 'source': '22067317_Hridaya_Giri_Proposal.pdf', 'creator': 'Microsoft® Word for Microsoft 365', 'file_path': '22067317_Hridaya_Giri_Proposal.pdf', 'page': 15, 'modDate': "D:20241204102631+05'45'", 'author': 'Hridaya Giri', 'total_pages': 35, 'title': '', 'creationDate': "D:20241204102631+05'45'", 'creationdate': '2024-12-04T10:26:31+05:45', 'subject': '', 'format': 'PDF 1.7', 'keywords': ''}, page_content='Scrum stands out as one of the most popular and well-defined methodologies under \nAgile.  \nIn contrast to other methodologies like waterfall, which follow a step-by-step \nhierarchical approach, SCRUM methodology uses sprints, which are goals that are \nset out over several weeks or months. SCRUM is carried out in various sprints,'),
 Document(id='e3e5ee45-93b4-400c-a35d-51816352a859', metadata={'page': 15, 'title': '

### Multi-stage hybrid retrieval pipeline using BM25, VectorRetrieval and CrossEncoderReranker

#### Cross Encoder ReRanker manually

In [97]:
# Cross-encoder model used by query_doc to score (query, passage) pairs.
# NOTE(review): presumably fetched from the model hub on first use — confirm
# for offline runs.
cross_encoder  = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def query_doc(query, docs, top_k = 10):
    """Rerank ``docs`` against ``query`` with the cross-encoder and keep the best ``top_k``.

    Args:
        query: The user query string.
        docs: Candidate documents; each must expose a ``page_content`` attribute.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` documents ordered from the highest to the
        lowest cross-encoder score. Documents with equal scores keep their
        incoming relative order. An empty input yields an empty list.
    """
    docs = list(docs)
    if not docs:
        # Nothing to score; avoid calling the model with an empty batch.
        return []

    pairs = [(query, doc.page_content) for doc in docs]
    scores = cross_encoder.predict(pairs)

    # Sort on the score only. Sorting bare (score, doc) tuples falls back to
    # comparing the documents themselves whenever two scores tie, which raises
    # TypeError for objects that define no ordering.
    ranked = sorted(zip(scores, docs), key=lambda pair: pair[0], reverse=True)

    return [doc for _, doc in ranked[:top_k]]



#### Cross-encoder reranking using BaseModel and BaseRetriever

In [98]:
class CrossEncoderRerankingRetriever(BaseRetriever, BaseModel):
    """Retriever that fetches candidates from ``base_retriever`` and reorders them with ``reranker``.

    Fields:
        base_retriever: Retriever that supplies the initial candidate documents.
        reranker: Callable invoked as ``reranker(query, docs, top_k=...)`` that
            returns the reranked documents.
    """

    base_retriever: BaseRetriever
    # typing.Callable rather than the builtin `callable`: the builtin is a
    # predicate function, not a type, so it cannot serve as a field annotation
    # that the model can validate.
    reranker: Callable

    def get_relevant_documents(self, query, top_k=10):
        """Return the ``top_k`` documents for ``query`` after reranking.

        Args:
            query: The user query string.
            top_k: Number of documents the reranker is asked to keep.

        Returns:
            Whatever ``reranker`` returns for the retrieved candidates.
        """
        # NOTE(review): top_k=20 is forwarded as an extra keyword; whether the
        # base retriever actually honours it depends on its implementation — confirm.
        retrieved_docs = self.base_retriever.get_relevant_documents(query, top_k=20)

        return self.reranker(query, retrieved_docs, top_k=top_k)

In [99]:
# Reranking retriever: candidates come from the hybrid retriever and are then
# reordered by query_doc (the cross-encoder reranker).
cross_encoder_retriever = CrossEncoderRerankingRetriever(
    base_retriever=combined_retriever,
    reranker=query_doc
)

In [100]:
# Smoke-test the reranking retriever, asking for the 3 best documents; the
# result is displayed as the cell output.
cross_encoder_retriever.get_relevant_documents("hello",top_k = 3)

[Document(id='2b66d115-1a25-422c-8afc-d7da3022ac86', metadata={'subject': '', 'author': 'Hridaya Giri', 'moddate': '2024-12-04T10:26:31+05:45', 'source': '22067317_Hridaya_Giri_Proposal.pdf', 'total_pages': 35, 'format': 'PDF 1.7', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'page': 7, 'file_path': '22067317_Hridaya_Giri_Proposal.pdf', 'producer': 'Microsoft® Word for Microsoft 365', 'title': '', 'creationDate': "D:20241204102631+05'45'", 'creationdate': '2024-12-04T10:26:31+05:45', 'modDate': "D:20241204102631+05'45'", 'trapped': ''}, page_content='clinics directly through the web application, which provides complete information \nabout the clinics and their services. Additionally, "KnowYourHair" also includes a list \nof hair care products and medications, each with detailed descriptions which the \nuser can purchase for themselves. The web application also offers instructional'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Mic

#### System prompt for context-based, precise answers.

In [101]:
# System message sent by callModel on every request. It tells the model to
# answer only from the supplied context and announces that the context and the
# question arrive labelled "Context:" and "Question:".
system_prompt = """
You are an AI assistant tasked with providing detailed answers based solely on the given context. Your goal is to analyze the information provided and formulate a comprehensive, well-structured response to the question.

context will be passed as "Context:"
user question will be passed as "Question:"

To answer the question:
1. Thoroughly analyze the context, identifying key information relevant to the question.
2. Organize your thoughts and plan your response to ensure a logical flow of information.
3. Formulate a detailed answer that directly addresses the question, using only the information provided in the context.
4. Ensure your answer is comprehensive, covering all relevant aspects found in the context.
5. If the context doesn't contain sufficient information to fully answer the question, state this clearly in your response.

Format your response as follows:
1. Use clear, concise language.
2. Organize your answer into paragraphs for readability.
3. Use bullet points or numbered lists where appropriate to break down complex information.
4. If relevant, include any headings or subheadings to structure your response.
5. Ensure proper grammar, punctuation, and spelling throughout your answer.

Important: Base your entire response solely on the information provided in the context. Do not include any external knowledge or assumptions not present in the given text.
"""

#### Streaming context-aware answers from LLaMA 3.2 using Ollama API with system and user prompts.


In [102]:

def callModel(context, prompt):
    """Stream the model's answer to ``prompt`` based on ``context``.

    Sends ``system_prompt`` as the system message and one user message carrying
    the context and the question, then yields the text of each streamed chunk.

    Args:
        context: Text the answer must be based on.
        prompt: The user's question.

    Yields:
        The ``message.content`` text of each streamed chunk, stopping at the
        first chunk flagged ``done``.
    """
    # The system prompt announces the labels "Context:" and "Question:", so the
    # user message has to use exactly those labels, clearly separated.
    user_message = f"Context: {context}\n\nQuestion: {prompt}"

    response = ollama.chat(
        model="llama3.2",
        stream=True,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
    )
    for chunk in response:
        if chunk["done"]:
            # Final chunk of the stream: stop without yielding it.
            break
        yield chunk["message"]["content"]
    


#### Stream answer from the model

In [117]:
def answer_query(query):
    """Retrieve supporting passages for ``query`` and print the streamed answer.

    The ten best reranked documents are joined, one passage per line, into the
    context handed to ``callModel``; each streamed piece is printed as it arrives.

    Args:
        query: The user's question.
    """
    ranked = cross_encoder_retriever.get_relevant_documents(query, top_k=10)

    # One passage per line forms the context given to the model.
    context = "\n".join(hit.page_content for hit in ranked)

    print(f"\n📩 Question: {query}")
    print("💬 Answer:", end=" ", flush=True)
    answer_stream = callModel(context, query)
    for piece in answer_stream:
        # Flush so every piece shows up immediately instead of being buffered.
        print(piece, end="", flush=True)
    print("\n")


# Interactive chat loop: read questions until the user types 'quit' or 'exit'.
print("🤖 Chatbot is running. Type 'quit' or 'exit' to end the session.\n")
while True:
    try:
        # Strip surrounding whitespace so inputs like "exit " are still recognised.
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or kernel interrupted: end the session cleanly
        # instead of surfacing a traceback.
        print("👋 Chatbot session ended.")
        break
    if user_input.lower() in ["quit", "exit"]:
        print("👋 Chatbot session ended.")
        break
    if not user_input:
        # Skip blank input rather than running retrieval and generation on an
        # empty question.
        continue
    answer_query(user_input)


🤖 Chatbot is running. Type 'quit' or 'exit' to end the session.


📩 Question: what is KnowYourHair ?
💬 Answer: Question: What is KnowYourHair?

1. **Overview of KnowYourHair**: KnowYourHair is a web application designed to predict an individual's future risk of hair loss based on key inputs such as family history, lifestyle habits, nutrition, and other additional factors.

2. **Key Features**:
    * Provides personalized predictions for hair loss risk
    * Offers complete information about clinics and their services
    * Integrates with hair care products and clinics directly through the web application
    * Includes a list of hair care products and medications with detailed descriptions
    * Offers instructional waterfall cascades in 5 stages, from Requirements to Maintenance

3. **Waterfall Methodology Stages**:
    * **Requirements**: Specifies project requirements, determines project scope, and gathers necessary requirements.
        + Involves market research, stakeholder and 