In [None]:
import os
import tempfile

import chromadb
import ollama
from langchain.retrievers import EnsembleRetriever 
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain.chains import create_retrieval_chain
from sentence_transformers import CrossEncoder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.retrievers import BM25Retriever
import warnings
from pydantic import BaseModel
from langchain.schema import BaseRetriever

In [7]:
warnings.filterwarnings("ignore")

#### Model

In [8]:
model = ChatOllama(model ="llama3.2")

## Data Acquisition amd preprocessing

#### Load and parse a PDF file using ‘PyMuPDF’ library

In [9]:
pdf_path = "22067317_Hridaya_Giri_Proposal.pdf"
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

In [10]:
print(docs)

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-12-04T10:26:31+05:45', 'source': '22067317_Hridaya_Giri_Proposal.pdf', 'file_path': '22067317_Hridaya_Giri_Proposal.pdf', 'total_pages': 35, 'format': 'PDF 1.7', 'title': '', 'author': 'Hridaya Giri', 'subject': '', 'keywords': '', 'moddate': '2024-12-04T10:26:31+05:45', 'trapped': '', 'modDate': "D:20241204102631+05'45'", 'creationDate': "D:20241204102631+05'45'", 'page': 0}, page_content='Module Code & Module Title \nCS6P05NI Final Year Project \nAssessment Weightage & Type \n5% FYP Proposal \nSemester: 5 \n2024 Autumn \nPROJECT TITLE: “KnowYourHair”- Hair Fall Prediction and Prevention System \nStudent Name: Hridaya Giri \nLondon Met ID:22067317 \nCollege ID: np01ai4a220034@islingtoncollege.edu.np \nInternal Supervisor: Rubin Thapa \nExternal Supervisor: Subhash Basnet \nAssignment Due Date: 12/04/2024 \nAssignment Submission Date: 12/04/2024 \nW

#### Splitting the document into different chunks

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    )

In [12]:
splits = text_splitter.split_documents(docs)
print(splits)

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-12-04T10:26:31+05:45', 'source': '22067317_Hridaya_Giri_Proposal.pdf', 'file_path': '22067317_Hridaya_Giri_Proposal.pdf', 'total_pages': 35, 'format': 'PDF 1.7', 'title': '', 'author': 'Hridaya Giri', 'subject': '', 'keywords': '', 'moddate': '2024-12-04T10:26:31+05:45', 'trapped': '', 'modDate': "D:20241204102631+05'45'", 'creationDate': "D:20241204102631+05'45'", 'page': 0}, page_content='Module Code & Module Title \nCS6P05NI Final Year Project \nAssessment Weightage & Type \n5% FYP Proposal \nSemester: 5 \n2024 Autumn \nPROJECT TITLE: “KnowYourHair”- Hair Fall Prediction and Prevention System \nStudent Name: Hridaya Giri \nLondon Met ID:22067317 \nCollege ID: np01ai4a220034@islingtoncollege.edu.np \nInternal Supervisor: Rubin Thapa \nExternal Supervisor: Subhash Basnet'), Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creato

## Embedding + Vector Store 

In [13]:
ollama_embeddings = OllamaEmbeddings(model = "nomic-embed-text:latest")
embeded_chunks = ollama_embeddings.embed_documents(docs)

In [14]:
vector_store = Chroma(embedding_function=ollama_embeddings)
vector_store.add_documents(documents=splits)

['9c5196c8-bafc-4ef0-af4e-adf1623cfe2a',
 '5a3a8fc8-9d9f-442c-902b-6020535666e9',
 '8194f6b7-3c17-4b12-8e77-0c2e1705ccc6',
 '5796ed4b-bfe6-44b3-ad98-5769e27ca898',
 '725cb3ea-1c36-4b78-a2e2-a8b76e77b4bb',
 'cad976d4-e901-4721-9427-5fc297c2a256',
 '91ebd326-c199-47ce-ab76-2d4605c72b1e',
 '2900410a-1071-4777-a75b-3ced822a54f1',
 'a3065fdc-d18d-4c7d-b155-3182bb7d3eed',
 '827420a9-e01c-484f-8cfa-41bd1c93cd1a',
 '1a9c0753-a17c-40a1-8658-afdc5f9b36e2',
 '1856bb16-5266-448a-b199-57a4dc66bd40',
 'ee4bf995-86ae-4ea6-bda2-71f585a359bc',
 '30d66dc8-e514-487f-baae-ee291bd37af6',
 '61c69110-ad28-4b37-9390-c95a14cbb75f',
 '86e6a2a5-5c9f-4ca3-b879-4f16d33be35b',
 '4aa5905e-3642-4068-bb70-55fe2301787d',
 '257e164f-2c14-4d0a-84e7-fcbe7e8f6eee',
 '85c1273b-9479-4950-835d-58c5e082d11c',
 '89d1398c-c4c3-40e5-b99e-d102ed263d11',
 '483db65c-79cd-4279-8949-f4a97a9ce5e1',
 'f59e4860-990f-4602-b62f-d5834f43d161',
 'ae63ca1f-e052-43ef-bba1-01cef287021e',
 '2b66d115-1a25-422c-8afc-d7da3022ac86',
 '7c289985-0309-

#### Semantic similarity search in the vector store 

In [15]:
# similarity score threshold over similarity to retrive only the best and based on the best semantic search output similarity score more than 0.5 and upto 5

semantic_retreive = vector_store.as_retriever(search_type = "similarity",search_kwargs ={"k":5})

#### BM25 Retrieval for general keyword searching

In [16]:
BM25_retrive = BM25Retriever.from_documents(splits)
BM25_retrive.k = 5

#### Combining two retrievers as one and more priority to similarity score based retreiver 

In [17]:
combined_retriever = EnsembleRetriever(retrievers=[semantic_retreive,BM25_retrive], weights =[0.7,0.3])

In [18]:
combined_retriever.get_relevant_documents("scrum methodology")

[Document(id='5e7ea6fb-90c4-4e88-ba24-d197f791367e', metadata={'total_pages': 35, 'modDate': "D:20241204102631+05'45'", 'trapped': '', 'format': 'PDF 1.7', 'subject': '', 'file_path': '22067317_Hridaya_Giri_Proposal.pdf', 'moddate': '2024-12-04T10:26:31+05:45', 'creator': 'Microsoft® Word for Microsoft 365', 'page': 33, 'source': '22067317_Hridaya_Giri_Proposal.pdf', 'creationdate': '2024-12-04T10:26:31+05:45', 'creationDate': "D:20241204102631+05'45'", 'producer': 'Microsoft® Word for Microsoft 365', 'keywords': '', 'title': '', 'author': 'Hridaya Giri'}, page_content='CS6P05NI \n \nFinal Year Project \n27 \n22067317 Hridaya Giri \n \nFigure 8: Incremental Methodology \n13.6. Selected methodology \nThe main reasons to choose Scrum methodology as the go to methodology for \n“KnowYourHair” \n1. Iterative Development for Complex Features  \n1. The iterative approach to development divides the project into small sprints,'),
 Document(id='fb392de4-7508-4d6d-8756-395d0afb6932', metadata={'c

### Multi-stage hybrid retrieval pipeline using BM25, VectorRetrieval and CrossEncoderReranker

#### Cross Encoder ReRanker manually

In [58]:
cross_encoder  = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def query_doc(query, docs, top_k = 10):
    query_doc_pair = [(query, doc.page_content)for doc in docs]
    result_cross_encodere = cross_encoder.predict(query_doc_pair)

    #zipping result and docs togther and sorting in descending order
    sorted_docs = [doc for _, doc in sorted(zip(result_cross_encodere, docs), reverse=True)]

    return sorted_docs[:top_k]



#### cross encoder reranking using basemodel and baseretriever

In [59]:
class CrossEncoderRerankingRetriever(BaseRetriever, BaseModel):
    base_retriever: BaseRetriever
    reranker: callable

    def get_relevant_documents(self, query, top_k=10):
        
        retrieved_docs = self.base_retriever.get_relevant_documents(query, top_k=20)  

        reranked_docs = self.reranker(query, retrieved_docs, top_k=top_k)
        
        return reranked_docs

In [60]:
cross_encoder_retriever = CrossEncoderRerankingRetriever(
    base_retriever=combined_retriever,
    reranker=query_doc
)

In [65]:
cross_encoder_retriever.get_relevant_documents("hello",top_k = 10)

[Document(id='483db65c-79cd-4279-8949-f4a97a9ce5e1', metadata={'title': '', 'modDate': "D:20241204102631+05'45'", 'page': 7, 'subject': '', 'source': '22067317_Hridaya_Giri_Proposal.pdf', 'creationDate': "D:20241204102631+05'45'", 'keywords': '', 'total_pages': 35, 'author': 'Hridaya Giri', 'trapped': '', 'creationdate': '2024-12-04T10:26:31+05:45', 'format': 'PDF 1.7', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'moddate': '2024-12-04T10:26:31+05:45', 'file_path': '22067317_Hridaya_Giri_Proposal.pdf'}, page_content='those between 40 and 49 experience it (T Rhodes, 1998). For more click \nTo address this, "KnowYourHair" is a web application designed to predict an \nindividual’s future risk of hair loss based on key inputs such as family history, lifestyle \nhabits, nutrition, and other additional factors. The output includes a personalized'),
 Document(id='d9cbd500-a1ea-4568-8d50-a3bd1a9a5914', metadata={'format': 'PDF 1.7', 'moddate

In [66]:
system_prompt = """
You are an AI assistant tasked with providing detailed answers based solely on the given context. Your goal is to analyze the information provided and formulate a comprehensive, well-structured response to the question.

context will be passed as "Context:"
user question will be passed as "Question:"

To answer the question:
1. Thoroughly analyze the context, identifying key information relevant to the question.
2. Organize your thoughts and plan your response to ensure a logical flow of information.
3. Formulate a detailed answer that directly addresses the question, using only the information provided in the context.
4. Ensure your answer is comprehensive, covering all relevant aspects found in the context.
5. If the context doesn't contain sufficient information to fully answer the question, state this clearly in your response.

Format your response as follows:
1. Use clear, concise language.
2. Organize your answer into paragraphs for readability.
3. Use bullet points or numbered lists where appropriate to break down complex information.
4. If relevant, include any headings or subheadings to structure your response.
5. Ensure proper grammar, punctuation, and spelling throughout your answer.

Important: Base your entire response solely on the information provided in the context. Do not include any external knowledge or assumptions not present in the given text.
"""

In [None]:
def callModel(context, prompt):
    response = ollama.chat(

        model = model,
        stream = True,
        messages = [

            {"role" : "system", "content" : system_prompt},
            {"role" : "user", "content" : f"Context{context}, Question{prompt}",},
        ]
    )
    for chunk in response:
        if chunk["done"] == False:
            yield chunk["messages"]["content"]
        else:
            break
    


<generator object callModel at 0x0000023A2088B0A0>
