In [1]:
import json
import google.generativeai as genai
import chromadb
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List
from dotenv import load_dotenv
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

class HospitalRAGProcessor:
    def __init__(self, api_key: str, db_path: str):
        self.api_key = api_key
        self.db_path = db_path
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=self.api_key,
            task_type="retrieval_document"
        )
        
    def prepare_documents(self, data: List[dict]) -> List[Document]:
        """Convert raw data into LangChain Document objects."""
        documents = []
        
        for doc in data:
            if not isinstance(doc, dict):
                print(f"Invalid document format: {doc}")
                continue
                
            # Convert keywords list to string if present
            keywords = doc.get("metadata", {}).get("keywords", [])
            if isinstance(keywords, list):
                keywords = ", ".join(keywords)
                
            metadata = {
                "title": doc.get("title", ""),
                "category": doc.get("category", ""),
                "keywords": keywords,
                "id": doc.get("id", "")
            }
            
            # Create LangChain Document object
            if content := doc.get("content"):
                documents.append(
                    Document(
                        page_content=content,
                        metadata=metadata
                    )
                )
            else:
                print(f"Document {doc.get('id')} has no content. Skipping...")
                
        return documents
    
    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into smaller chunks if needed."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            length_function=len,
        )
        return text_splitter.split_documents(documents)
    
    def create_vector_store(self, documents: List[Document], collection_name: str):
        """Create and persist the vector store."""
        vector_store = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=os.path.join(self.db_path, collection_name),
            collection_name=collection_name
        )
        vector_store.persist()
        return vector_store
    
    def process_and_store_documents(self, data: List[dict], collection_name: str):
        """Main method to process and store documents."""
        # Prepare documents
        documents = self.prepare_documents(data)
        
        # Split documents if needed
        split_docs = self.split_documents(documents)
        
        # Create and persist vector store
        vector_store = self.create_vector_store(split_docs, collection_name)
        
        return vector_store



In [3]:
def create_database():
    load_dotenv()
    
    # Load data
    with open('data_set.json', 'r') as file:
        data = json.load(file)
        
    
    # Initialize processor
    processor = HospitalRAGProcessor(
        api_key=os.getenv('API_KEY'),
        db_path="./database"
    )
    
    # Process and store documents
    vector_store = processor.process_and_store_documents(
        data=data,
        collection_name="hospital_documents_langchain"
    )
    
    print("Vector store created successfully!")
    return vector_store



In [4]:
create_database()

Vector store created successfully!


  vector_store.persist()


<langchain_community.vectorstores.chroma.Chroma at 0x16b77567c50>

In [5]:
import os
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory,RunnableLambda
from langchain_google_genai import GoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [14]:
api_key= os.getenv('API_KEY')
db_path="./database"
collection_name="hospital_documents_langchain"

llm = GoogleGenerativeAI(
            model="gemini-1.5-flash",
            google_api_key=api_key,
            temperature=0.3
        )

embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=api_key,
            task_type="retrieval_document"
        )

retriever = Chroma(
            persist_directory= os.path.join(db_path, collection_name),
            embedding_function=embeddings,
            collection_name=collection_name,
        ).as_retriever(search_kwargs={"k": 2})

In [15]:
### Contextualize question ###
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)


In [16]:
### Answer question ###
qa_system_prompt = """You are a friendly customer service agent working for Horizon Hospitals Lanka PLC. 
            Your goal is to assist with any questions using the most relevant and up-to-date information provided in the context below. 
            When responding, ensure you:
            

            - Keep your tone warm, professional, and helpful, just as a caring hospital representative would.
            - Provide detailed and accurate answers, incorporating only relevant data from the context.
            - If the information doesn't directly address the question, acknowledge that politely and offer a general response if appropriate.
            - Avoid making up answers if the data does not apply. It's better to admit that the information is not available than to provide inaccurate information and mention to contact hospital via phone.
            - Make sure to greet in first message. After that it is not nessasary to greet again.
            Context: {context}

            Based on the context, craft a thoughtful, precise, and helpful response:
            """

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)



In [17]:
### Statefully manage chat history ###
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [18]:
conversational_rag_chain.invoke(
    {"input": "Hello, I want to make an appointment with the doctor. Can you help me?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

'Hello!  Certainly, I can help you with that. To schedule an appointment with a doctor, you can visit the hospital website and go to the "Channel Doctor" section. You can also call the reception at 011 1234567. The available appointment slots are listed in the "Channel Doctor" section on the website. \n'

## Evaluation model

In [21]:
import pandas as pd

# Define the question-answer pairs
data = {
    'Question': [
        "What are the available heart-related services in your hospital?",
        "What are the room facilities available?",
        "How can I make an appointment?",
        "Can you give me the steps to make the appointment via website?",
        "How to cancel and reschedule an appointment?",
        "Where is your hospital located?"
    ],
    'Answer': [
        "Our Heart Centre offers a wide range of cardiac services including angiograms, stenting procedures, device closure, pacemaker placements, cardiac resynchronization therapy, coronary artery bypass graft surgery (CABG), valve repairs, and congenital heart disease correction surgeries. We also have a state-of-the-art catheterization laboratory and facilities like a coronary care unit (CCU), intensive care units, and expert nursing care.",
        "Our hospital provides various room options to suit different needs, including private rooms with en-suite bathrooms, semi-private rooms, and modern amenities like comfortable furnishings and entertainment facilities. The focus is on privacy, hygiene, and patient well-being in all rooms.",
        "To schedule an appointment, you can visit the hospital website’s doctor section or call the reception at 011 1234567. Available slots are listed in the doctor section.",
        "To schedule an appointment via the website, enter your full name, email, and phone number in the respective fields. Then, select the department and doctor from dropdowns. Choose a preferred date and time for the appointment, provide a reason for the visit, and click Submit to confirm. You’ll receive a confirmation email afterward.",
        "To cancel an appointment, click the 'Cancel Appointment' link in your confirmation email. For rescheduling, click the 'Reschedule Appointment' link in the email and select a new date and time, then submit the form. You’ll receive an updated confirmation email.",
        "Our hospital is located at Horizon Health PLC, N0.15, Park Road, Colombo 05, Sri Lanka. You can contact us via telephone at 011 1234567 or email us at info@HorizonHealth.com."
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)
df


Unnamed: 0,Question,Answer
0,What are the available heart-related services ...,Our Heart Centre offers a wide range of cardia...
1,What are the room facilities available?,Our hospital provides various room options to ...
2,How can I make an appointment?,"To schedule an appointment, you can visit the ..."
3,Can you give me the steps to make the appointm...,"To schedule an appointment via the website, en..."
4,How to cancel and reschedule an appointment?,"To cancel an appointment, click the 'Cancel Ap..."
5,Where is your hospital located?,"Our hospital is located at Horizon Health PLC,..."


In [24]:
questions = data["Question"] 
ground_truth = data["Answer"]  


print(questions)
print(ground_truth)

['What are the available heart-related services in your hospital?', 'What are the room facilities available?', 'How can I make an appointment?', 'Can you give me the steps to make the appointment via website?', 'How to cancel and reschedule an appointment?', 'Where is your hospital located?']
['Our Heart Centre offers a wide range of cardiac services including angiograms, stenting procedures, device closure, pacemaker placements, cardiac resynchronization therapy, coronary artery bypass graft surgery (CABG), valve repairs, and congenital heart disease correction surgeries. We also have a state-of-the-art catheterization laboratory and facilities like a coronary care unit (CCU), intensive care units, and expert nursing care.', 'Our hospital provides various room options to suit different needs, including private rooms with en-suite bathrooms, semi-private rooms, and modern amenities like comfortable furnishings and entertainment facilities. The focus is on privacy, hygiene, and patient 

In [25]:
data = {"question": [], "answer": [], "contexts": [], "ground_truth": ground_truth}

In [34]:
for query in questions:
    data["question"].append(query)
    
    # Get the response from conversational RAG
    result = conversational_rag_chain.invoke(
        {"input": query},
        config={"configurable": {"session_id": "abc123"}}
    )
    
    # Collect answer data
    answer = result.get("answer", "")
    data["answer"].append(answer)
    
    # Collect context data
    contexts = retriever.get_relevant_documents(query)

    data["contexts"].append(contexts)

   

: 

In [33]:
data

{'question': ['What are the available heart-related services in your hospital?',
  'What are the available heart-related services in your hospital?',
  'What are the available heart-related services in your hospital?',
  'What are the room facilities available?',
  'How can I make an appointment?',
  'Can you give me the steps to make the appointment via website?',
  'How to cancel and reschedule an appointment?',
  'Where is your hospital located?',
  'What are the available heart-related services in your hospital?',
  'What are the room facilities available?',
  'How can I make an appointment?',
  'Can you give me the steps to make the appointment via website?',
  'How to cancel and reschedule an appointment?',
  'Where is your hospital located?',
  'What are the available heart-related services in your hospital?',
  'What are the room facilities available?',
  'How can I make an appointment?',
  'Can you give me the steps to make the appointment via website?',
  'How to cancel and r