In [1]:
%pwd

'c:\\Users\\MadhanBalaSukumar\\OneDrive - Simply Regulation FinregE\\Finrege\\My Projects\\End-to-End-Medical-Chatbot---Gen-AI\\research'

In [2]:
import os 

os.chdir('../')

In [3]:
%pwd

'c:\\Users\\MadhanBalaSukumar\\OneDrive - Simply Regulation FinregE\\Finrege\\My Projects\\End-to-End-Medical-Chatbot---Gen-AI'

# Data Loading and Chunking

In [4]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters  import RecursiveCharacterTextSplitter  

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#Extract Data from the PDF File
def load_pdf_file(data):
    loader = DirectoryLoader(
        data,
        glob = '*.pdf',
        loader_cls=PyPDFLoader
    )

    documents = loader.load()

    return documents



In [6]:
extracted_data = load_pdf_file(data='Data/')

In [8]:
#split the Data into Text Chunks

def text_splits(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [9]:
text_chunks  = text_splits(extracted_data)
print('Length of Text Chunks', len (text_chunks))

Length of Text Chunks 5859


# Vector Embedding

In [7]:
#downloading embedding model from hugging face

from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [8]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                       model_kwargs = {'device': 'cpu'})
    return embeddings

In [9]:
embeddings = download_hugging_face_embeddings()

# Vector Storage

In [10]:
from dotenv import load_dotenv
load_dotenv()

True

In [11]:
#Pinecone
from pinecone import Pinecone, ServerlessSpec

In [12]:
#to read an environment variable
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


In [13]:
index_name = "medicalbot"

In [None]:

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = index_name

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        vector_type="dense",
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        deletion_protection="disabled",
        tags={
            "environment": "development"
        }
    )

In [14]:
import os 

#If the PINECONE_API_KEY variable does not already exist in the environment, it creates it. to set or update an environment variable
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY 
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY 


# Building Semantic Index

In [11]:
#Embed Each chunk and upsert the embeddings into Pincone Index
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents= text_chunks, #chunked text documents
    index_name = index_name, #index name in Pinecone
    embedding= embeddings #embeddings
)

NameError: name 'text_chunks' is not defined

In [15]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [16]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})


In [17]:
#test
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='3d89fea1-80ed-44f2-89a3-5fb31209144b', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='f20599af-683f-4db9-a5f1-3a6df4f15585', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM 

# Connecting LLM to Knowledge Base

# **Simple ChatBot**

In [18]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI(model="gpt-4o", temperature=0.4, max_tokens=500)

In [29]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [156]:
#Defining a system and human prompt template.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)



In [None]:

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [148]:
# Create a chain for passing a list of Documents to a model.
# used when need to combine multiple pieces of information (documents) and process them as a whole.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)

In [149]:
question_answer_chain


RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an Medical assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\n{context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x0000023E5DED6DD0>, async_client=<openai.resources.chat.

In [None]:
#Creating the full retrieval chain. combines a retriever (which fetches the relevant documents) and LLM
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
response = rag_chain.invoke({"input": "what is Acne?"})
print(response)

{'input': 'what is Acne?', 'context': [Document(id='3d89fea1-80ed-44f2-89a3-5fb31209144b', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'), Document(id='f20599af-683f-4db9-a5f1-3a6df4f15585', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, 

In [81]:
print(response['input'])
print(response['answer'])

what is Acne?
Acne is a common skin disease characterized by pimples on the face, chest, and back, occurring when skin pores become clogged with oil, dead skin cells, and bacteria. It is also known as acne vulgaris and is the most common skin disease, affecting nearly 17 million people in the United States.


In [35]:
response = rag_chain.invoke({"input": "what is the remedy for Acne?"})
print(response["answer"])

Remedies for acne include maintaining a well-balanced diet, avoiding trigger foods, reducing stress, and not picking or squeezing blemishes. It's also recommended to shampoo often, wear hair off the face, and give dry pimples limited sun exposure unless advised otherwise. Additionally, certain supplements like essential fatty acids, vitamin B complex, zinc, vitamin A, and chromium, as well as herbal remedies like milk thistle, cnidium seed, and honeysuckle flower, may be beneficial.


In [63]:
response = rag_chain.invoke({"input": "what are you?"})
print(response["answer"])

I am a medical assistant designed to answer questions and provide information based on available data and context.


# **Conversational retrieval chatbot**

For the Conversational retrieval chain, we have to get the retriever fetch documents relevant not only to the user input but also to the chat history. Therefore, the retriever needs to have a query based not only on the user input but also on the relevant documents from the chat history. In order to do this, we provide the LLM with the chat history and user input and ask it to derive a search query for the retriever to fetch the relevant data from the vector store.

In [42]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder # The MessagesPlaceholder is part of the prompt along with user input and user message.
 

In [158]:
chat_history = []

In [159]:

# The prompt contains the user input, the chat history, and a message to generate a search query.

prompt_search_query = ChatPromptTemplate.from_messages([
MessagesPlaceholder(variable_name="chat_history"),
("user","{input}"),
("user","Given the above conversation, generate a search query to look up to get information relevant to the conversation")
])

# Retriever Chain

The create_history_aware_retriever creates a chain that does the following steps:

* Sends a prompt to the LLM with the chat_history and user input to generate a search query for the retriever
* The retriever uses the search query to obtain the relevant documents from the vector store.
* So, the inputs to the create_history_aware_retriever are the llm, retriever and the prompt.
* The retriever_chain will return the retrieved documents from the vector store that are relevant to the user input and chat history.

In [75]:
from langchain_classic.chains import create_history_aware_retriever

In [160]:
retriever_chain = create_history_aware_retriever(chatModel, retriever, prompt_search_query)


In [161]:
retriever_chain

RunnableBinding(bound=RunnableBranch(branches=[(RunnableLambda(lambda x: not x.get('chat_history', False)), RunnableLambda(lambda x: x['input'])
| VectorStoreRetriever(tags=['PineconeVectorStore', 'HuggingFaceEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x0000023E5D3B83A0>, search_kwargs={'k': 3}))], default=ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai

# Prompt To Get Response From LLM Based on Chat History

to send the retrieved documents from the vector store along with a prompt to the llm to get the response to the user input.

We create a prompt containing the context (retrieved documents from vector store), chat history and the user input.

In [162]:
prompt_get_answer = ChatPromptTemplate.from_messages([
("system", system_prompt ),
MessagesPlaceholder(variable_name="chat_history"),
("user","{input}"),
])

# Document Chain

now have a retriever_chain that retrieves the relevant data from vector store, and document_chain that sends the chat_history, relevant data and user input to the llm.

In [163]:
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
document_chain=create_stuff_documents_chain(chatModel,prompt_get_answer)

# Conversational Retrieval Chain

So, in the final step, we combine retriever_chain and document_chain using create_retrieval_chain to create a Conversational retrieval chain

In [164]:
from langchain_classic.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever_chain, document_chain)

# Invoking the Chain

In [None]:
from langchain_core.messages import HumanMessage, AIMessage

In [117]:
chat_history = [  
    ("system", "You are a helpful and knowledgeable medical assistant chatbot. You provide reliable information related to health and medical conditions.")
]

In [183]:
user_input  = 'what was my second question i asked?'

In [184]:

response = retrieval_chain.invoke({
"chat_history":chat_history,
"input":user_input
})

if response:
    user_message = HumanMessage(content=user_input)
    assistant_message = AIMessage(content=response['answer'])
    chat_history.append(user_message)
    chat_history.append(assistant_message)
    print (response['answer'])

Your second question was, "what is acne?"


In [185]:
print(response['input'])
print(response['answer'])

what was my second question i asked?
Your second question was, "what is acne?"


In [186]:
chat_history

[HumanMessage(content='what are you?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='I am a medical assistant AI designed to help answer health-related questions using available information.', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what is acne?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Acne is a common skin disease characterized by pimples on the face, chest, and back, occurring when the pores of the skin become clogged with oil, dead skin cells, and bacteria. It is also known as acne vulgaris and is the most common skin disease, affecting nearly 17 million people in the United States.', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='does acupressure and malignant are same?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='No, acupressure and malignant are not the same. Acupressure is a form of touch therapy using finger pressure on specific body points, while "malignant" ref