In [7]:

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [23]:
def load_pdf_file(data):
    """Load the PDF documents found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; only files
            matching the ``*.pdf`` glob are picked up.

    Returns:
        The value returned by ``DirectoryLoader.load()``, with each matched
        file read through ``PyPDFLoader``.
    """
    pdf_directory_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_directory_loader.load()

In [26]:
# Raw string literal: "\M" and "\D" are not valid escape sequences, so the
# plain literal makes Python emit an invalid-escape warning. The raw form
# produces exactly the same path text without the warning.
extracted_data = load_pdf_file(data=r'D:\Medical_ChatBot\Data')

  extracted_data=load_pdf_file(data='D:\Medical_ChatBot\Data')


In [27]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: Documents to split, e.g. the value returned by
            ``load_pdf_file``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value that used to be
            hard-coded here.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value that used to be
            hard-coded here.

    Returns:
        The result of ``split_documents`` applied to ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [28]:
# Chunk the loaded PDF documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 7024


In [29]:
from langchain.embeddings import HuggingFaceEmbeddings

In [30]:
def download_hugging_face_embeddings():
    """Create the HuggingFace embedding model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
    )


In [32]:

# Instantiate the embedding model once; the same object is passed to the
# Pinecone vector-store calls later in the notebook.
embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [33]:

# Sanity check: embed a short string and print the length of the returned
# vector (the recorded run printed 384).
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [65]:

from dotenv import load_dotenv
# Read variables from a .env file into the process environment.
# NOTE(review): presumably this is where the Pinecone credentials used by the
# vector-store cells come from — confirm.
load_dotenv()

True

In [55]:
# Name of the Pinecone index that the vector-store cells read and write.
index_name = "medicalbot"

In [56]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# NOTE(review): this cell writes to the index every time it runs. The ids
# shown in the retrieval output look like random UUIDs, so re-running it
# presumably inserts duplicate vectors — confirm, and skip this cell once the
# index is populated.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [57]:
from langchain_pinecone import PineconeVectorStore
# Connect to the already-populated Pinecone index; unlike from_documents,
# no documents are passed here, so nothing new is embedded or upserted.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [58]:

# Display the vector-store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x26a8711fec0>

In [59]:
# Similarity-search retriever over the vector store; "k": 3 is the
# per-query search setting passed through search_kwargs.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [60]:
# Smoke-test the retriever on its own with a sample question.
retrieved_docs = retriever.invoke("What is Acne?")

In [61]:

# Inspect the retrieved Document objects (metadata and page_content).
retrieved_docs

[Document(id='b63d3656-f3fb-4ccd-ac64-8c776d0253b6', metadata={'creationdate': '2004-12-18T17:16:32-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:35:04-06:00', 'page': 425.0, 'page_label': '426', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'D:\\Medical_ChatBot\\Data\\data.pdf', 'total_pages': 759.0}, page_content='Corticosteriod —A group of synthetic hormones\nthat are used to prevent or reduce inflammation.\nToxic effects may result from rapid withdrawal after\nprolonged use or from continued use of large doses.\nPatch test—A skin test that is done to identify aller-\ngens. A suspected substance is applied to the skin.\nAfter 24–48 hours, if the area is red and swollen,\nthe test is positive for that substance. If no reaction\noccurs, another substance is applied. This is con-'),
 Document(id='4e827735-6558-435a-bb27-1275d0e639be', metadata={'creationdate': '2004-12-18T17:16:32-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:35:04-06:00', 'page': 298.0, 'page_label

In [75]:
import os

# Never hard-code credentials in the notebook: anything saved here is
# readable by everyone with access to the file or its history. The key that
# used to be assigned in this cell must be treated as leaked and revoked.
# Supply GOOGLE_API_KEY through the environment instead, e.g. via the .env
# file read by the load_dotenv() cell earlier in this notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file (or export it "
        "before starting the kernel) and re-run the load_dotenv() cell."
    )

In [76]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model used to generate answers in the RAG chain below.
# NOTE(review): presumably authenticates via the GOOGLE_API_KEY environment
# variable — confirm.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")

In [77]:

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering model. Adjacent literals are
# concatenated into one string; {context} is the placeholder left in the
# template for the retrieved text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system turn carries the instructions, the
# human turn carries the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [78]:
# Build the answering chain from the chat model and the prompt template,
# then wrap it with the retriever. The resulting chain is invoked with an
# "input" key and its result is read through the "answer" key below.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [79]:
# Ask the RAG chain a question and print only the generated answer.
# NOTE(review): the recorded answer says the document has no information on
# this topic; possibly the top-3 retrieved chunks did not cover it — check
# what the retriever returns for this query.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

I am sorry, but this document does not contain information on acromegaly and gigantism.


In [80]:
# Ask the RAG chain a question and print only the generated answer.
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])

Statistics are used by doctors to predict disease outcomes and recovery likelihood.  Five-year survival rates are a common statistical measure in cancer.  These rates compare the survival of cancer patients to a similar, cancer-free population.


In [82]:
# Ask the RAG chain a question and print only the generated answer.
response = rag_chain.invoke({"input": "What is Diet?"})
print(response["answer"])

A diet is a pattern of eating habits, often modified for various reasons such as weight loss, disease management, or religious observance.  People may alter their diets to improve physical and mental health, or for ethical reasons.  Diets aim to provide necessary nutrients while avoiding deficiencies or excesses.


In [92]:
# Ask the RAG chain a question and print only the generated answer.
response = rag_chain.invoke({"input": "What is  Carcinoembryonic antigen test?"})
print(response["answer"])

The carcinoembryonic antigen (CEA) test is a blood test that measures the level of CEA, a protein sometimes produced by cancer cells.  While it's associated with colon cancer, it can also be produced by other cancers or not produced by some colon cancers, limiting its usefulness for screening.  It can be helpful in monitoring treatment for certain cancers.
