In [1]:
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain_classic.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
my_api = os.getenv("my_key")
Pinecone_API_KEY = os.getenv("Pinecone_API_KEY")

In [3]:
pc = Pinecone(api_key=Pinecone_API_KEY)
index = pc.Index("embedding")

In [4]:
# Now Extract data from PDF:
def load_data(data_path):
    loader = DirectoryLoader(data_path, 
                            glob='*.pdf',
                            loader_cls=PyPDFLoader
                           )
    
    documents = loader.load()
    return documents

In [5]:
data = load_data(r"C:\Data Sceience\PROJECTS\Medical-ChatBot-using-langchain\data")

In [6]:
len(data)

637

In [7]:
def text_spliter(text):
    text_split = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )
    text_chunks = text_split.split_documents(text)
    return text_chunks

In [8]:
text_chunks = text_spliter(data) 

In [9]:
len(text_chunks)

3426

In [10]:
print(f"page_content: { text_chunks[0].page_content}")
print(f"metadata: {text_chunks[0].metadata}")

page_content: The GALE
ENCYCLOPEDIA
of MEDICINE
SECOND EDITION
metadata: {'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'C:\\Data Sceience\\PROJECTS\\Medical-ChatBot-using-langchain\\data\\Medical_book.pdf', 'total_pages': 637, 'page': 1, 'page_label': '2', 'start_index': 0}


In [11]:
# Now we will create embeddings for our pinecone index
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004",api_key=my_api)

In [12]:
chunks = []
for i,doc in enumerate(text_chunks):
    chunk_data = {
        "index":i,
        "text":doc.page_content
    }
    chunks.append(chunk_data)

In [13]:
len(chunks)

3426

In [14]:
# 1. Prepare a clean list of just the text strings
# (The model cannot read the dictionary {"id": 1, "text": ...}, it needs strings)
text_list = [chunk["text"] for chunk in chunks]

# 2. Embed them all at once (Batching)
# This counts as 1 request instead of 131
vectors = embeddings.embed_documents(text_list)

# 3. Pair them back up with their IDs for Pinecone
data_to_upload = []
for i, vector in enumerate(vectors):
    data_to_upload.append({
        "id": str(chunks[i]["index"]), 
        "values": vector, 
        "metadata": {"text": chunks[i]["text"]}
    })

# 4. Upload to Pinecone
# index.upsert(vectors=data_to_upload)

# print("‚úÖ Success! Uploaded without hitting limits.")

In [15]:
import time

# 4. Upload to Pinecone in batches
batch_size = 100 
total_chunks = len(data_to_upload)

print(f"üöÄ Starting upload of {total_chunks} chunks to Pinecone...")

for i in range(0, total_chunks, batch_size):
    # Slice the list to get a batch of 100
    batch = data_to_upload[i : i + batch_size]
    
    try:
        # Upsert the batch
        index.upsert(vectors=batch)
        print(f"‚úÖ Uploaded: {min(i + batch_size, total_chunks)} / {total_chunks}")
        
    except Exception as e:
        print(f"‚ùå Error at batch {i}: {e}")
        # Optional: brief pause before retrying
        time.sleep(1) 
        index.upsert(vectors=batch)

print("üéâ Success! All 3,426 chunks are now indexed.")

üöÄ Starting upload of 3426 chunks to Pinecone...
‚úÖ Uploaded: 100 / 3426
‚úÖ Uploaded: 200 / 3426
‚úÖ Uploaded: 300 / 3426
‚úÖ Uploaded: 400 / 3426
‚úÖ Uploaded: 500 / 3426
‚úÖ Uploaded: 600 / 3426
‚úÖ Uploaded: 700 / 3426
‚úÖ Uploaded: 800 / 3426
‚úÖ Uploaded: 900 / 3426
‚úÖ Uploaded: 1000 / 3426
‚úÖ Uploaded: 1100 / 3426
‚úÖ Uploaded: 1200 / 3426
‚úÖ Uploaded: 1300 / 3426
‚úÖ Uploaded: 1400 / 3426
‚úÖ Uploaded: 1500 / 3426
‚úÖ Uploaded: 1600 / 3426
‚úÖ Uploaded: 1700 / 3426
‚úÖ Uploaded: 1800 / 3426
‚úÖ Uploaded: 1900 / 3426
‚úÖ Uploaded: 2000 / 3426
‚úÖ Uploaded: 2100 / 3426
‚úÖ Uploaded: 2200 / 3426
‚úÖ Uploaded: 2300 / 3426
‚úÖ Uploaded: 2400 / 3426
‚úÖ Uploaded: 2500 / 3426
‚úÖ Uploaded: 2600 / 3426
‚úÖ Uploaded: 2700 / 3426
‚úÖ Uploaded: 2800 / 3426
‚úÖ Uploaded: 2900 / 3426
‚úÖ Uploaded: 3000 / 3426
‚úÖ Uploaded: 3100 / 3426
‚úÖ Uploaded: 3200 / 3426
‚úÖ Uploaded: 3300 / 3426
‚úÖ Uploaded: 3400 / 3426
‚úÖ Uploaded: 3426 / 3426
üéâ Success! All 3,426 chunks are now indexed.


In [16]:
vector_store = PineconeVectorStore(embedding=embeddings, index=index)

In [21]:
query = "what are allergies?"


In [22]:
ans = vector_store.similarity_search(query)

In [24]:
print(ans[0].page_content)

Description
Allergies are among the most common of medical
disorders. It is estimated that 60 million Americans, or
more than one in every five people, suffer from some
form of allergy, with similar proportions throughout
much of the rest of the world. Allergy is the single largest
reason for school absence and is a major source of lost
productivity in the workplace.
An allergy is a type of immune reaction. Normally,
the immune system responds to foreign microorganisms
or particles by producing specific proteins called anti-
bodies. These antibodies are capable of binding to iden-
tifying molecules, or antigens, on the foreign particle.
This reaction between antibody and antigen sets off a
series of chemical reactions designed to protect the
body from infection. Sometimes, this same series of
reactions is triggered by harmless, everyday substances
such as pollen, dust, and animal danders. When this
occurs, an allergy develops against the offending sub-
stance (an allergen.)


In [None]:
prompt_template = '''
You are a helpfull medical assistant, that know all about medical and health.
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Keep the answer as concise as possible and only provide to the point
answer donot explanation of the answer (allways provide a short and meaning full answer).

Context:{context}

Question: {input}  

Only return the helpfull answer nothing else.
Helpful Answer:
'''

In [37]:
prompt = PromptTemplate(template=prompt_template,input_variables=["context","question"])

# Now Create a Retriever Chain :


In [38]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

In [39]:
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k":3}    # Fetch the top 3 most relevant chunks
)

In [40]:
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash"
                            ,google_api_key = my_api,
                            temperature=0.5)

In [41]:
docs_chain = create_stuff_documents_chain(llm,prompt)

retriever_chain = create_retrieval_chain(retriever,docs_chain)

In [42]:
query = "what are allergies?"

result = retriever_chain.invoke({"input": query})

In [43]:
print(result["answer"])

Helpful Answer: An allergy is a type of immune reaction triggered by harmless, everyday substances.


In [45]:
result 

{'input': 'what are allergies?',
 'context': [Document(id='658', metadata={}, page_content='Description\nAllergies are among the most common of medical\ndisorders. It is estimated that 60 million Americans, or\nmore than one in every five people, suffer from some\nform of allergy, with similar proportions throughout\nmuch of the rest of the world. Allergy is the single largest\nreason for school absence and is a major source of lost\nproductivity in the workplace.\nAn allergy is a type of immune reaction. Normally,\nthe immune system responds to foreign microorganisms\nor particles by producing specific proteins called anti-\nbodies. These antibodies are capable of binding to iden-\ntifying molecules, or antigens, on the foreign particle.\nThis reaction between antibody and antigen sets off a\nseries of chemical reactions designed to protect the\nbody from infection. Sometimes, this same series of\nreactions is triggered by harmless, everyday substances\nsuch as pollen, dust, and anima