In [2]:
# Install dependencies
!pip install langchain langchain-community chromadb sentence-transformers google-generativeai pypdf gTTS SpeechRecognition pydub

# Imports
import google.generativeai as genai
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader
from gtts import gTTS
from IPython.display import Audio, display
import speech_recognition as sr
from google.colab import files

# 🔹 Setup Gemini API (replace with your API key)
genai.configure(api_key="AIzaSyCUKNZaIJ4xTwHm-hq66qm7jYg3xfC2Y18")

# === Step 1: Upload a PDF Knowledge Base ===
uploaded = files.upload()
pdf_path = list(uploaded.keys())[0]
print("📄 PDF uploaded:", pdf_path)

loader = PyPDFLoader(pdf_path)
docs = loader.load()

# === Step 2: Split PDF into Chunks ===
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
splits = splitter.split_documents(docs)

# === Step 3: Build Vector Database ===
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma.from_documents(splits, embedding=embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# === Step 4: Chat Memory ===
chat_history = []

# === Step 5: Gemini Answer Generator ===
def gemini_generate(prompt):
    response = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
    return response.text

# === Step 6: Full Chatbot Flow with Citations ===
def chatbot(query, use_speech=False):
    global chat_history

    # 1. Input Handling
    if use_speech:
        recognizer = sr.Recognizer()
        with sr.Microphone() as source:
            print("🎤 Speak now...")
            audio = recognizer.listen(source)
        try:
            query = recognizer.recognize_google(audio)
            print("🗣️ You said:", query)
        except:
            return "⚠️ Could not recognize speech"

    # 2. Maintain Chat Memory
    chat_history.append({"role": "user", "content": query})

    # 3. Retrieval-Augmented Generation
    retrieved_docs = retriever.get_relevant_documents(query)
    context = "\n".join([f"[Source {i+1}] {d.page_content}" for i, d in enumerate(retrieved_docs)])

    # 4. Construct Prompt with Memory
    history_text = "\n".join([f"{h['role']}: {h['content']}" for h in chat_history[-5:]])  # last 5 turns
    final_prompt = f"""
You are a helpful assistant. Use the context from the PDF and maintain conversation history.
Always include citations in the format [Source X].

Conversation so far:
{history_text}

Context:
{context}

User Question: {query}

Answer clearly and include citations at the end.
"""
    full_answer = gemini_generate(final_prompt)

    # 5. Separate Answer and Citations
    if "[" in full_answer:
        parts = full_answer.split("[")
        main_answer = parts[0].strip()
        citations = ["[" + p for p in parts[1:]]
    else:
        main_answer = full_answer.strip()
        citations = []

    # 6. Store Assistant Reply in Chat Memory
    chat_history.append({"role": "assistant", "content": main_answer})

    # 7. Display Answer & Citations
    print("🤖 Bot:", main_answer)
    if citations:
        print("📚 Citations:")
        for c in citations:
            print("-", c)

    # 8. TTS Output (Answer only)
    tts = gTTS(text=main_answer, lang="en")
    tts.save("output.mp3")
    display(Audio("output.mp3", autoplay=True))

    return main_answer, citations

# === Example Run (Text Input) ===
chatbot("Summarize the main topic of this PDF.")

# === Example Run (Follow-up with Chat History) ===
# chatbot("Explain it in simple terms.")

# === Example Run (Speech Input) ===
# chatbot("", use_speech=True)



Saving YONDELIS.pdf to YONDELIS.pdf
📄 PDF uploaded: YONDELIS.pdf


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  retrieved_docs = retriever.get_relevant_documents(query)


🤖 Bot: This PDF primarily discusses YONDELIS (trabectedin), an alkylating drug. It details its indications for treating unresectable or metastatic liposarcoma or leiomyosarcoma, its dosage and administration, and its chemical description
📚 Citations:
- [Source 2, Source 3]. The document also covers various clinical and nonclinical aspects such as its use in specific populations (e.g., pregnancy, pediatric, geriatric), pharmacology, toxicology, and storage 
- [Source 1].


('This PDF primarily discusses YONDELIS (trabectedin), an alkylating drug. It details its indications for treating unresectable or metastatic liposarcoma or leiomyosarcoma, its dosage and administration, and its chemical description',
 ['[Source 2, Source 3]. The document also covers various clinical and nonclinical aspects such as its use in specific populations (e.g., pregnancy, pediatric, geriatric), pharmacology, toxicology, and storage ',
  '[Source 1].'])

In [4]:
def chatbot(query, use_speech=False):
    global chat_history

    # 1. Speech Input Handling
    if use_speech:
        recognizer = sr.Recognizer()
        with sr.Microphone() as source:
            print("🎤 Speak now...")
            audio = recognizer.listen(source)
        try:
            query = recognizer.recognize_google(audio)
            print("🗣️ You said:", query)
        except:
            return "⚠️ Could not recognize speech"

    # 2. Maintain Chat Memory
    chat_history.append({"role": "user", "content": query})

    # 3. Retrieval
    retrieved_docs = retriever.get_relevant_documents(query)
    context = "\n".join([f"[Source {i+1}] {d.page_content}" for i, d in enumerate(retrieved_docs)])

    # 4. JSON-enforced Prompt
    final_prompt = f"""
You are a helpful assistant. Use the context below to answer the question.
Always return output in **strict JSON** with fields "answer" and "citations".

Context:
{context}

User Question: {query}

Return format:
{{
  "answer": "... main response ...",
  "citations": ["[Source 1]", "[Source 2]"]
}}
"""
    raw_response = gemini_generate(final_prompt)

    # 5. Parse JSON safely
    import json, re
    try:
        cleaned = re.search(r"\{.*\}", raw_response, re.S).group()
        parsed = json.loads(cleaned)
        main_answer = parsed["answer"]
        citations = parsed["citations"]
    except:
        main_answer = raw_response.strip()
        citations = []

    # 6. Store Assistant Reply
    chat_history.append({"role": "assistant", "content": main_answer})

    # 7. Display Answer + Citations
    print("🤖 Bot:", main_answer)
    if citations:
        print("📚 Citations:", citations)

    # 8. TTS Output
    tts = gTTS(text=main_answer, lang="en")
    tts.save("output.mp3")
    display(Audio("output.mp3", autoplay=True))

    return main_answer, citations

In [5]:
chatbot("What does this PDF talk about?")

📚 Citations: ['[Source 1]', '[Source 2]', '[Source 3]']


 ['[Source 1]', '[Source 2]', '[Source 3]'])

In [6]:


# Imports
import google.generativeai as genai
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from gtts import gTTS
from IPython.display import Audio, display
from google.colab import files
import json, re

# 🔹 Setup Gemini API (replace with your API key)


# === Step 1: Upload PDF (Drug Label, PI, etc.) ===
uploaded = files.upload()
pdf_path = list(uploaded.keys())[0]
print("📄 PDF uploaded:", pdf_path)

loader = PyPDFLoader(pdf_path)
docs = loader.load()

# === Step 2: Split & Embed ===
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
splits = splitter.split_documents(docs)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma.from_documents(splits, embedding=embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 4})

# === Step 3: Memory ===
chat_history = []

# === Step 4: Gemini Wrapper ===
def gemini_generate(prompt):
    response = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
    return response.text

# === Step 5: Caregiver Drug Chatbot ===
def drug_chatbot(query):
    global chat_history

    # Add query to memory
    chat_history.append({"role": "user", "content": query})

    # Retrieve docs
    retrieved_docs = retriever.get_relevant_documents(query)
    context = "\n".join([
        f"[Source {i+1}, Page {d.metadata.get('page', 'N/A')}] {d.page_content}"
        for i, d in enumerate(retrieved_docs)
    ])

    # Construct prompt (force JSON answer+citation)
    final_prompt = f"""
You are a drug information assistant for caregivers.
Answer using the prescribing information (dosage, administration, contraindications, interactions, warnings).
Keep answers clear, safe, and user-friendly.

Always return JSON with fields "answer" and "citations".

Conversation so far:
{chat_history}

Context:
{context}

User Question: {query}

Return format strictly:
{{
  "answer": "main response in plain English for caregivers",
  "citations": ["[Source X, Page N]"]
}}
"""
    raw_response = gemini_generate(final_prompt)

    # Parse JSON safely
    try:
        cleaned = re.search(r"\{.*\}", raw_response, re.S).group()
        parsed = json.loads(cleaned)
        answer = parsed["answer"]
        citations = parsed["citations"]
    except:
        answer = raw_response.strip()
        citations = []

    # Save bot reply to memory
    chat_history.append({"role": "assistant", "content": answer})

    # Print
    print("🤖 Bot:", answer)
    if citations:
        print("📚 Citations:", citations)

    # TTS
    tts = gTTS(text=answer, lang="en")
    tts.save("drug_answer.mp3")
    display(Audio("drug_answer.mp3", autoplay=True))

    return answer, citations

Saving YONDELIS.pdf to YONDELIS (1).pdf
📄 PDF uploaded: YONDELIS (1).pdf


In [7]:
# Q1: Dosage
drug_chatbot("What is the recommended dosage for Rinvoq?")

# Q2: Precautions
drug_chatbot("What should pregnant patients know about this drug?")

# Q3: Interactions
drug_chatbot("Does it interact with other common medications?")

🤖 Bot: I am sorry, but the provided prescribing information does not contain details for Rinvoq. The document discusses a medication called YONDELIS®.


🤖 Bot: Based on how YONDELIS works, it can cause harm to an unborn baby if taken during pregnancy. There is no information available from human studies regarding the use of YONDELIS in pregnant women. However, animal studies have shown that the drug can cross the placenta (meaning it can reach the fetus) in pregnant rats.

It is crucial that if a patient is pregnant, or suspects they may be pregnant, while taking YONDELIS, they should contact their healthcare provider right away.

To prevent potential harm to an unborn baby, females who can become pregnant should use effective birth control during treatment with YONDELIS and for at least 2 months after the last dose. Males with female partners who can become pregnant should use effective birth control during treatment and for at least 5 months after the last dose of YONDELIS.
📚 Citations: ['[Source 1, Page 10]', '[Source 2, Page 10]', '[Source 3, Page 6]', '[Source 4, Page 17]']


🤖 Bot: Yes, YONDELIS can interact with other medications. It is very important to avoid taking YONDELIS with certain types of drugs that can affect how your body processes it. These are called "strong CYP3A inhibitors" and "strong CYP3A inducers." 

Taking YONDELIS with **strong CYP3A inhibitors** (like certain antifungal medications such as ketoconazole, itraconazole, posaconazole, and voriconazole; certain antibiotics like clarithromycin and telithromycin; and some HIV medications like indinavir, lopinavir, and ritonavir) can increase the amount of YONDELIS in the body. This could lead to more side effects.

Conversely, taking YONDELIS with **strong CYP3A inducers** (like rifampin) can decrease the amount of YONDELIS in the body, which might make it less effective.

Always tell your healthcare provider about all medications, supplements, and herbal products your loved one is taking to ensure there are no harmful interactions.
📚 Citations: ['[Source 1, Page 0]', '[Source 2, Page 14]',

('Yes, YONDELIS can interact with other medications. It is very important to avoid taking YONDELIS with certain types of drugs that can affect how your body processes it. These are called "strong CYP3A inhibitors" and "strong CYP3A inducers." \n\nTaking YONDELIS with **strong CYP3A inhibitors** (like certain antifungal medications such as ketoconazole, itraconazole, posaconazole, and voriconazole; certain antibiotics like clarithromycin and telithromycin; and some HIV medications like indinavir, lopinavir, and ritonavir) can increase the amount of YONDELIS in the body. This could lead to more side effects.\n\nConversely, taking YONDELIS with **strong CYP3A inducers** (like rifampin) can decrease the amount of YONDELIS in the body, which might make it less effective.\n\nAlways tell your healthcare provider about all medications, supplements, and herbal products your loved one is taking to ensure there are no harmful interactions.',
 ['[Source 1, Page 0]', '[Source 2, Page 14]', '[Source