In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Run this in a Colab cell to install dependencies
!pip install openai faiss-cpu streamlit langchain pandas tqdm nltk


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m125.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, pydeck, streamlit
Successfully installed fai

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv(r"/content/drive/MyDrive/Personal OA/First500days/medquad.csv")
# Basic inspection
print(df.head())

                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  


In [9]:
#3. Preprocessing & Chunking

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from tqdm import tqdm

#checking columns: 'Question', 'Answer'
df = df[['question', 'answer']].dropna()

# Combining Question and Answer for context, text chunking
docs = []
for q, a in zip(df['question'], df['answer']):
    docs.append(f"Q: {q}\nA: {a}")

# splitting long contexts into smaller chunks for better retrieval
def chunk_text(text, chunk_size=256):
    sentences = nltk.sent_tokenize(text)
    chunks, chunk = [], ""
    for sent in sentences:
        if len(chunk) + len(sent) < chunk_size:
            chunk += " " + sent
        else:
            chunks.append(chunk.strip())
            chunk = sent
    if chunk: chunks.append(chunk.strip())
    return chunks

chunks = []
for doc in docs:
    chunks.extend(chunk_text(doc))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# 4: chunking
from nltk import sent_tokenize

def chunk_text(text, max_words=200):
    """Splitting text into chunks of approximately max_words (by word count)."""
    sents = sent_tokenize(text)
    chunks = []
    cur = ""
    cur_words = 0
    for s in sents:
        w = len(s.split())
        if cur_words + w <= max_words:
            cur = (cur + " " + s).strip()
            cur_words += w
        else:
            if cur:
                chunks.append(cur.strip())
            cur = s
            cur_words = w
    if cur:
        chunks.append(cur.strip())
    return chunks

# Create chunk rows
rows = []
# Combine 'question' and 'answer' into a new 'text' column
df['text'] = df['question'] + ' ' + df['answer']

for _, r in df.iterrows():
    chunks = chunk_text(r['text'], max_words=200)
    for i, c in enumerate(chunks):
        rows.append({
            'doc_id': _, # Using index as doc_id for simplicity
            'chunk_id': f"{_}_{i}",
            'question': r['question'],
            'text': c,
        })

chunk_df = pd.DataFrame(rows)
print("Total chunks:", len(chunk_df))
display(chunk_df.head(3))

Total chunks: 26865


Unnamed: 0,doc_id,chunk_id,question,text
0,0,0_0,What is (are) Glaucoma ?,What is (are) Glaucoma ? What is (are) Glaucom...
1,0,0_1,What is (are) Glaucoma ?,"In the normal eye, the clear fluid leaves the ..."
2,1,1_0,What causes Glaucoma ?,What causes Glaucoma ? What causes Glaucoma ? ...


In [13]:
#!pip install InstructorEmbedding

In [14]:
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm

# ✅ Use a small, fast model (good balance of speed & quality)
MODEL_NAME = "paraphrase-MiniLM-L3-v2"

# Load model (use GPU if available, else CPU)
embed_model = SentenceTransformer(MODEL_NAME, device="cuda" if torch.cuda.is_available() else "cpu")

# Your text chunks (replace with your own)
texts = chunk_df['text'].tolist()

print("Encoding embeddings... this will be much faster now 🚀")

# Batch encode (default batch_size=32, increase if you have GPU)
embeddings = embed_model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # makes cosine similarity easier
)

print("✅ Embeddings created. Shape:", embeddings.shape)



modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding embeddings... this will be much faster now 🚀


Batches:   0%|          | 0/420 [00:00<?, ?it/s]

✅ Embeddings created. Shape: (26865, 384)


In [19]:
#Retrieval helper : A function to search by query, return top-k chunks (text + score + source).

# Colab cell 6: retrieval function
import faiss
import math
import heapq
import openai
import json
import numpy as np # Import numpy

# loading index + metadata
# index = faiss.read_index("medquad_faiss.index")
# with open("medquad_meta.pkl","rb") as f: chunk_df = pickle.load(f)

# Ensure embeddings is a numpy array of float32
embeddings = np.array(embeddings).astype('float32')

# Get embedding dimension
d = embeddings.shape[1]

# Build a Faiss index (using IndexFlatL2 for Euclidean distance and IndexFlatIP for cosine similarity)
# Since we normalized embeddings, cosine similarity is equivalent to dot product (IndexFlatIP)
index = faiss.IndexFlatIP(d)

# Adding embeddings to the index
index.add(embeddings)
print("Index built. Total vectors:", index.ntotal)

def retrieve(query, top_k=5):
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    # q_emb = normalize(q_emb) # Removed the unnecessary normalize call
    D, I = index.search(q_emb, top_k)
    results = []
    for score, idx in zip(D[0], I[0]):
        if idx == -1:
            continue
        row = chunk_df.iloc[idx]
        results.append({
            'chunk_id': row['chunk_id'],
            'doc_id': row['doc_id'],
            'question': row['question'],
            'text': row['text'],
            'score': float(score)
        })
    return results

# quick check
print("Top 3 for 'early symptoms of diabetes':")
for r in retrieve("What are the early symptoms of diabetes?", top_k=3):
    print(r['score'], r['question'])
    print(r['text'][:300], "...\n")

Index built. Total vectors: 26865
Top 3 for 'early symptoms of diabetes':
3.4065585136413574 What are the symptoms of Diabetes ?
What are the symptoms of Diabetes ? What are the symptoms of Diabetes ? Many people with diabetes experience one or more symptoms, including extreme thirst or hunger, a frequent need to urinate and/or fatigue. Some lose weight without trying. Additional signs include sores that heal slowly, dry, itc ...

2.9763565063476562 What are the symptoms of Prevent diabetes problems: Keep your heart and blood vessels healthy ?
                
- chest pain or discomfort  - pain ...

2.951882839202881 What is (are) Diabetes ?
The good news is that if you have prediabetes, you can reduce your risk of getting type 2 diabetes. With modest weight loss and moderate physical activity, you can delay or prevent type 2 diabetes. Learn more about prediabetes here. Signs of Diabetes Many people with diabetes experience one or more  ...



In [21]:
!pip install Groq

Collecting Groq
  Downloading groq-0.31.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.31.1-py3-none-any.whl (134 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.9/134.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Groq
Successfully installed Groq-0.31.1


In [22]:
# Setting the Groq API key directly
import os
from groq import Groq

#Groq API key here
os.environ["GROQ_API_KEY"] = "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"

# Initialize client
client = Groq(api_key=os.environ["GROQ_API_KEY"])

print(" Groq API key set and client initialized.")


 Groq API key set and client initialized.


In [88]:
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer

# Load embeddings model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_text(text):
    return embed_model.encode(text).tolist()




In [89]:
def build_context_text(retrieved, max_chars_per_chunk=1000):
    pieces = []
    for i, r in enumerate(retrieved):
        txt = r['text'][:max_chars_per_chunk]  # truncate to avoid huge prompt
        pieces.append(
            f"[Source {i+1} | doc:{r['doc_id']} | score:{r['score']:.3f}]\nQ: {r['question']}\n{txt}"
        )
    context_text = "\n\n---\n\n".join(pieces)
    return context_text

def generate_answer_groq(user_question, retrieved, model="all-MiniLM-L6-v2", max_tokens=300, temperature=0.0):
    context_text = build_context_text(retrieved)
    system_msg = (
        "You are an assistant that answers medical questions using ONLY the provided context. "
        "DO NOT hallucinate or add any facts not present in the context. "
        "If the context does not have enough information, say you don't know and recommend consulting a qualified healthcare professional. "
        "Keep answers clear, concise, and cite the source indices like [Source 1], [Source 2]."
    )
    user_prompt = f"User question: {user_question}\n\nContext (use only this):\n{context_text}\n\nAnswer concisely. If the context isn't sufficient, say you don't have enough information and advise seeing a medical professional."

    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            n=1
        )
        ans = resp.choices[0].message.content.strip()
    except Exception as e:
        ans = f"[Groq API error] {e}"
    return ans

In [90]:
#Type a question : interactive loop
def chat_loop(top_k=4):
    print("RAG Medical Chatbot — type 'exit' to quit.")
    while True:
        q = input("\nYou: ").strip()
        if not q:
            continue
        if q.lower() in ('exit','quit'):
            print("Bye")
            break
        retrieved = retrieve(q, top_k=top_k)
        if len(retrieved) == 0:
            print("No relevant context found in KB.")
            continue
        answer = generate_answer_groq(q, retrieved)
        print("\nAssistant:", answer)
        print("\n--- Retrieved sources (top results) ---")
        for i, r in enumerate(retrieved, 1):
            print(f"[Source {i}] doc_id={r['doc_id']} score={r['score']:.3f}")
            print(r['text'][:400].replace("\n"," ") + ("..." if len(r['text'])>400 else ""))
            print("")

In [91]:
chat_loop(top_k=5)  # Run the chat loop

RAG Medical Chatbot — type 'exit' to quit.

You: what is sickle cell disease?

Assistant: [Groq API error] Error code: 404 - {'error': {'message': 'The model `all-minilm-l6-v2` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'code': 'model_not_found'}}

--- Retrieved sources (top results) ---
[Source 1] doc_id=8507 score=0.266
Your doctor also may set your LDL goal at this lower level if you have heart disease alone. After following the above steps, you should have an idea about your risk for heart disease and heart attack. The two main ways to lower your cholesterol (and, thus, your heart disease risk) include:                  Therapeutic Lifestyle Changes (TLC). TLC is a three-part program that includes a healthy die...

[Source 2] doc_id=16380 score=0.256
- rheumatoid arthritisa disease that causes pain, swelling, stiffness, and loss of function in the joints when the immune system attacks the membrane lining the joints. - psoriasisa skin disease 

In [112]:
print(df.columns)

Index(['question', 'answer', 'text'], dtype='object')


In [130]:
%%writefile app.py
# app.py
import os
import streamlit as st
import pandas as pd
import faiss
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer


# Disable Streamlit file watching in Colab
os.environ["STREAMLIT_SERVER_FILE_WATCHER_TYPE"] = "none"
os.environ["STREAMLIT_DISABLE_FILE_WATCHER"] = "true"
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"


# -------------------------
# Setup
# -------------------------
st.set_page_config(page_title="🩺 Medical RAG Chatbot")
st.title("🩺 Medical RAG Chatbot")
st.markdown("Type your medical question below and get an AI-powered answer.")

# Set your Groq/OpenAI key
os.environ["OPENAI_API_KEY"] = "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"
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Load embeddings model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# -------------------------
# Load dataset (MedQuad)
# -------------------------
@st.cache_resource
def load_data():
    df = pd.read_csv(r"/content/drive/MyDrive/Personal OA/First500days/medquad.csv")   # putting dataset link
    df.columns = ["question", "answer", "text", "score"]
    return df

df = load_data()

# -------------------------
# Build FAISS index
# -------------------------
@st.cache_resource
def build_faiss():
    embeddings = embedder.encode(df["question"].tolist(), convert_to_numpy=True)  # lowercase
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index, embeddings

index, embeddings = build_faiss()

# -------------------------
# Retrieval function
# -------------------------
def retrieve_context(query, top_k=3):
    query_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(query_emb, top_k)
    results = []
    for idx in I[0]:
        q = df.iloc[idx]["question"]   # lowercase
        a = df.iloc[idx]["answer"]     # lowercase
        results.append(f"Q: {q}\nA: {a}")
    return "\n\n".join(results)
# -------------------------
# RAG pipeline
# -------------------------
def rag_answer(query):
    context = retrieve_context(query)

    prompt = f"""
    You are a helpful medical assistant. Use the context below to answer the question.
    If the answer is not found, say "I don’t know, please consult a doctor."

    Context:
    {context}

    Question: {query}
    Answer:
    """

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",   # Groq supports same schema, swap model if needed
        messages=[{"role": "system", "content": "You are a medical assistant."},
                  {"role": "user", "content": prompt}],
        temperature=0.3,
    )

    return response.choices[0].message["content"]

# -------------------------
# Chat UI
# -------------------------

import os
os.environ["STREAMLIT_DISABLE_FILE_WATCHER"] = "true"

import streamlit as st

if "messages" not in st.session_state:
    st.session_state.messages = []


st.write("Type your medical question below:")

# Display chat history
for msg in st.session_state.messages:
    if msg["role"] == "user":
        st.markdown(f"**You:** {msg['content']}")
    else:
        st.markdown(f"**Assistant:** {msg['content']}")

# Chat input (textbox at bottom)
user_input = st.chat_input("Ask a medical question:")

if user_input:
    st.session_state.messages.append({"role": "user", "content": user_input})

    # Dummy response for now (replace with rag_answer)
    answer = f"Sample answer for: {user_input}"

    st.session_state.messages.append({"role": "assistant", "content": answer})

    st.experimental_rerun()


Overwriting app.py


In [132]:
!python app.py

2025-09-13 06:00:42.915554: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757743242.935790   27719 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757743242.941955   27719 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757743242.957320   27719 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1757743242.957346   27719 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1757743242.957350   27719 computation_placer.cc:177] computation placer alr

In [133]:
!npm install -g localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K
changed 22 packages in 687ms
[1G[0K⠴[1G[0K
[1G[0K⠴[1G[0K3 packages are looking for funding
[1G[0K⠴[1G[0K  run `npm fund` for details
[1G[0K⠴[1G[0K

In [134]:
!wget -q -O - ipv4.icanhazip.com

34.125.158.177


In [135]:
#!streamlit run app.py --server.port 8501 --server.headless true
!streamlit run app.py --server.port 8501 --server.headless true & npx localtunnel --port 8501



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.158.177:8501[0m
[0m
[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0Kyour url is: https://early-ghosts-build.loca.lt
2025-09-13 06:01:54.618 Failed to schedule watch observer for path /content
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/streamlit/watcher/event_based_path_watcher.py", line 186, in watch_path
    folder_handler.watch = self._observer.schedule(
                           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/watchdog/observers/api.py", line 312, in schedule
    emitter.start()
  File "/usr/local/lib/python3.12/dist-packages/watchdog/utils/__init_