In [None]:
!pip install -q chromadb sentence-transformers spacy pandas gradio
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import os
import re
import pandas as pd
import spacy

from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.utils import embedding_functions




In [None]:
# Name of the spaCy pipeline downloaded in the install cell.
SPACY_MODEL_NAME = "en_core_web_sm"

# Shared spaCy pipeline; `has_person_name` runs it to look for PERSON entities.
nlp = spacy.load(SPACY_MODEL_NAME)
print("spaCy model loaded")


spaCy model loaded


In [None]:
# Seed FAQ knowledge base written as (question, answer) pairs; expanded below
# into the list-of-dicts layout stored in `data`.
_faq_pairs = (
    ("What are Cardiology OPD timings?", "Cardiology OPD runs from 9 AM to 5 PM, Monday to Saturday."),
    ("How can I book an appointment?", "Appointments can be booked online via hospital website or at the reception desk."),
    ("What are ICU visiting hours?", "ICU visiting hours are from 4 PM to 5 PM. Only two visitors are allowed."),
    ("Is emergency service available 24/7?", "Yes, emergency services are available 24/7 at the hospital."),
    ("What documents are needed for admission?", "You need a valid ID proof and previous medical records for admission."),
)

data = [{"question": question, "answer": answer} for question, answer in _faq_pairs]

# One row per FAQ entry, with "question" and "answer" columns.
df = pd.DataFrame(data)
df


Unnamed: 0,question,answer
0,What are Cardiology OPD timings?,"Cardiology OPD runs from 9 AM to 5 PM, Monday ..."
1,How can I book an appointment?,Appointments can be booked online via hospital...
2,What are ICU visiting hours?,ICU visiting hours are from 4 PM to 5 PM. Only...
3,Is emergency service available 24/7?,"Yes, emergency services are available 24/7 at ..."
4,What documents are needed for admission?,You need a valid ID proof and previous medical...


In [None]:
# Split the FAQ frame into the three parallel lists that are later passed to
# collection.add(): document texts, per-document metadata, and string ids.
answer_column = df["answer"].astype(str)
question_column = df["question"].astype(str)

# The answers are the documents that get embedded and returned to the user.
texts = answer_column.tolist()

# The original question is kept alongside each answer as metadata.
metadatas = [{"question": question} for question in question_column.tolist()]

# Positional ids: doc_0, doc_1, ...
ids = ["doc_{}".format(position) for position, _ in enumerate(texts)]

print("Documents prepared:", len(texts))


Documents prepared: 5


In [None]:
# Sentence-transformers checkpoint used for both the FAQ answers and user queries.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Shared embedding model; `retrieve_answer` calls model.encode() on each query.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Embedding model loaded")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model loaded


In [None]:
# Encode every FAQ answer, then convert the encoder output to plain nested
# Python lists, which is the form handed to collection.add() later on.
encoded_answers = model.encode(texts, show_progress_bar=True)
embeddings = encoded_answers.tolist()
print("Embeddings created")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings created


In [None]:
from chromadb import Client

# Chroma client created with its default settings.
client = Client()
collection_name = "hospital_faqs"

# Drop any collection left over from a previous run so that re-running this
# cell starts from an empty collection instead of failing on create.
try:
    client.delete_collection(collection_name)
except Exception:
    # Best-effort cleanup: on a fresh client there is nothing to delete and the
    # call raises. Only ordinary exceptions are ignored here; KeyboardInterrupt
    # and SystemExit still propagate (the previous bare `except:` hid them too).
    pass

# Create the collection WITHOUT an embedding_function: vectors are supplied
# explicitly below, and queries pass their own embeddings as well.
collection = client.create_collection(name=collection_name)

# Add documents with precomputed embeddings; ids, texts, metadatas and
# embeddings are parallel lists (same order, one entry per FAQ answer).
collection.add(
    ids=ids,
    documents=texts,
    metadatas=metadatas,
    embeddings=embeddings
)

print("Chroma collection created with", len(ids), "documents")


Chroma collection created with 5 documents


In [None]:
# Compiled regexes for the PII shapes that get masked in user queries, in order:
# a standalone 10-digit number (phone), a standalone 12-digit number
# (Aadhaar-like), and an e-mail address.
PII_PATTERNS = [
    re.compile(raw_pattern)
    for raw_pattern in (
        r"\b\d{10}\b",
        r"\b\d{12}\b",
        r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
    )
]

# Reply returned instead of an answer when a query mentions a person.
REFUSAL_MESSAGE = "Sorry, I cannot share personal or clinical information due to privacy rules."


def redact_pii(text):
    """Return `text` with every match of each PII pattern replaced by "[REDACTED]".

    The patterns in PII_PATTERNS are applied one after another, each on the
    result of the previous substitution.
    """
    redacted = text
    for pattern in PII_PATTERNS:
        redacted = pattern.sub("[REDACTED]", redacted)
    return redacted


def has_person_name(text):
    """Return True if the spaCy pipeline `nlp` tags any entity in `text` as PERSON.

    Detection relies entirely on the loaded model's entity recognizer: a name
    the model does not label as PERSON is not detected.
    """
    return any(entity.label_ == "PERSON" for entity in nlp(text).ents)


In [None]:
def retrieve_answer(user_query, k=2):
    """Answer a hospital FAQ query from the Chroma collection, with privacy guards.

    Steps:
      1. Blank, whitespace-only, or non-string input is rejected up front with
         the "not found" message instead of being embedded and matched.
      2. PII patterns in the query are masked via `redact_pii`.
      3. If `has_person_name` finds a PERSON entity in the masked query,
         REFUSAL_MESSAGE is returned and no lookup is performed.
      4. Otherwise the masked query is embedded with `model` and the collection
         is queried for the `k` nearest documents.

    Args:
        user_query: The user's question as a string.
        k: Number of nearest documents to request from the collection. Only the
            first returned document is used as the answer.

    Returns:
        str: The top-ranked stored answer, REFUSAL_MESSAGE when a person name
        is detected, or a "not found" message when there is nothing to answer.
    """
    not_found_message = "Sorry, I could not find relevant hospital information."

    # An empty submission carries no question; without this guard it would be
    # embedded anyway and answered with whichever FAQ happens to be nearest
    # (and None would crash inside the regex substitution).
    if not isinstance(user_query, str) or not user_query.strip():
        return not_found_message

    cleaned = redact_pii(user_query)

    if has_person_name(cleaned):
        return REFUSAL_MESSAGE

    # A one-element batch: the collection is queried with a list of embeddings.
    query_embedding = model.encode([cleaned]).tolist()

    results = collection.query(
        query_embeddings=query_embedding,
        n_results=k
    )

    # Results are grouped per query embedding; only one was sent, so take group 0.
    # Fall back to an empty group if the documents field is missing or empty.
    documents = results["documents"] or [[]]
    docs = documents[0]

    if not docs:
        return not_found_message

    return docs[0]


In [None]:
# Smoke test: an FAQ question, a query that names a person, and a paraphrased
# FAQ question.
# NOTE(review): in the recorded output the second query gets an FAQ answer back
# rather than REFUSAL_MESSAGE, so the name was not flagged in that run.
demo_queries = (
    "What are Cardiology OPD timings?",
    "Tell me Rahul Rawat ward number",
    "Is emergency available?",
)

for demo_query in demo_queries:
    print(retrieve_answer(demo_query))


Cardiology OPD runs from 9 AM to 5 PM, Monday to Saturday.
You need a valid ID proof and previous medical records for admission.
Yes, emergency services are available 24/7 at the hospital.
