In [None]:
# Cell 1: Upload Pharmacy PDF (or confirm existing) and list /content files
from google.colab import files
import glob, os, pprint

print("If you haven't uploaded yet, choose the PDF now.")
uploaded = files.upload()  # use dialog to upload Pharmacy Dictionary.pdf

print("\nFiles currently in /content (top-level):")
files_content = sorted(os.listdir("/content"))
pprint.pprint(files_content)

print("\nAny file matching pharmacy/pdf patterns:")
matches = glob.glob("/content/*pharm*.pdf") + glob.glob("/content/*Pharm*.pdf") + glob.glob("/content/*Pharmacy*.pdf")
pprint.pprint(matches)

# If you prefer Drive, show any matching files in Drive root too
drive_matches = glob.glob("/content/drive/MyDrive/*pharm*.pdf") + glob.glob("/content/drive/MyDrive/*Pharmacy*.pdf")
print("\nDrive (MyDrive) matches (if any):")
pprint.pprint(drive_matches)


If you haven't uploaded yet, choose the PDF now.


Saving Pharmacy Dictionary.pdf to Pharmacy Dictionary (2).pdf

Files currently in /content (top-level):
['.config',
 'Pharmacy Dictionary (1).pdf',
 'Pharmacy Dictionary (2).pdf',
 'Pharmacy Dictionary.pdf',
 'drive',
 'pharmacy_glossary.txt',
 'sample_data']

Any file matching pharmacy/pdf patterns:
['/content/Pharmacy Dictionary (2).pdf',
 '/content/Pharmacy Dictionary (1).pdf',
 '/content/Pharmacy Dictionary.pdf',
 '/content/Pharmacy Dictionary (2).pdf',
 '/content/Pharmacy Dictionary (1).pdf',
 '/content/Pharmacy Dictionary.pdf']

Drive (MyDrive) matches (if any):
['/content/drive/MyDrive/Pharmacy Dictionary.pdf']


In [None]:
# Fix: download required NLTK resources then continue chunking + embed + FAISS
import nltk, sys, os, glob
print("NLTK data path(s):", nltk.data.path)

# Try to download both punkt and punkt_tab (punkt_tab sometimes required)
for pkg in ("punkt", "punkt_tab"):
    try:
        print("Downloading", pkg, "...")
        nltk.download(pkg, quiet=False)
    except Exception as e:
        print("Could not download", pkg, ":", e)

# Verify tokenizer availability
from nltk.tokenize import sent_tokenize
try:
    _ = sent_tokenize("This is a test. This is only a test.")
    print("sent_tokenize is available.")
except Exception as e:
    print("sent_tokenize still failing:", e)
    raise

# If you ran the previous long cell that extracted glossary_text, we reuse it.
if 'glossary_text' not in globals():
    # try to load extracted text file if present
    if os.path.exists("/content/pharmacy_glossary.txt"):
        with open("/content/pharmacy_glossary.txt", "r", encoding="utf-8", errors="ignore") as f:
            glossary_text = f.read()
        print("Loaded /content/pharmacy_glossary.txt (chars):", len(glossary_text))
    else:
        raise FileNotFoundError("glossary_text not found in globals and /content/pharmacy_glossary.txt missing. Re-run PDF extraction cell first.")

# Re-run chunking + embedding + index creation (the same steps as before)
import re, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

def clean_text(s):
    s = s.replace('\r',' ').replace('\t',' ')
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
    s = re.sub(r'\b(\w+)(?:\s+\1){2,}\b', r'\1', s, flags=re.IGNORECASE)
    s = re.sub(r'\b(Para)(?:\s+\1){1,}', r'\1', s, flags=re.IGNORECASE)
    s = s.strip(' .,:;-/')
    return s.strip()

from nltk.tokenize import sent_tokenize

def chunk_text_into_passages(text, max_chars=700, overlap=120):
    sents = sent_tokenize(text)
    chunks, cur = [], ""
    for sent in sents:
        sent = clean_text(sent)
        if not sent:
            continue
        if len(cur) + len(sent) + 1 <= max_chars:
            cur = (cur + " " + sent).strip()
        else:
            if cur:
                chunks.append(cur)
            if len(sent) > max_chars:
                for i in range(0, len(sent), max_chars - overlap):
                    chunks.append(sent[i:i+max_chars].strip())
                cur = ""
            else:
                cur = sent
    if cur:
        chunks.append(cur)
    chunks = [re.sub(r'\s+',' ', c).strip() for c in chunks if len(c.strip())>40]
    return chunks

print("Chunking and cleaning text (again)...")
passages = chunk_text_into_passages(glossary_text, max_chars=700, overlap=120)
print("Created passages:", len(passages))

print("Encoding passages with all-MiniLM-L6-v2...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embs = embed_model.encode(passages, convert_to_numpy=True, show_progress_bar=True)

d = embs.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embs)
print("Built FAISS index with", index.ntotal, "vectors (dim:", d, ")")

# Save artifacts
out_dir = "/content/pharmabot_work"
os.makedirs(out_dir, exist_ok=True)
pd.DataFrame({"text": passages}).to_csv(os.path.join(out_dir, "cleaned_chunks.csv"), index=False)
np.save(os.path.join(out_dir, "cleaned_embs.npy"), embs)
faiss.write_index(index, os.path.join(out_dir, "faiss_cleaned.bin"))
print("Saved cleaned artifacts to:", out_dir)
print("Files:", os.listdir(out_dir))

# If Drive available, attempt to save there too
drive_dir = "/content/drive/MyDrive/pharmabot_cleaned"
try:
    os.makedirs(drive_dir, exist_ok=True)
    pd.DataFrame({"text": passages}).to_csv(os.path.join(drive_dir, "cleaned_chunks.csv"), index=False)
    np.save(os.path.join(drive_dir, "cleaned_embs.npy"), embs)
    faiss.write_index(index, os.path.join(drive_dir, "faiss_cleaned.bin"))
    print("Also saved artifacts to Drive:", drive_dir)
    print("Drive files:", os.listdir(drive_dir))
except Exception as e:
    print("Could not save to Drive (ok):", e)

# Print sample passages
print("\nSample passages (first 5):")
for i, p in enumerate(passages[:5]):
    print(f"\n--- passage #{i} ---\n{p[:800]}\n")

print("\nAll done ‚Äî paste the output here or say 'done ‚úÖ' if successful.")


NLTK data path(s): ['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
Downloading punkt ...
Downloading punkt_tab ...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


sent_tokenize is available.
Chunking and cleaning text (again)...
Created passages: 513
Encoding passages with all-MiniLM-L6-v2...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Built FAISS index with 513 vectors (dim: 384 )
Saved cleaned artifacts to: /content/pharmabot_work
Files: ['faiss_cleaned.bin', 'cleaned_embs.npy', 'cleaned_chunks.csv']
Also saved artifacts to Drive: /content/drive/MyDrive/pharmabot_cleaned
Drive files: ['cleaned_chunks.csv', 'cleaned_embs.npy', 'faiss_cleaned.bin']

Sample passages (first 5):

--- passage #0 ---
Glossary of Pharmaceutical Terms Update: 2016 Glossary of the WHO Collaborating Centre for Pharmaceutical Pricing and Reimbursement Policies Update: July 2016 Authors: Sabine Vogler Nina Zimmermann Supported by: Margit Gombocz In coordination with the Pharmaceutical Pricing an d Reimbursement Information (PPRI) network and the World Health O rganization (WHO) Project Assistant : Romana Landauer Vienna , July 2016 Owner and Edito r: Gesundheit sterreich GmbH, Stubenring 6, A-1010 Vienna , Phone +43 1 515 61, F ax 513 84 72, Homepage: www.goeg.at For our enviro nment : This report has been printed on paper produced without chlo

In [None]:
#Load, test, and chat
# Cell 3: Load cleaned artifacts, show samples, and open interactive chat loop
import os, pandas as pd, numpy as np, faiss
from sentence_transformers import SentenceTransformer
import textwrap

local_dir = "/content/pharmabot_work"
drive_dir = "/content/drive/MyDrive/pharmabot_cleaned"

print("Local dir files:", os.listdir(local_dir))
print("Drive dir files (if available):", os.listdir(drive_dir) if os.path.exists(drive_dir) else "Drive folder not found")

# Load cleaned chunks and index
df = pd.read_csv(os.path.join(local_dir, "cleaned_chunks.csv"))
clean_chunks = df['text'].astype(str).tolist()
print("Loaded cleaned chunks count:", len(clean_chunks))

clean_embs = np.load(os.path.join(local_dir, "cleaned_embs.npy"))
clean_index = faiss.read_index(os.path.join(local_dir, "faiss_cleaned.bin"))

print("Cleaned embeddings shape:", clean_embs.shape)
print("FAISS index vectors:", clean_index.ntotal)

# Show first 5 cleaned chunks (readable)
print("\n=== First 5 cleaned chunks ===")
for i, txt in enumerate(clean_chunks[:5]):
    print(f"\n--- chunk #{i} ---")
    print(textwrap.fill(txt, width=100))

# Load encoder for queries (same small model used during indexing)
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_clean(query, k=4):
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    D, I = clean_index.search(q_emb, k)
    results = []
    for dist, idx in zip(D[0], I[0]):
        results.append({"idx": int(idx), "score": float(dist), "text": clean_chunks[idx]})
    return results

def safe_answer_clean(query, k=4):
    med_kw = ["dose","dosage","prescribe","prescription","should i","how much",
              "pregnant","tablet","mg","ml","take"]
    if any(w in query.lower() for w in med_kw):
        return "‚ö†Ô∏è I cannot provide dosage, prescription, or diagnostic advice. Please consult a licensed pharmacist or doctor."
    hits = retrieve_clean(query, k=k)
    if not hits:
        return "No relevant information found in the knowledge base."
    out = "Here‚Äôs what I found:\n\n"
    for h in hits:
        snippet = h['text'].replace("\n"," ").strip()
        # tidy to sentence boundary if possible
        if len(snippet) > 350:
            cut = snippet.rfind('.', 0, 350)
            snippet = snippet[:cut+1] if cut>50 else snippet[:347]+"..."
        out += f"[source {h['idx']}] {snippet}\n\n"
    out += "Note: Informational only ‚Äî not medical advice."
    return out

# Quick sanity tests (printed)
print("\n=== Sanity checks ===")
tests = ["What is paracetamol?", "How should paracetamol be stored?", "What dose of paracetamol should I take?", "Define excipient."]
for t in tests:
    print("\nQ:", t)
    ans = safe_answer_clean(t)
    print("A sample:", ans[:400], ("\n(REFUSED)" if "cannot provide" in ans else ""))

# Interactive chat loop
print("\nüí¨ Pharmabot (clean KB) ‚Äî type 'exit' to stop the chat loop\n")
while True:
    try:
        q = input("You: ").strip()
    except EOFError:
        print("\nInput closed. Exiting.")
        break
    if not q:
        continue
    if q.lower() in ("exit","quit","bye"):
        print("Bot: Goodbye! üëã")
        break
    print("\nBot:", safe_answer_clean(q), "\n")


Local dir files: ['faiss_cleaned.bin', 'cleaned_embs.npy', 'cleaned_chunks.csv']
Drive dir files (if available): ['cleaned_chunks.csv', 'cleaned_embs.npy', 'faiss_cleaned.bin']
Loaded cleaned chunks count: 513
Cleaned embeddings shape: (513, 384)
FAISS index vectors: 513

=== First 5 cleaned chunks ===

--- chunk #0 ---
Glossary of Pharmaceutical Terms Update: 2016 Glossary of the WHO Collaborating Centre for
Pharmaceutical Pricing and Reimbursement Policies Update: July 2016 Authors: Sabine Vogler Nina
Zimmermann Supported by: Margit Gombocz In coordination with the Pharmaceutical Pricing an d
Reimbursement Information (PPRI) network and the World Health O rganization (WHO) Project Assistant
: Romana Landauer Vienna , July 2016 Owner and Edito r: Gesundheit sterreich GmbH, Stubenring 6,
A-1010 Vienna , Phone +43 1 515 61, F ax 513 84 72, Homepage: www.goeg.at For our enviro nment :
This report has been printed on paper produced without chlorine bleaching and optical brighteners

--- c

In [None]:
# === Replace retrieval with cross-encoder re-ranking and test ===
!pip install -q -U sentence-transformers

from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss, numpy as np, pandas as pd, os, re, textwrap

pdir = "/content/pharmabot_work"
chunks_csv = os.path.join(pdir, "cleaned_chunks.csv")
index_path = os.path.join(pdir, "faiss_cleaned.bin")

# load data/index
df = pd.read_csv(chunks_csv)
chunks = df['text'].astype(str).tolist()
index = faiss.read_index(index_path)
print("Loaded", len(chunks), "chunks and FAISS index with", index.ntotal, "vectors")

# load models
query_encoder = SentenceTransformer("all-MiniLM-L6-v2")   # fast encoder for FAISS search
cross_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"  # small, fast re-ranker
cross_encoder = CrossEncoder(cross_model_name)
print("Loaded query encoder and cross-encoder re-ranker.")

def improved_retrieve(query, top_k=5, fetch_k=80, boost_exact=True):
    # 1) FAISS fetch
    q_emb = query_encoder.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, min(fetch_k, index.ntotal))
    cand_idx = [int(i) for i in I[0] if i != -1]
    candidates = [chunks[i] for i in cand_idx]
    if len(candidates) == 0:
        return []
    # 2) cross-encoder scoring
    pairs = [[query, c] for c in candidates]
    scores = cross_encoder.predict(pairs)
    # 3) optional exact token boost
    if boost_exact:
        q_tokens = re.findall(r"[A-Za-z0-9\-\+]+", query.lower())
        boosts = []
        for c in candidates:
            lower = c.lower()
            boost = 0.0
            for t in q_tokens:
                if re.search(rf"\b{re.escape(t)}\b", lower):
                    boost += 0.25
            boosts.append(boost)
        scores = scores + np.array(boosts)
    # 4) rank and return
    ranked = np.argsort(-scores)[:top_k]
    results = []
    for r in ranked:
        results.append({"idx": cand_idx[r], "score": float(scores[r]), "text": candidates[r]})
    return results

# Updated safe answer to use improved_retrieve
def safe_answer_rerank(query, k=4):
    med_kw = ["dose","dosage","prescribe","prescription","should i","how much",
              "pregnant","tablet","mg","ml","take"]
    if any(w in query.lower() for w in med_kw):
        return "‚ö†Ô∏è I cannot provide dosage, prescription, or diagnostic advice. Please consult a licensed pharmacist or doctor."
    hits = improved_retrieve(query, top_k=k, fetch_k=120, boost_exact=True)
    if not hits:
        return "No relevant information found in the knowledge base."
    out = "Here‚Äôs what I found (re-ranked):\n\n"
    for h in hits:
        snippet = h['text'].replace("\n"," ").strip()
        # trim to sentence boundary
        if len(snippet) > 350:
            cut = snippet.rfind('.', 0, 350)
            snippet = snippet[:cut+1] if cut>40 else snippet[:347]+"..."
        out += f"[source {h['idx']}] {snippet}\n\n"
    out += "Note: Informational only ‚Äî not medical advice."
    return out

# Quick test queries
test_qs = ["What is paracetamol?", "How should paracetamol be stored?", "Define excipient."]
for q in test_qs:
    print("\n=== Query:", q)
    ans = safe_answer_rerank(q, k=3)
    print(textwrap.fill(ans, width=120))

print("\nRe-ranker ready. You can now call safe_answer_rerank(query) in the chat loop or Gradio UI.")


Loaded 513 chunks and FAISS index with 513 vectors


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Loaded query encoder and cross-encoder re-ranker.

=== Query: What is paracetamol?
Here‚Äôs what I found (re-ranked):  [source 327] The major companies are represented at EU level by the trade association
European Association of Euro-Pharmaceutical Companies (EAEPC ), but there are a relatively large number of others
holding licences about whom less i s known [Source: Europe Economics Safe Medicines Through P arallel Trade Contribution
to an Impact Assessment] Paramedicines ...  [source 380] the medicine is intended for  out-patients but its use may
produce very serious adverse reactions requiring a prescription drawn up as required by a specialist and special
supervision throughout the treatment  [source 147] See also:  compounding, magistral formula (extemporaneous pre
paration), officinal formula [Source: Brion F, Nunn AJ, Rieutord A. Extemporaneo us (magistral) preparation of oral
medicines for children in European Hospitals Acta Paediatr  Note: Informational only ‚Äî not medical

In [3]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss, numpy as np, pandas as pd, os, re

# Paths
pdir = "/content/pharmabot_work"
chunks_csv = os.path.join(pdir, "cleaned_chunks.csv")
index_path = os.path.join(pdir, "faiss_cleaned.bin")

# Load data
df = pd.read_csv(chunks_csv)
chunks = df['text'].astype(str).tolist()
index = faiss.read_index(index_path)

# Models
query_encoder = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Rerank-based retrieval
def improved_retrieve(query, top_k=5, fetch_k=80, boost_exact=True):
    q_emb = query_encoder.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, min(fetch_k, index.ntotal))
    cand_idx = [int(i) for i in I[0] if i != -1]
    candidates = [chunks[i] for i in cand_idx]
    if not candidates:
        return []
    pairs = [[query, c] for c in candidates]
    scores = cross_encoder.predict(pairs)
    if boost_exact:
        q_tokens = re.findall(r"[A-Za-z0-9\-]+", query.lower())
        for i, c in enumerate(candidates):
            if any(t in c.lower() for t in q_tokens):
                scores[i] += 0.3
    ranked = np.argsort(-scores)[:top_k]
    return [{"idx": cand_idx[i], "score": float(scores[i]), "text": candidates[i]} for i in ranked]

# Safe wrapper
def safe_answer_rerank(query, k=4):
    med_kw = ["dose","dosage","prescribe","prescription","should i","how much",
              "pregnant","tablet","mg","ml","take"]
    if any(w in query.lower() for w in med_kw):
        return "‚ö†Ô∏è I cannot provide dosage or prescription advice. Please consult a pharmacist or doctor."
    hits = improved_retrieve(query, top_k=k, fetch_k=120)
    if not hits:
        return "No relevant information found."
    out = "Here‚Äôs what I found:\n\n"
    for h in hits:
        snippet = h['text'].replace("\n"," ").strip()
        out += f"[source {h['idx']}] {snippet[:400]}...\n\n"
    out += "Note: Informational only ‚Äî not medical advice."
    return out


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
import gradio as gr

def gr_chat(user_msg):
    return safe_answer_rerank(user_msg, k=4)

iface = gr.Interface(
    fn=gr_chat,
    inputs="text",
    outputs="text",
    title="üíä Pharmabot (Re-ranked KB)",
    description="Ask any pharmacy-related question ‚Äî refuses dosage/prescription advice."
)
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5c2a4a03d47c09ec13.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [5]:
#Verify the index and chunks are loaded correctly
print("Chunks loaded:", len(chunks))
print("First chunk preview:\n", chunks[0][:300])


Chunks loaded: 513
First chunk preview:
 Glossary of Pharmaceutical Terms Update: 2016 Glossary of the WHO Collaborating Centre for Pharmaceutical Pricing and Reimbursement Policies Update: July 2016 Authors: Sabine Vogler Nina Zimmermann Supported by: Margit Gombocz In coordination with the Pharmaceutical Pricing an d Reimbursement Inform


In [7]:
#Add a fallback retrieval if no hits are found
def safe_answer_rerank(query, k=4):
    med_kw = ["dose","dosage","prescribe","prescription","should i","how much",
              "pregnant","tablet","mg","ml","take"]
    if any(w in query.lower() for w in med_kw):
        return "‚ö†Ô∏è I cannot provide dosage or prescription advice. Please consult a pharmacist or doctor."

    hits = improved_retrieve(query, top_k=k, fetch_k=150)
    if not hits or len(hits) == 0:
        # fallback partial match search
        matches = [c for c in chunks if "paracetamol" in c.lower() or "acetaminophen" in c.lower()]
        if matches:
            return "Fallback match:\n\n" + matches[0][:400] + "\n\nNote: Informational only ‚Äî not medical advice."
        return "No relevant information found in the knowledge base."

    out = "Here‚Äôs what I found:\n\n"
    for h in hits:
        snippet = h['text'].replace("\n"," ").strip()
        out += f"[source {h['idx']}] {snippet[:400]}...\n\n"
    out += "Note: Informational only ‚Äî not medical advice."
    return out


In [8]:
safe_answer_rerank("What is paracetamol?")


'Here‚Äôs what I found:\n\n[source 327] The major companies are represented at EU level by the trade association European Association of Euro-Pharmaceutical Companies (EAEPC ), but there are a relatively large number of others holding licences about whom less i s known [Source: Europe Economics Safe Medicines Through P arallel Trade Contribution to an Impact Assessment] Paramedicines medicines (Parapharmaceuticals) (Parapharmaceuticals)...\n\n[source 380] the medicine is intended for \x02 out-patients but its use may produce very serious adverse reactions requiring a prescription drawn up as required by a specialist and special supervision throughout the treatment...\n\n[source 147] See also: \x02 compounding, magistral formula (extemporaneous pre paration), officinal formula [Source: Brion F, Nunn AJ, Rieutord A. Extemporaneo us (magistral) preparation of oral medicines for children in European Hospitals Acta Paediatr...\n\n[source 272] [Source: PPRI Glossary] Medical Device Medical D

In [9]:
# Improved retrieval with synonym expansion + substring fallback + fuzzy boost
!pip install -q rapidfuzz

from rapidfuzz import fuzz, process
import re, numpy as np, textwrap

# 1) small, editable synonym map (add more mappings as you discover names)
SYNONYM_MAP = {
    "paracetamol": ["acetaminophen", "panadol", "crocin", "calpol", "acetaminophen/paracetamol"],
    # add more: "ibuprofen": ["advil","brufen"], ...
}

# helper: get synonyms for tokens appearing in the query
def expand_query_with_synonyms(query):
    qlower = query.lower()
    extra = []
    for canonical, aliases in SYNONYM_MAP.items():
        if canonical in qlower or any(a in qlower for a in aliases):
            # include both canonical and aliases
            extra.extend([canonical] + aliases)
        else:
            # if query contains a brand that maps to canonical, also include it
            for a in aliases:
                if a in qlower:
                    extra.extend([canonical] + aliases)
    # unique preserve order
    seen = set()
    extras = [x for x in extra if not (x in seen or seen.add(x))]
    return extras

# 2) improved retrieve wrapper that uses synonyms
def improved_retrieve_with_synonyms(query, top_k=5, fetch_k=120, boost_exact=True):
    # primary: re-ranker on original query
    hits = improved_retrieve(query, top_k=top_k, fetch_k=fetch_k, boost_exact=boost_exact)
    if hits and len(hits) > 0:
        # if top result looks noisy (long unrelated header), we still consider fallbacks below
        top_text = hits[0]['text'].lower()
        # quick heuristic: if top result length < 40 or doesn't contain key tokens, continue to fallback
        if len(top_text) >= 40 and any(tok in top_text for tok in re.findall(r"[A-Za-z0-9\-\+]+", query.lower())):
            return hits

    # 2a) synonym expansion
    synonyms = expand_query_with_synonyms(query)
    for syn in synonyms:
        q_syn = f"{query} {syn}"
        hits_syn = improved_retrieve(q_syn, top_k=top_k, fetch_k=fetch_k, boost_exact=boost_exact)
        if hits_syn:
            return hits_syn

    # 2b) substring fallback: prefer chunks that contain the canonical token or any synonyms
    tokens = re.findall(r"[A-Za-z0-9\-\+]+", query.lower())
    search_terms = tokens + synonyms
    # find chunks that contain any search term
    matches = []
    for i, c in enumerate(chunks):
        low = c.lower()
        score = 0
        for t in search_terms:
            if t and re.search(rf"\b{re.escape(t)}\b", low):
                score += 1.0
        if score>0:
            matches.append((i, score, c))
    # if matches found, rank them by score then fuzzy similarity
    if matches:
        ranked = sorted(matches, key=lambda x: (x[1], fuzz.partial_ratio(" ".join(tokens), x[2])), reverse=True)
        results = [{"idx": r[0], "score": float(r[1]), "text": r[2]} for r in ranked[:top_k]]
        return results

    # 2c) fuzzy-best fallback: find the single chunk with highest fuzzy ratio to query tokens
    best = process.extractOne(query, chunks, scorer=fuzz.partial_ratio)
    if best and best[1] >= 55:  # threshold, tune if needed
        idx = chunks.index(best[0])
        return [{"idx": idx, "score": float(best[1]), "text": best[0]}]

    # return original hits (could be empty)
    return hits

# 3) safe answer wrapper using improved retrieve with synonyms
def safe_answer_with_synonyms(query, k=4):
    med_kw = ["dose","dosage","prescribe","prescription","should i","how much",
              "pregnant","tablet","mg","ml","take"]
    if any(w in query.lower() for w in med_kw):
        return "‚ö†Ô∏è I cannot provide dosage, prescription, or diagnostic advice. Please consult a licensed pharmacist or doctor."

    hits = improved_retrieve_with_synonyms(query, top_k=k, fetch_k=150, boost_exact=True)
    if not hits:
        return "No relevant information found in the knowledge base."

    out = "Here‚Äôs what I found:\n\n"
    for h in hits:
        s = h['text'].replace("\n"," ").strip()
        # trim to sentence boundary for readability
        if len(s) > 350:
            cut = s.rfind('.', 0, 350)
            s = s[:cut+1] if cut>40 else s[:347] + "..."
        out += f"[source {h['idx']}] {s}\n\n"
    out += "Note: Informational only ‚Äî not medical advice."
    return out

# 4) Quick tests (run these)
tests = ["What is paracetamol?", "What is acetaminophen?", "What is Panadol?", "Define excipient", "How should paracetamol be stored?"]
for t in tests:
    print("\nQ:", t)
    print(textwrap.fill(safe_answer_with_synonyms(t, k=3), width=120))


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.2/3.2 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25h
Q: What is paracetamol?
Here‚Äôs what I found:  [source 327] The major companies are represented at EU level by the trade association European
Association of Euro-Pharmaceutical Companies (EAEPC ), but there are a relatively large number of others holding
licences about whom less i s known [Source: Europe Economics Safe Medicines Through P arallel Trade Contribution to an
Impact Assessment] Paramedicines ...  [source 380] the medicine is intended for  out-patients but its use may produce
very serious adverse reactions requiring a prescription drawn up as required by a specialist and special supervision
throughout the treatment  [source 147] See also:  compounding, magistral formula (extemporaneous pre paration),
officinal formula [Source: Brion F, Nunn AJ, Rieutord A. Extemporaneo us

In [10]:
# Cell: Launch Gradio UI wired to safe_answer_with_synonyms
!pip install -q gradio

import gradio as gr

def gr_chat(user_msg):
    return safe_answer_with_synonyms(user_msg, k=4)

iface = gr.Interface(
    fn=gr_chat,
    inputs="text",
    outputs="text",
    title="üíä Pharmabot (Final ‚Äî Synonyms + Rerank)",
    description="Pharmacy dictionary retrieval (refuses dosage/prescription)."
)
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://109ea5f79181296893.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [11]:
# Test the final function locally (no Gradio)
tests = [
    "What is paracetamol?",
    "What is acetaminophen?",
    "What is Panadol?",
    "How should paracetamol be stored?",
    "Define excipient."
]
for t in tests:
    print("\nQ:", t)
    print(safe_answer_with_synonyms(t, k=3))



Q: What is paracetamol?
Here‚Äôs what I found:

[source 327] The major companies are represented at EU level by the trade association European Association of Euro-Pharmaceutical Companies (EAEPC ), but there are a relatively large number of others holding licences about whom less i s known [Source: Europe Economics Safe Medicines Through P arallel Trade Contribution to an Impact Assessment] Paramedicines ...

[source 380] the medicine is intended for  out-patients but its use may produce very serious adverse reactions requiring a prescription drawn up as required by a specialist and special supervision throughout the treatment

[source 147] See also:  compounding, magistral formula (extemporaneous pre paration), officinal formula [Source: Brion F, Nunn AJ, Rieutord A. Extemporaneo us (magistral) preparation of oral medicines for children in European Hospitals Acta Paediatr

Note: Informational only ‚Äî not medical advice.

Q: What is acetaminophen?
Here‚Äôs what I found:

[source 38

In [12]:
# Save notebook artifacts & cleaned KB to Drive folder
import shutil, os
dst = "/content/drive/MyDrive/pharmabot_final"
os.makedirs(dst, exist_ok=True)
src = "/content/pharmabot_work"
for fn in os.listdir(src):
    shutil.copy2(os.path.join(src, fn), os.path.join(dst, fn))
print("Copied cleaned KB to", dst)

# Optionally save the notebook file from Colab UI: File -> Save a copy in Drive


Copied cleaned KB to /content/drive/MyDrive/pharmabot_final


In [13]:
#LLM RAG synthesis (OpenAI)
# A1: Install OpenAI client if needed
!pip install -q openai

# A2: RAG synth cell ‚Äî run this and provide API key when asked
import os, openai, textwrap
from getpass import getpass

# Set API key securely (Colab):
if "OPENAI_API_KEY" not in os.environ:
    key = getpass("Paste your OpenAI API key (it will not be visible): ")
    os.environ["OPENAI_API_KEY"] = key

openai.api_key = os.environ.get("OPENAI_API_KEY")

# Compose prompt using top-k retrieved passages (preserves sources)
def build_rag_prompt(question, retrieved):
    ctx = ""
    for i, p in enumerate(retrieved, start=1):
        # include a short context per passage
        ctx += f"[source {p['idx']}] {p['text']}\n\n"
    prompt = f"""
You are a careful pharmacy assistant. Use ONLY the facts in the Context to answer the user's question.
If the user asks for dosage, prescription, or diagnosis, refuse and recommend consulting a licensed pharmacist or doctor.

Context:
{ctx}

Question: {question}

Write a short (2-4 sentences) factual answer. Cite sources inline using the form [source 1], [source 2], etc. If you can't answer from the context, say you can't find it and recommend consulting a pharmacist.
"""
    return prompt

# RAG wrapper
def rag_answer_openai(question, top_k=3):
    # safety first
    med_kw = ["dose","dosage","prescribe","prescription","should i","how much",
              "pregnant","tablet","mg","ml","take"]
    if any(w in question.lower() for w in med_kw):
        return "‚ö†Ô∏è I cannot provide dosage, prescription, or diagnostic advice. Please consult a licensed pharmacist or doctor."

    # get top passages (we use improved_retrieve_with_synonyms if available)
    try:
        retrieved = improved_retrieve_with_synonyms(question, top_k=top_k, fetch_k=150, boost_exact=True)
    except NameError:
        # fallback to safe_answer_with_synonyms logic
        retrieved = improved_retrieve(query=question, top_k=top_k, fetch_k=150, boost_exact=True)

    if not retrieved:
        return "I couldn't find relevant information in the knowledge base. Please consult a pharmacist."

    prompt = build_rag_prompt(question, retrieved[:top_k])

    # call OpenAI ChatCompletion (choose model available to you)
    try:
        resp = openai.ChatCompletion.create(
            model="gpt-4o-mini",  # replace with your available model if needed
            messages=[{"role":"user","content": prompt}],
            max_tokens=220, temperature=0.0
        )
        text = resp["choices"][0]["message"]["content"].strip()
        return text
    except Exception as e:
        # If model not available, try text completion fallback
        try:
            resp2 = openai.Completion.create(model="text-davinci-003", prompt=prompt, max_tokens=220, temperature=0.0)
            return resp2.choices[0].text.strip()
        except Exception as e2:
            return f"OpenAI call failed: {e} / {e2}"

# Quick local test
print("RAG test (OpenAI). This will call OpenAI for 1 request.")
q = "What is paracetamol?"
print(rag_answer_openai(q))


Paste your OpenAI API key (it will not be visible): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
RAG test (OpenAI). This will call OpenAI for 1 request.
OpenAI call failed: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
 / 

You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration g

In [14]:
#Expand synonym dictionary & wire Gradio to final retriever + RAG option
# B1: install gradio if needed
!pip install -q gradio

# B2: extend the synonym map (starter list ‚Äî add more rows as you discover)
SYNONYM_MAP.update({
    "ibuprofen": ["advil","brufen","motrin","nurofen"],
    "aspirin": ["acetylsalicylic acid","asa","disprin"],
    "paracetamol": ["acetaminophen", "panadol", "crocin", "calpol", "acetaminophen/paracetamol"],
    "amoxicillin": ["mox", "amoxil", "amoxycillin"],
    # add more pairs here...
})

# B3: Gradio UI integrated with RAG option
import gradio as gr

def gr_chat_interface(user_msg, mode="Retrieval only (fast)"):
    if mode.startswith("RAG"):
        return rag_answer_openai(user_msg, top_k=3)
    else:
        return safe_answer_with_synonyms(user_msg, k=4)

iface = gr.Interface(
    fn=gr_chat_interface,
    inputs=[gr.Textbox(lines=2, placeholder="Ask about a medicine or term..."), gr.Radio(["Retrieval only (fast)", "RAG (LLM synthesis ‚Äî needs OpenAI key)"], value="Retrieval only (fast)")],
    outputs="text",
    title="üíä Pharmabot (Final) ‚Äî Retrieval + RAG",
    description="Choose mode: Retrieval-only uses your glossary; RAG calls OpenAI to synthesize answers (requires API key). Always refuses dosage/prescription."
)
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://873aad3f578b1c7016.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [15]:
#Intent classifier (train tiny model + integrate into pipeline)
# C1: install training libs
!pip install -q scikit-learn

# C2: build a small training set and train a simple TF-IDF + LogisticRegression classifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib

# tiny labeled dataset (expand later)
train_texts = [
    "hi", "hello", "good morning", "hey there", "how are you",
    "what is paracetamol", "define excipient", "what is ibuprofen used for", "explain pharmaceutical equivalence",
    "what dose of paracetamol should i take", "how much ibuprofen can i give to a child", "can i take this medicine while pregnant?",
    "i want to buy paracetamol", "do you sell medicines", "where to buy crocin"
]
train_labels = [
    "greeting","greeting","greeting","greeting","greeting",
    "info","info","info","info",
    "medical_advice","medical_advice","medical_advice",
    "product_query","product_query","product_query"
]

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=400))
])
pipeline.fit(train_texts, train_labels)
joblib.dump(pipeline, "/content/pharmabot_work/intent_classifier.joblib")
print("Trained intent classifier and saved to /content/pharmabot_work/intent_classifier.joblib")

# C3: integrate into safe_answer wrapper
import joblib
clf = joblib.load("/content/pharmabot_work/intent_classifier.joblib")

def classify_intent(query):
    return clf.predict([query])[0]

# Example use:
for q in ["hello", "what dose of paracetamol should i take", "what is paracetamol"]:
    print(q, "->", classify_intent(q))

# C4: Update safe wrapper to consult classifier
def safe_answer_with_intent(query, k=4):
    intent = classify_intent(query)
    if intent == "greeting":
        return "Hello! I can help with pharmacy definitions and storage info. I cannot provide dosage or prescription advice."
    if intent == "medical_advice":
        return "‚ö†Ô∏è I cannot provide dosage, prescription, or diagnostic advice. Please consult a licensed pharmacist or doctor."
    if intent == "product_query":
        # redirect to info but avoid dosage
        return safe_answer_with_synonyms(query, k=k)
    # default -> info retrieval + optional RAG
    return safe_answer_with_synonyms(query, k=k)

# Test:
print("\nTest safe_answer_with_intent:")
print(safe_answer_with_intent("hello"))
print(safe_answer_with_intent("what dose of paracetamol should i take"))
print(safe_answer_with_intent("what is paracetamol"))



Trained intent classifier and saved to /content/pharmabot_work/intent_classifier.joblib
hello -> greeting
what dose of paracetamol should i take -> medical_advice
what is paracetamol -> info

Test safe_answer_with_intent:
Hello! I can help with pharmacy definitions and storage info. I cannot provide dosage or prescription advice.
‚ö†Ô∏è I cannot provide dosage, prescription, or diagnostic advice. Please consult a licensed pharmacist or doctor.
Here‚Äôs what I found:

[source 327] The major companies are represented at EU level by the trade association European Association of Euro-Pharmaceutical Companies (EAEPC ), but there are a relatively large number of others holding licences about whom less i s known [Source: Europe Economics Safe Medicines Through P arallel Trade Contribution to an Impact Assessment] Paramedicines ...

[source 380] the medicine is intended for  out-patients but its use may produce very serious adverse reactions requiring a prescription drawn up as required by a 

In [16]:
# Save artifacts (cleaned KB + models) to Drive folder
import shutil, os
dst = "/content/drive/MyDrive/pharmabot_final"
os.makedirs(dst, exist_ok=True)

# copy cleaned KB & index
src = "/content/pharmabot_work"
for fn in os.listdir(src):
    shutil.copy2(os.path.join(src, fn), os.path.join(dst, fn))
print("Copied files to", dst)

# Save a short README
readme = """Pharmabot final artifacts
- cleaned_chunks.csv
- cleaned_embs.npy
- faiss_cleaned.bin
- intent_classifier.joblib

How to run:
1. Load cleaned_chunks.csv and faiss index.
2. Load models: all-MiniLM-L6-v2 and cross-encoder/ms-marco-MiniLM-L-6-v2.
3. Use safe_answer_with_intent(query) for safe retrieval; use rag_answer_openai(query) for LLM synthesis.
"""
with open(os.path.join(dst, "README.txt"), "w") as f:
    f.write(readme)

print("Saved README to", dst)


Copied files to /content/drive/MyDrive/pharmabot_final
Saved README to /content/drive/MyDrive/pharmabot_final
