In [3]:
import os

# Telemetry switches must be in the environment before chromadb is imported.
# ANONYMIZED_TELEMETRY is the opt-out variable documented by Chroma.
# NOTE(review): CHROMA_TELEMETRY is not a setting documented by Chroma; it is
# kept only so anything else that reads it keeps working.
os.environ["CHROMA_TELEMETRY"] = "FALSE"
os.environ["ANONYMIZED_TELEMETRY"] = "False"

import chromadb

# Location of the persisted Chroma store. Override with the CHROMA_DB_PATH
# environment variable to run this notebook on another machine; the default
# is the original author's local path.
CHROMA_DB_PATH = os.environ.get(
    "CHROMA_DB_PATH",
    "/Users/adithyakatari/Desktop/suchitra/chroma_db",
)

client = chromadb.PersistentClient(
    path=CHROMA_DB_PATH,
    tenant="default_tenant",
    database="default_database",
)

# Sanity check: list the collections available in this store.
print("Collections:", [c.name for c in client.list_collections()])


Collections: ['cdc_diseases']


In [4]:
from chromadb.utils import embedding_functions

# Sentence-transformer model used to embed query text.
# NOTE(review): presumably the same model the collection was built with — confirm.
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Open the existing collection and attach the embedding function to it.
collection = client.get_collection(name="cdc_diseases", embedding_function=embedding_fn)

# Sanity check: number of records stored in the collection.
print(f"Count: {collection.count()}")


Count: 3694


In [5]:
import re

def rerank_chunks(query, documents, metadatas, top_k=5):
    """Re-order retrieved chunks by keyword overlap with the query.

    A chunk's score is the number of distinct lower-cased word tokens it
    shares with the query, plus a flat bonus of 2 when its metadata carries a
    truthy ``has_prevention`` value. The sort is stable, so chunks with equal
    scores keep their incoming order.

    Args:
        query: The user question (str).
        documents: Chunk texts, parallel to ``metadatas``. A ``None`` entry is
            treated as empty text.
        metadatas: Per-chunk metadata dicts. A ``None`` entry is treated as an
            empty dict so records stored without metadata do not crash the
            reranker.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of ``(score, text, metadata)`` tuples, highest score first,
        truncated to ``top_k`` items.
    """
    word_pattern = re.compile(r"\w+")
    keywords = set(word_pattern.findall(query.lower()))

    scored = []

    for doc, meta in zip(documents, metadatas):
        # Normalise missing entries so downstream code can always call
        # str / dict methods on what we return.
        if doc is None:
            doc = ""
        if meta is None:
            meta = {}

        text_words = set(word_pattern.findall(doc.lower()))

        # keyword overlap score
        keyword_score = len(keywords & text_words)

        # boost if prevention-related
        boost = 2 if meta.get("has_prevention") else 0

        scored.append((keyword_score + boost, doc, meta))

    # sort by score (descending); stable, so ties keep their original order
    scored.sort(key=lambda item: item[0], reverse=True)

    return scored[:top_k]

In [10]:
def build_context_from_reranked(
    reranked,
    max_tokens=1800
):
    """
    Converts reranked chunks into a single LLM-ready context.

    Chunks are added in the order given until the next chunk would push the
    running total over ``max_tokens``; processing stops at that chunk, so
    later (lower-ranked) chunks are not considered. Source numbers follow the
    position in ``reranked`` (1-based).

    reranked: list of (score, text, metadata); a None text is treated as
              empty and a None metadata as an empty dict
    max_tokens: approximate token budget (whitespace-separated words of the
                chunk text; the source header is not counted)

    Returns:
    - context (str): the blocks joined by blank lines
    - sources (list of dicts with keys "source", "disease", "url")
    """

    context_blocks = []
    sources = []
    used_tokens = 0

    for idx, (_score, text, meta) in enumerate(reranked, start=1):

        # Tolerate records that were stored without text or metadata.
        if text is None:
            text = ""
        if meta is None:
            meta = {}

        # rough token estimate (good enough for notebooks)
        token_count = len(text.split())

        if used_tokens + token_count > max_tokens:
            break

        disease = meta.get("disease")
        url = meta.get("url")

        # .strip() removes the template's padding spaces (and any trailing
        # whitespace carried by the chunk text).
        block = f""" [Source {idx}] Disease: {disease} URL: {url} {text} """.strip()

        context_blocks.append(block)
        sources.append({
            "source": f"Source {idx}",
            "disease": disease,
            "url": url
        })

        used_tokens += token_count

    context = "\n\n".join(context_blocks)
    return context, sources


In [20]:
def build_prompt(context, question):
    """Assemble the grounded-answer prompt sent to the LLM.

    The prompt has four parts in fixed order: the role and grounding
    instructions, the answering rules, the retrieved CDC context, and the
    user's question, followed by an "ANSWER:" cue.

    context: text placed under the "CDC CONTEXT:" heading
    question: text placed under the "QUESTION:" heading

    Returns the full prompt as a single string.
    """
    preamble = (
        "You are a health information assistant.\n"
        "\n"
        "Answer the question using ONLY the CDC context below.\n"
        "If the answer is not fully supported by the context, say you do not have enough information.\n"
    )
    rules = "\n".join([
        "Rules:",
        "- Be concise and factual",
        "- Do NOT add external knowledge",
        "- Use multiple sources if they provide relevant information",
        "- Cite each factual claim using (Source 1), (Source 2), etc.",
        "- Do not invent sources",
        "- This is general information, not medical advice",
    ])
    return f"{preamble}\n{rules}\n\nCDC CONTEXT:\n{context}\n\nQUESTION:\n{question}\n\nANSWER:"



In [22]:
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Fail early with a clear message. If api_key were None, the OpenAI SDK would
# fall back to OPENAI_API_KEY, which would then be sent to Groq's endpoint.
_groq_api_key = os.getenv("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your environment or .env file."
    )

# Deliberately NOT named `client`: that name already holds the Chroma
# PersistentClient created earlier in the notebook, and rebinding it would
# break any re-run of cells that call `client.get_collection(...)`.
llm_client = OpenAI(
    api_key=_groq_api_key,
    base_url="https://api.groq.com/openai/v1",
)

def generate_answer(prompt):
    """Send ``prompt`` to the Groq-hosted chat model and return its reply.

    prompt: full user prompt (str), sent as a single user message after a
            generic system message

    Returns the content of the first completion choice.
    """
    response = llm_client.chat.completions.create(
        model="llama-3.1-8b-instant",  # Groq model
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2
    )
    return response.choices[0].message.content


In [28]:
def ask_cdc(question, top_k=5, n_candidates=10):
    """Answer ``question`` from the CDC collection (retrieve, rerank, prompt, generate).

    question: the user's question (str)
    top_k: number of reranked chunks passed on to context building
    n_candidates: number of chunks to retrieve from Chroma before reranking;
                  raised to ``top_k`` when smaller, so a large ``top_k`` is
                  not silently capped by the retrieval size

    Returns:
    - answer: the model's reply
    - sources: list of source dicts produced by build_context_from_reranked
    """
    # 1. Query Chroma (always fetch at least as many candidates as we keep)
    n_results = max(n_candidates, top_k)
    results = collection.query(
        query_texts=[question],
        n_results=n_results,
        include=["documents", "metadatas"]
    )

    # One query text was sent, so take the first (only) result list.
    docs = results["documents"][0]
    metas = results["metadatas"][0]

    # 2. Rerank
    reranked = rerank_chunks(question, docs, metas, top_k=top_k)

    # 3. Build context
    context, sources = build_context_from_reranked(reranked)

    # 4. Build prompt
    prompt = build_prompt(context, question)

    # 5. Generate answer
    answer = generate_answer(prompt)

    return answer, sources


In [None]:
# Example end-to-end run: ask one question and show the answer with its sources.
question = "what are the possible causes of rapid weight gain"
answer, sources = ask_cdc(question)

# Heading, blank line, then the answer text.
print("ANSWER:\n", answer, sep="\n")

print("\nSOURCES:\n")
for src in sources:
    label, disease, url = src["source"], src["disease"], src["url"]
    print(f"{label}: {disease} — {url}")


ANSWER:

Based on the provided CDC context, the possible causes of rapid weight gain are:

1. Insulin resistance (Source 1): When your body is exposed to too much blood sugar over an extended period, you can develop insulin resistance. This can lead to high blood sugar levels, which causes your pancreas to release more insulin, resulting in weight gain.
2. Overweight or obesity (Source 2, Source 3): Being overweight or having obesity increases your risk of getting cancer and is associated with a higher risk of getting 13 types of cancer. It can also lead to long-lasting inflammation and higher than normal levels of insulin, insulin-like growth factor, and sex hormones, which may lead to cancer.
3. High blood sugar levels (Source 1): High blood sugar levels can cause your body to store extra sugar in your liver and muscles, and when they're full, the liver sends the remaining sugar to be stored as body fat, causing weight gain.
4. High triglycerides (Source 1): High triglycerides can al