1. Install dependencies

In [None]:
!pip install --quiet pandas numpy tqdm sentence-transformers faiss-cpu transformers accelerate bitsandbytes ipywidgets


2. Imports

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import ipywidgets as widgets
from IPython.display import display


3. Load your CSV and build knowledge base

In [None]:
# Read the disease/precaution table. The knowledge-base cell below reads its
# "Disease" and "Precaution_1" .. "Precaution_4" columns.
df = pd.read_csv("symptom_precaution.csv")
df.head()  # preview the first rows


Build KB:

In [None]:
# Build one retrievable text chunk per disease row of `df`.
PRECAUTION_COLS = ["Precaution_1", "Precaution_2", "Precaution_3", "Precaution_4"]

kb_rows = []
for _, row in df.iterrows():
    disease = str(row["Disease"])

    # Keep only real precautions: skip missing cells via pd.notna (instead of
    # comparing the stringified value to "nan"), trim stray whitespace, and drop
    # cells that are blank after trimming so no empty bullet is produced.
    precs = [str(row[c]).strip() for c in PRECAUTION_COLS if pd.notna(row[c])]
    precs = [p for p in precs if p]
    if not precs:
        # A row without any precaution would yield a chunk ending in a dangling
        # "- " bullet and carries nothing worth retrieving.
        continue

    chunk = f"Disease: {disease}\nSelf-care precautions:\n- " + "\n- ".join(precs)
    kb_rows.append({
        "topic": disease,
        "section": "precautions",
        "chunk": chunk
    })

kb_df = pd.DataFrame(kb_rows)
kb_df.head()


4. Embeddings + FAISS index

In [None]:
# Sentence-embedding model used both for indexing and for query-time retrieval.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embed every knowledge-base chunk and cast the matrix to float32.
texts = list(kb_df["chunk"])
embeddings = embed_model.encode(
    texts, batch_size=32, show_progress_bar=True, convert_to_numpy=True
)
embeddings = embeddings.astype("float32")

# L2-normalise the rows in place, then index them with an inner-product index;
# on unit-length vectors the inner product equals the cosine similarity.
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(embeddings.shape[-1])
index.add(embeddings)
print("Total indexed vectors:", index.ntotal)


Retriever:

In [None]:
def retrieve(query, k=5):
    """Return up to ``k`` knowledge-base chunks most similar to ``query``.

    The query is embedded with ``embed_model``, L2-normalised and searched
    against the inner-product ``index``.

    Args:
        query: Free-text user question.
        k: Maximum number of chunks to return.

    Returns:
        A list of dicts with keys ``"score"``, ``"topic"``, ``"section"`` and
        ``"chunk"``, in the order returned by the index search. The list is
        shorter than ``k`` when the index holds fewer vectors, and empty when
        ``k`` is not positive or the index is empty.
    """
    # FAISS pads its result with index -1 when fewer than k vectors are
    # available; `kb_df.iloc[-1]` would then silently return the LAST row as a
    # bogus hit. Clamp k to the index size and skip any padding entries.
    k = min(int(k), index.ntotal)
    if k <= 0:
        return []

    q_emb = embed_model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q_emb)
    scores, indices = index.search(q_emb, k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        if idx < 0:
            continue
        row = kb_df.iloc[int(idx)]
        results.append({
            "score": float(score),
            "topic": row["topic"],
            "section": row["section"],
            "chunk": row["chunk"]
        })
    return results


Quick test:

In [None]:
# Sanity check: display the top-3 retrieved chunks (with scores) for a sample query.
retrieve("precautions for dengue", k=3)


5. Helper to load a model

In [None]:
def load_llm(model_name, max_new_tokens=160):
    """Load a causal LM and wrap it in a greedy text-generation pipeline.

    Args:
        model_name: Hugging Face model id passed to ``from_pretrained``.
        max_new_tokens: Upper bound on generated tokens per call; kept small
            for speed.

    Returns:
        ``(tokenizer, generator)`` where ``generator`` is the
        ``"text-generation"`` pipeline built around the loaded model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype="auto"
    )
    # Greedy decoding (do_sample=False) makes the output deterministic.
    # `temperature` / `top_p` are sampling-only settings: combined with
    # do_sample=False they are contradictory (and temperature=0.0 is not a
    # valid sampling temperature), so they are intentionally not passed.
    gen = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )
    return tokenizer, gen


6. LLMs

In [None]:
# Display name -> Hugging Face model id for each LLM under comparison.
models = {
    "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct",
    "TinyLlama-1.1B": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "Qwen-1.8B-Chat": "Qwen/Qwen1.5-1.8B-Chat"
}

# Load every model up front. Each entry keeps the tokenizer and the
# text-generation pipeline returned by load_llm, keyed by display name.
llms = {}
for name, model_id in models.items():
    print(f"Loading {name} -> {model_id}")
    tokenizer, gen = load_llm(model_id)
    llms[name] = {"tokenizer": tokenizer, "generator": gen}
print("All models loaded.")


7. RAG answer function with safety

In [None]:
# System instruction used for every request: it restricts the assistant to the
# retrieved context and rules out diagnoses and medication prescriptions.
SYSTEM_PROMPT = (
    "You are SafeSymp, a careful health assistant. "
    "Use only the provided context about diseases and precautions. "
    "Do NOT diagnose diseases. Do NOT prescribe specific medications. "
    "Give general self-care advice from the context and suggest seeing a doctor "
    "for serious or persistent symptoms."
)

def rag_answer(query, tokenizer, generator, k=3):
    """Answer ``query`` from retrieved knowledge-base context.

    Retrieves the top-``k`` chunks with ``retrieve``, builds a prompt from
    ``SYSTEM_PROMPT`` plus that context, and generates a reply.

    Args:
        query: User question.
        tokenizer: Tokenizer of the model behind ``generator``; used only to
            render the chat template.
        generator: Hugging Face text-generation pipeline (as built by
            ``load_llm``).
        k: Number of context chunks to retrieve.

    Returns:
        ``(answer, docs)``: the generated answer text and the list of
        retrieved chunk dicts that formed the context.
    """
    docs = retrieve(query, k=k)
    context = "\n\n".join(d["chunk"] for d in docs)
    user_content = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content}
    ]

    # Build a chat-style prompt if the tokenizer supports it.
    try:
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception:
        # Deliberate best effort: tokenizers without a usable chat template
        # fall back to plain concatenation rather than failing the request.
        prompt = SYSTEM_PROMPT + "\n\n" + user_content

    # Ask the pipeline for the completion only. This replaces the previous
    # post-hoc prompt stripping, which split on a rendered empty assistant turn
    # (raising on an empty separator) and otherwise sliced the output by
    # len(prompt) without checking that the output actually began with it.
    output = generator(prompt, return_full_text=False)[0]["generated_text"]

    answer = output.replace("<|end|>", "").strip()
    return answer, docs


8. Define at least 10 domain questions

In [None]:
# Evaluation questions; the evaluation loop below sends each one to every loaded model.
questions = [
    "I have a bad cough and runny nose. What should I do?",
    "What home precautions are recommended for malaria?",
    "What self-care steps can I follow for allergy symptoms?",
    "How can I manage hypothyroidism safely at home?",
    "What precautions should I follow if I have psoriasis?",
    "What self-care measures should I take for GERD?",
    "What precautions are recommended for chronic cholestasis?",
    "What should I do at home if I have hepatitis A?",
    "What precautions can help with osteoarthritis pain?",
    "How can I manage hypoglycemia symptoms at home?"
]


9. Run evaluation over 3 models

In [None]:
import time

# One row per question: {"question": ..., "<model name>": "<answer>", ...}
results = []

# Per-model raw measurements (answers, retrieved chunks, latencies, token
# counts) consumed by the analysis section further down.
batch_results_detailed = {
    name: {'responses': [], 'docs': [], 'latencies': [], 'tokens': []}
    for name in llms
}

for q in questions:
    print(f"QUESTION: {q}\n")
    row = {"question": q}
    for name, llm in llms.items():
        print(f"--- {name} ---")
        # perf_counter is a monotonic, high-resolution clock, so the measured
        # interval cannot be distorted by system clock adjustments. The timing
        # covers the whole rag_answer call (retrieval + generation).
        start_time = time.perf_counter()
        ans, docs = rag_answer(q, llm["tokenizer"], llm["generator"], k=3)
        end_time = time.perf_counter()

        # Count the answer's tokens with the model's own tokenizer. Special
        # tokens (e.g. BOS) are excluded: only some tokenizers add them, which
        # would otherwise skew the comparison between models.
        tokens_generated = len(llm["tokenizer"].encode(ans, add_special_tokens=False))

        print(ans, "\n")
        row[name] = ans

        # Store detailed results for analysis
        batch_results_detailed[name]['responses'].append(ans)
        batch_results_detailed[name]['docs'].append([d['chunk'] for d in docs])  # chunk texts only
        batch_results_detailed[name]['latencies'].append(end_time - start_time)
        batch_results_detailed[name]['tokens'].append(tokens_generated)

    results.append(row)
    print("=" * 80)

# The analysis section reads these measurements under the name `batch_results`.
batch_results = batch_results_detailed

Save for your report:

In [None]:
# One row per question: a "question" column plus one answer column per model.
eval_df = pd.DataFrame(results)
eval_df.to_csv("rag_three_llms_results.csv", index=False)  # written without the index column
eval_df


##  Detailed Analysis of Model Differences

In this section, we quantitatively and qualitatively analyze the three LLMs based on the batch evaluation results from the 10 domain-specific questions.

- **Performance**: Measured by average generation latency (seconds per response) and token efficiency (output tokens per query).
- **Accuracy**: Assessed via semantic similarity (cosine similarity using sentence embeddings) between generated responses and retrieved documents, averaged across queries. Higher scores indicate better factual grounding.
- **Approach**: Qualitative summary of response styles (e.g., verbosity, structure, hallucination tendency), derived from patterns in outputs. This is semi-automated via keyword analysis for structure (e.g., presence of lists/bullets) and length.

We use the `batch_results` dict populated by the evaluation loop in section 9: `{model_name: {'responses': [list of 10 strs], 'docs': [list of 10 lists of retrieved chunks], 'latencies': [list of 10 floats], 'tokens': [list of 10 ints]}}`. Run the evaluation cell first; the code below reads these stored results and does not regenerate or simulate them.

This analysis highlights differences: e.g., smaller models may be faster but less accurate on nuanced symptoms.

In [None]:
# Install if needed (in Colab: !pip install sentence-transformers nltk), but assume available
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize
import time # Import time for latency calculation

nltk.download('punkt_tab', quiet=True)  # For sentence tokenization

# Load embedding model (lightweight for similarity)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Every metric below reads `batch_results`, which the evaluation loop populates.
# Fail fast with a clear message instead of a NameError deep inside a helper
# when this cell is run before the evaluation cell.
if 'batch_results' not in globals():
    raise RuntimeError("batch_results not found. Please run the evaluation cell first.")

def compute_accuracy(model_name):
    """Average cosine similarity between each response and its retrieved docs.

    For every (response, docs) pair stored in ``batch_results[model_name]`` the
    docs are joined into one string, both texts are embedded with ``embedder``
    and their cosine similarity is taken. A pair whose response or combined
    docs is empty or blank scores 0.0.

    Args:
        model_name: Key into ``batch_results``.

    Returns:
        The mean score as a Python float, or 0.0 when the model has no
        recorded responses (instead of NaN from averaging an empty list).
    """
    data = batch_results[model_name]
    scores = []
    for resp, doc_list in zip(data['responses'], data['docs']):
        # Concatenate the docs for simplicity; a missing doc list counts as empty.
        combined_doc = ' '.join(doc_list or [])
        if not resp.strip() or not combined_doc.strip():
            scores.append(0.0)
            continue
        resp_emb = embedder.encode(resp)
        doc_emb = embedder.encode(combined_doc)
        scores.append(float(cosine_similarity([resp_emb], [doc_emb])[0][0]))
    return float(np.mean(scores)) if scores else 0.0

def _has_list_marker(text):
    """Return True if any line of ``text`` starts with a bullet or numbered-list marker."""
    for line in text.splitlines():
        stripped = line.lstrip()
        # Bullets: "- item", "* item" or a U+2022 bullet followed by a space.
        if stripped.startswith(('- ', '* ', '\u2022 ')):
            return True
        # Numbered items: "1. item", "2. item", ...
        number, dot, _ = stripped.partition('. ')
        if dot and number.isdigit():
            return True
    return False


def analyze_approach(model_name):
    """Summarise the response style of one model.

    Args:
        model_name: Key into ``batch_results``.

    Returns:
        A dict with:
          - ``'Avg Sentences'``: mean sentence count over non-blank responses
            (0.0 when there are none), rounded to 1 decimal.
          - ``'Structure Score (0-1)'``: fraction of responses that contain at
            least one list line, rounded to 2 decimals.
          - ``'Hallucination Tendency'``: ``'High'`` if the similarity from
            ``compute_accuracy`` is below 0.5, ``'Low'`` if above 0.7,
            otherwise ``'Medium'``.
    """
    data = batch_results[model_name]
    responses = data['responses']
    sim_score = compute_accuracy(model_name)

    # Verbosity: average number of sentences. Guard the empty case so the
    # result is 0.0 rather than NaN when every response is blank.
    sentence_counts = [len(sent_tokenize(resp)) for resp in responses if resp.strip()]
    verbosity = float(np.mean(sentence_counts)) if sentence_counts else 0.0

    # Structure: fraction of responses with a real list line. Markers are
    # matched at the start of a line; a plain substring test would count any
    # hyphenated word (e.g. "self-care") as a list.
    if responses:
        structure_score = float(np.mean([_has_list_marker(resp) for resp in responses]))
    else:
        structure_score = 0.0

    # Hallucination proxy derived from the doc-similarity score. A score of 0.0
    # (e.g. no grounding context was available) therefore maps to 'High'.
    if sim_score < 0.5:
        hallucination_tendency = 'High'
    elif sim_score > 0.7:
        hallucination_tendency = 'Low'
    else:
        hallucination_tendency = 'Medium'

    return {
        'Avg Sentences': round(verbosity, 1),
        'Structure Score (0-1)': round(structure_score, 2),
        'Hallucination Tendency': hallucination_tendency
    }

# Compute metrics for all models. The per-model approach dicts are kept so the
# qualitative summary at the end can reuse them instead of re-running
# analyze_approach, which re-embeds every response.
metrics = []
approaches = {}
for name in llms.keys():
    data = batch_results[name]
    avg_latency = np.mean(data['latencies'])
    avg_tokens = np.mean(data['tokens'])
    avg_accuracy = compute_accuracy(name)
    approach = analyze_approach(name)
    approaches[name] = approach

    metrics.append({
        'Model': name,
        'Avg Latency (s)': round(avg_latency, 2),
        'Avg Tokens': round(avg_tokens),
        'Avg Accuracy (Sim)': round(avg_accuracy, 3),
        **approach
    })

# Display comparison table
comparison_df = pd.DataFrame(metrics)
print("### Model Comparison Table")
display(comparison_df)

# Optional: bar plot for key metrics, one panel per metric
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

metrics_to_plot = ['Avg Latency (s)', 'Avg Accuracy (Sim)', 'Avg Sentences']
for i, metric in enumerate(metrics_to_plot):
    axs[i].bar([m['Model'] for m in metrics], [m[metric] for m in metrics])
    axs[i].set_title(metric)
    axs[i].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Qualitative summary (printed), built from the approach dicts computed above
print("\n### Qualitative Approach Differences")
for name in llms.keys():
    approach = approaches[name]
    print(f"\n{name}:\n  - {approach['Hallucination Tendency']} hallucination (based on doc similarity - **NOTE: Accuracy now reflects whether grounding context was available and used.**).")
    print(f"  - {'Concise' if approach['Avg Sentences'] < 5 else 'Verbose'} style (avg {approach['Avg Sentences']} sentences/query).")
    print(f"  - {'Structured' if approach['Structure Score (0-1)'] > 0.5 else 'Narrative'} responses ({approach['Structure Score (0-1)']*100:.0f}% with lists).")

10. Simple interactive demo UI

In [None]:
q_input = widgets.Text(
    placeholder="Enter a symptom question...",
    description="Query:",
    layout=widgets.Layout(width="100%"),
    # Commit `value` only on Enter / focus loss. With the default continuous
    # updates the observer below fires on every keystroke, which would run all
    # loaded LLMs once per typed character.
    continuous_update=False
)
out_box = widgets.Output()

def on_submit(change):
    """Answer the submitted query with every loaded model and show the results.

    Args:
        change: Trait-change dict delivered by ``observe``; ``change["new"]``
            holds the new text of the input box.
    """
    out_box.clear_output()
    with out_box:
        q = change["new"].strip()
        if not q:
            return
        print(f"Query: {q}\n")
        for name, llm in llms.items():
            print(f"===== {name} =====")
            ans, docs = rag_answer(q, llm["tokenizer"], llm["generator"], k=3)
            print(ans, "\n")

q_input.observe(on_submit, names="value")
display(q_input, out_box)
