In [None]:
"""
NOTEBOOK 05: Q&A GENERATION WITH LLM
Integrate Google Gemini for intelligent answer synthesis with citations
"""

# Install Google Generative AI
!pip install -q google-generativeai chromadb

import os
import json
import pandas as pd
from datetime import datetime
from sentence_transformers import SentenceTransformer
import chromadb
import google.generativeai as genai

# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# Paths
PROJECT_ROOT = "/content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project"
VECTOR_STORE_DIR = f"{PROJECT_ROOT}/03_knowledge_base/outputs/vector_store"
OUTPUT_DIR = f"{PROJECT_ROOT}/05_qa_generation/outputs"
QA_DIR = f"{OUTPUT_DIR}/qa_pairs_validation"

os.makedirs(QA_DIR, exist_ok=True)

# Load patient metadata
with open(f"{PROJECT_ROOT}/01_data_generation/outputs/patient_metadata.json", 'r') as f:
    metadata = json.load(f)

patients = metadata['patients']

# Load embedding model and vector store
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(MODEL_NAME)

client = chromadb.PersistentClient(path=VECTOR_STORE_DIR)
collection = client.get_collection("clinical_notes")

print(f"✅ Setup complete")
print(f"📊 Vector store: {collection.count()} chunks")
print(f"📊 Patients: {len(patients)}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Setup complete
📊 Vector store: 288 chunks
📊 Patients: 10


In [None]:
# Install transformers for local LLM
!pip install -q transformers torch accelerate

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print("📥 Loading TinyLlama 1.1B locally (CPU-optimized)...")
print("   This downloads quickly (~2GB model)")
print("   Optimized for Google Colab FREE CPU\n")

# Load TinyLlama with CPU optimizations
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # float32 for CPU (not float16)
    low_cpu_mem_usage=True,     # Optimize CPU memory
    device_map="cpu"            # Force CPU usage
)

print("✅ TinyLlama 1.1B loaded successfully")
print("💻 Running on CPU (optimized for Colab free tier)")
print("🔒 100% local - NO external API calls")
print("⏱️  Expected answer time: 30-60 seconds per query\n")

# Test the model
test_prompt = "<|user|>\nSay 'Hello, I am ready!'</s>\n<|assistant|>\n"
test_inputs = tokenizer(test_prompt, return_tensors="pt")

print("🧪 Testing model (this takes ~30 sec on CPU)...")
with torch.no_grad():
    test_output = llm_model.generate(
        **test_inputs,
        max_new_tokens=30,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    test_response = tokenizer.decode(test_output[0][test_inputs['input_ids'].shape[1]:], skip_special_tokens=True)

print(f"🤖 Model response: {test_response}")
print("✅ Model ready for Q&A!")

📥 Loading TinyLlama 1.1B locally (CPU-optimized)...
   This downloads quickly (~2GB model)
   Optimized for Google Colab FREE CPU



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ TinyLlama 1.1B loaded successfully
💻 Running on CPU (optimized for Colab free tier)
🔒 100% local - NO external API calls
⏱️  Expected answer time: 30-60 seconds per query

🧪 Testing model (this takes ~30 sec on CPU)...
🤖 Model response: ¡Hola, estoy listo!
✅ Model ready for Q&A!


In [None]:
# System prompt for medical Q&A
SYSTEM_PROMPT = """You are a medical assistant helping patients understand their clinical notes.

CRITICAL RULES:
1. ONLY use information from the provided clinical note excerpts
2. Cite the visit date and section for EVERY claim you make
3. If information is not in the provided notes, clearly state "I don't have this information in your records"
4. Explain medical terms in patient-friendly language
5. Synthesize information across multiple visits when relevant
6. DO NOT make up information or use external medical knowledge

CITATION FORMAT:
- Use: [Visit: YYYY-MM-DD, Section: SECTION_NAME]
- Example: "You are taking Metformin 500mg [Visit: 2024-05-15, Section: CURRENT MEDICATIONS]"

ANSWER STYLE:
- Be clear, direct, and compassionate
- Explain the "WHY" behind medical decisions when evident
- If lab values are abnormal, mention it and what it might mean based on the notes
"""

def create_rag_prompt(query, retrieved_chunks):
    """Create prompt with query and retrieved context"""

    # Format retrieved chunks
    context = ""
    for i, chunk in enumerate(retrieved_chunks, 1):
        context += f"\n--- Excerpt {i} ---\n"
        context += f"Visit Date: {chunk['visit_date']}\n"
        context += f"Section: {chunk['section']}\n"
        context += f"Content: {chunk['text']}\n"

    prompt = f"""{SYSTEM_PROMPT}

PATIENT'S CLINICAL NOTE EXCERPTS:
{context}

PATIENT'S QUESTION:
{query}

Please provide a helpful answer using ONLY the information from the excerpts above.
Include citations in the format [Visit: YYYY-MM-DD, Section: SECTION_NAME].
"""

    return prompt

# Test prompt generation
test_chunks = [
    {
        'text': 'CURRENT MEDICATIONS:\n  • Metformin 500mg\n  • Lisinopril 10mg',
        'visit_date': '2024-05-15',
        'section': 'CURRENT MEDICATIONS'
    }
]

test_prompt = create_rag_prompt("What medications am I taking?", test_chunks)
print("✅ Prompt templates created")
print(f"\n📝 Sample Prompt (first 400 chars):")
print(test_prompt[:400] + "...")

✅ Prompt templates created

📝 Sample Prompt (first 400 chars):
You are a medical assistant helping patients understand their clinical notes.

CRITICAL RULES:
1. ONLY use information from the provided clinical note excerpts
2. Cite the visit date and section for EVERY claim you make
3. If information is not in the provided notes, clearly state "I don't have this information in your records"
4. Explain medical terms in patient-friendly language
5. Synthesize in...


In [None]:
def rag_qa_pipeline(query, patient_id, top_k=3, similarity_threshold=0.0):
    """
    Complete RAG pipeline: Retrieve → Generate → Cite
    CPU-optimized for Google Colab free tier
    """

    # Step 1: Retrieve relevant chunks
    query_embedding = embedding_model.encode(query)

    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k,
        where={"patient_id": patient_id}
    )

    # Step 2: Filter by similarity threshold
    retrieved_chunks = []
    for doc, metadata, distance in zip(
        results['documents'][0],
        results['metadatas'][0],
        results['distances'][0]
    ):
        similarity = 1 - distance
        if similarity >= similarity_threshold:
            retrieved_chunks.append({
                'text': doc,
                'visit_date': metadata['visit_date'],
                'section': metadata['section'],
                'similarity': round(similarity, 3)
            })

    # Step 3: Check if we have relevant information
    if not retrieved_chunks:
        return {
            'answer': "I don't have enough information in your clinical notes to answer this question.",
            'chunks_used': 0,
            'citations': [],
            'chunks': []
        }

    # Step 4: Create IMPROVED prompt with clear instructions
    context_parts = []
    for i, chunk in enumerate(retrieved_chunks, 1):
        context_parts.append(
            f"Note {i} - {chunk['visit_date']} ({chunk['section']}):\n{chunk['text'][:400]}"
        )

    context = "\n\n".join(context_parts)

    # IMPROVED PROMPT with explicit instructions
    simple_prompt = f"""You are helping a patient understand their medical records.

CLINICAL NOTES:
{context}

PATIENT QUESTION: {query}

INSTRUCTIONS:
1. Answer the question in a clear, conversational way
2. Combine information from multiple visits if needed
3. Always cite the visit date like this: [Visit: 2024-05-01]
4. Explain what the medications/results mean if relevant
5. Do NOT just copy the notes - explain them
"""

    # print("Prompt given:", simple_prompt,'\n\n')

    try:
        # Format for TinyLlama
        formatted_prompt = f"<|user|>\n{simple_prompt}</s>\n<|assistant|>\n"

        # Tokenize
        inputs = tokenizer(formatted_prompt, return_tensors="pt", max_length=1500, truncation=True)

        print(f"⏳ Generating answer (this takes ~30-60 sec on CPU)...")

        # Generate with CPU-optimized settings
        with torch.no_grad():
            outputs = llm_model.generate(
                **inputs,
                max_new_tokens=300,
                do_sample=True,
                temperature=0.7,  # Increased for more creative synthesis
                top_p=0.9,
                repetition_penalty=1.2,  # Prevent copying
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode
        answer = tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        ).strip()

        # Clean up common LLM artifacts
        answer = answer.split("PATIENT QUESTION:")[0]  # Remove if it repeats the question
        answer = answer.split("CLINICAL NOTES:")[0]     # Remove if it repeats context
        answer = answer.strip()

    except Exception as e:
        answer = f"Error generating answer: {str(e)}"

    # Step 5: Extract citations
    citations = []
    for chunk in retrieved_chunks:
        citation_str = f"[Visit: {chunk['visit_date']}"
        if citation_str in answer:
            citations.append(f"[Visit: {chunk['visit_date']}, Section: {chunk['section']}]")

    return {
        'answer': answer,
        'chunks_used': len(retrieved_chunks),
        'citations': citations,
        'chunks': retrieved_chunks
    }

# Test the improved pipeline
print("🧪 Testing IMPROVED RAG pipeline...\n")
test_result = rag_qa_pipeline(
    query="What medications am I taking?",
    patient_id="patient_001",
    top_k=3
)

print("✅ RAG pipeline created")
print(f"\n🔍 Test Query: 'What medications am I taking?'")
print(f"👤 Patient: patient_001")
print(f"\n📊 Pipeline Output:")
print(f"  • Chunks retrieved: {test_result['chunks_used']}")
print(f"  • Citations found: {len(test_result['citations'])}")
print(f"\n🤖 Answer:\n{test_result['answer']}")

🧪 Testing IMPROVED RAG pipeline...

Prompt given: You are helping a patient understand their medical records.

CLINICAL NOTES:
Note 1 - 2024-05-01 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Montelukast 10mg
  • Fluticasone 250mcg

Note 2 - 2024-09-24 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Rosuvastatin 10mg
  • Atorvastatin 20mg

PATIENT QUESTION: What medications am I taking?

INSTRUCTIONS:
1. Answer the question in a clear, conversational way
2. Combine information from multiple visits if needed
3. Always cite the visit date like this: [Visit: 2024-05-01]
4. Explain what the medications/results mean if relevant
5. Do NOT just copy the notes - explain them

Your answer: 


⏳ Generating answer (this takes ~30-60 sec on CPU)...
✅ RAG pipeline created

🔍 Test Query: 'What medications am I taking?'
👤 Patient: patient_001

📊 Pipeline Output:
  • Chunks retrieved: 2
  • Citations found: 0

🤖 Answer:
To help a patient understand their medical records, here's an example of how 

In [None]:
# Test comprehensive query types
test_queries = {
    "medications": "What medications am I currently taking and why?",
    "lab_results": "What are my recent lab results and are any values abnormal?",
    "diagnosis": "What medical conditions am I being treated for?",
    "treatment_plan": "What is my treatment plan and when should I follow up?",
    "complex": "Looking at my records, which medications am I taking and what do my lab results show about how well they're working?"
}

test_patient_id = "patient_001"

print(f"🧪 Testing RAG pipeline with {len(test_queries)} query types")
print(f"👤 Patient: {test_patient_id}\n")

test_results = []

for category, query in test_queries.items():
    print(f"\n{'='*70}")
    print(f"CATEGORY: {category.upper()}")
    print(f"Query: {query}")
    print(f"{'='*70}")

    result = rag_qa_pipeline(query, test_patient_id, top_k=5)

    print(f"\n📊 Chunks used: {result['chunks_used']}")
    print(f"📎 Citations: {len(result['citations'])}")
    print(f"\n🤖 Answer:\n{result['answer']}\n")

    test_results.append({
        'category': category,
        'query': query,
        'answer': result['answer'],
        'chunks_used': result['chunks_used'],
        'num_citations': len(result['citations'])
    })

print("\n✅ All test queries completed")

🧪 Testing RAG pipeline with 5 query types
👤 Patient: patient_001


CATEGORY: MEDICATIONS
Query: What medications am I currently taking and why?
Prompt given: You are helping a patient understand their medical records.

CLINICAL NOTES:
Note 1 - 2024-05-01 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Montelukast 10mg
  • Fluticasone 250mcg

Note 2 - 2024-09-24 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Rosuvastatin 10mg
  • Atorvastatin 20mg

PATIENT QUESTION: What medications am I currently taking and why?

INSTRUCTIONS:
1. Answer the question in a clear, conversational way
2. Combine information from multiple visits if needed
3. Always cite the visit date like this: [Visit: 2024-05-01]
4. Explain what the medications/results mean if relevant
5. Do NOT just copy the notes - explain them

Your answer: 


⏳ Generating answer (this takes ~30-60 sec on CPU)...

📊 Chunks used: 2
📎 Citations: 0

🤖 Answer:
To help a patient understand their medical records, here's how you could provi

In [None]:
# Evaluate answer quality
print("📊 Evaluating Answer Quality...\n")

quality_metrics = []

for patient in patients[:3]:  # Test first 3 patients
    patient_id = patient['patient_id']

    for category, query in test_queries.items():
        result = rag_qa_pipeline(query, patient_id, top_k=5)

        # Quality checks
        has_answer = len(result['answer']) > 20
        has_citations = len(result['citations']) > 0
        used_chunks = result['chunks_used'] > 0
        no_hallucination = "I don't know" not in result['answer'] or result['chunks_used'] == 0

        quality_score = sum([has_answer, has_citations, used_chunks]) / 3

        quality_metrics.append({
            'patient_id': patient_id,
            'category': category,
            'chunks_retrieved': result['chunks_used'],
            'citations_count': len(result['citations']),
            'answer_length': len(result['answer']),
            'has_citations': has_citations,
            'quality_score': round(quality_score, 2)
        })

quality_df = pd.DataFrame(quality_metrics)

print("✅ Quality evaluation complete\n")
print("📊 Quality Metrics Summary:")
print(f"  • Avg chunks per answer: {quality_df['chunks_retrieved'].mean():.1f}")
print(f"  • Avg citations per answer: {quality_df['citations_count'].mean():.1f}")
print(f"  • Avg answer length: {quality_df['answer_length'].mean():.0f} chars")
print(f"  • Answers with citations: {(quality_df['has_citations'].sum() / len(quality_df) * 100):.1f}%")
print(f"  • Avg quality score: {quality_df['quality_score'].mean():.2f}/1.00")

print("\n📊 Quality by Category:")
print(quality_df.groupby('category')['quality_score'].mean().round(2))

📊 Evaluating Answer Quality...

Prompt given: You are helping a patient understand their medical records.

CLINICAL NOTES:
Note 1 - 2024-05-01 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Montelukast 10mg
  • Fluticasone 250mcg

Note 2 - 2024-09-24 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Rosuvastatin 10mg
  • Atorvastatin 20mg

PATIENT QUESTION: What medications am I currently taking and why?

INSTRUCTIONS:
1. Answer the question in a clear, conversational way
2. Combine information from multiple visits if needed
3. Always cite the visit date like this: [Visit: 2024-05-01]
4. Explain what the medications/results mean if relevant
5. Do NOT just copy the notes - explain them

Your answer: 


⏳ Generating answer (this takes ~30-60 sec on CPU)...
Prompt given: You are helping a patient understand their medical records.

CLINICAL NOTES:
Note 1 - 2024-05-01 (LAB RESULTS:):
LAB RESULTS:
  • Peak Flow: 323.0 [ABNORMAL]

Note 2 - 2024-06-08 (LAB RESULTS:):
LAB RESULTS:
  • HbA1c: 9

In [None]:
# Check if answers are faithful to retrieved chunks (no hallucination)
print("🔍 Running Faithfulness Check (Anti-Hallucination Test)...\n")

faithfulness_tests = []

# Test with queries that should NOT be answerable from the notes
unanswerable_queries = [
    "What is my blood type?",
    "Do I have any allergies to medications?",
    "What did I eat for breakfast today?",
    "What is my insurance provider?",
    "How tall am I?"
]

for patient in patients[:2]:
    patient_id = patient['patient_id']

    for query in unanswerable_queries:
        result = rag_qa_pipeline(query, patient_id, top_k=5)

        # Check if model correctly says it doesn't have the info
        answer_lower = result['answer'].lower()
        admitted_no_info = any(phrase in answer_lower for phrase in [
            "don't have",
            "not found",
            "no information",
            "not available",
            "cannot find"
        ])

        faithfulness_tests.append({
            'patient_id': patient_id,
            'query': query,
            'chunks_retrieved': result['chunks_used'],
            'admitted_no_info': admitted_no_info,
            'answer': result['answer'][:100] + '...'
        })

faithfulness_df = pd.DataFrame(faithfulness_tests)

print("✅ Faithfulness check complete\n")
print("📊 Hallucination Prevention:")
correctly_refused = faithfulness_df['admitted_no_info'].sum()
total_tests = len(faithfulness_df)
print(f"  • Correctly refused to answer: {correctly_refused}/{total_tests} ({correctly_refused/total_tests*100:.1f}%)")

if correctly_refused == total_tests:
    print("  ✅ PASSED: No hallucinations detected")
else:
    print(f"  ⚠️  WARNING: {total_tests - correctly_refused} potential hallucinations")
    print("\nExamples of problematic answers:")
    for _, row in faithfulness_df[~faithfulness_df['admitted_no_info']].head(2).iterrows():
        print(f"\nQuery: {row['query']}")
        print(f"Answer: {row['answer']}")

🔍 Running Faithfulness Check (Anti-Hallucination Test)...

✅ Faithfulness check complete

📊 Hallucination Prevention:
  • Correctly refused to answer: 10/10 (100.0%)
  ✅ PASSED: No hallucinations detected


In [None]:
# Create ground truth Q&A pairs for evaluation
print("📝 Creating Ground Truth Q&A Dataset...\n")

ground_truth_qa = []

# Define expected answers for validation
validation_cases = [
    {
        'patient_id': 'patient_001',
        'query': 'What medications is the patient taking?',
        'expected_keywords': ['metformin', 'medication'],
        'expected_sections': ['CURRENT MEDICATIONS']
    },
    {
        'patient_id': 'patient_001',
        'query': 'What are the recent lab results?',
        'expected_keywords': ['hba1c', 'lab', 'abnormal'],
        'expected_sections': ['LAB RESULTS', 'LABORATORY RESULTS']
    },
    {
        'patient_id': 'patient_002',
        'query': 'What is the primary diagnosis?',
        'expected_keywords': ['diabetes', 'hypertension', 'asthma', 'kidney', 'hyperlipidemia'],
        'expected_sections': ['ASSESSMENT']
    }
]

for case in validation_cases:
    result = rag_qa_pipeline(case['query'], case['patient_id'], top_k=5)

    # Check if answer contains expected keywords
    answer_lower = result['answer'].lower()
    keywords_found = [kw for kw in case['expected_keywords'] if kw in answer_lower]

    # Check if retrieved from expected sections
    sections_retrieved = [chunk['section'] for chunk in result['chunks']]
    expected_section_found = any(
        exp_sec in sec for exp_sec in case['expected_sections']
        for sec in sections_retrieved
    )

    ground_truth_qa.append({
        'patient_id': case['patient_id'],
        'query': case['query'],
        'answer': result['answer'],
        'chunks_used': result['chunks_used'],
        'citations': len(result['citations']),
        'keywords_found': len(keywords_found),
        'expected_section_found': expected_section_found,
        'validation_passed': len(keywords_found) > 0 and expected_section_found
    })

gt_df = pd.DataFrame(ground_truth_qa)

print(f"✅ Created {len(ground_truth_qa)} ground truth Q&A pairs\n")
print("📊 Validation Results:")
print(f"  • Passed: {gt_df['validation_passed'].sum()}/{len(gt_df)}")
print(f"  • Avg keywords found: {gt_df['keywords_found'].mean():.1f}")
print(f"  • Expected sections found: {gt_df['expected_section_found'].sum()}/{len(gt_df)}")

📝 Creating Ground Truth Q&A Dataset...

Prompt given: You are helping a patient understand their medical records.

CLINICAL NOTES:
Note 1 - 2024-09-24 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Rosuvastatin 10mg
  • Atorvastatin 20mg

Note 2 - 2024-05-01 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Montelukast 10mg
  • Fluticasone 250mcg

Note 3 - 2024-09-02 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Atorvastatin 20mg

Note 4 - 2024-06-08 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Insulin Glargine

PATIENT QUESTION: What medications is the patient taking?

INSTRUCTIONS:
1. Answer the question in a clear, conversational way
2. Combine information from multiple visits if needed
3. Always cite the visit date like this: [Visit: 2024-05-01]
4. Explain what the medications/results mean if relevant
5. Do NOT just copy the notes - explain them

Your answer: 


⏳ Generating answer (this takes ~30-60 sec on CPU)...
✅ Created 3 ground truth Q&A pairs

📊 Validation Results:
 

In [None]:
# Save all Q&A pairs and metrics
quality_df.to_csv(f"{QA_DIR}/answer_quality_metrics.csv", index=False)
faithfulness_df.to_csv(f"{QA_DIR}/faithfulness_tests.csv", index=False)
gt_df.to_csv(f"{QA_DIR}/ground_truth_qa.csv", index=False)

# Save example Q&A pairs
examples = []
for patient in patients[:2]:
    for category, query in list(test_queries.items())[:3]:
        result = rag_qa_pipeline(query, patient['patient_id'], top_k=3)
        examples.append({
            'patient_id': patient['patient_id'],
            'category': category,
            'query': query,
            'answer': result['answer'],
            'chunks_used': result['chunks_used'],
            'citations': result['citations']
        })

examples_path = f"{QA_DIR}/example_qa_pairs.json"
with open(examples_path, 'w') as f:
    json.dump(examples, f, indent=2)

print(f"✅ Saved outputs to: {QA_DIR}")
print(f"  • Quality metrics: answer_quality_metrics.csv")
print(f"  • Faithfulness tests: faithfulness_tests.csv")
print(f"  • Ground truth: ground_truth_qa.csv")
print(f"  • Examples: example_qa_pairs.json")

Prompt given: You are helping a patient understand their medical records.

CLINICAL NOTES:
Note 1 - 2024-05-01 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Montelukast 10mg
  • Fluticasone 250mcg

Note 2 - 2024-09-24 (CURRENT MEDICATIONS:):
CURRENT MEDICATIONS:
  • Rosuvastatin 10mg
  • Atorvastatin 20mg

PATIENT QUESTION: What medications am I currently taking and why?

INSTRUCTIONS:
1. Answer the question in a clear, conversational way
2. Combine information from multiple visits if needed
3. Always cite the visit date like this: [Visit: 2024-05-01]
4. Explain what the medications/results mean if relevant
5. Do NOT just copy the notes - explain them

Your answer: 


⏳ Generating answer (this takes ~30-60 sec on CPU)...
Prompt given: You are helping a patient understand their medical records.

CLINICAL NOTES:
Note 1 - 2024-05-01 (LAB RESULTS:):
LAB RESULTS:
  • Peak Flow: 323.0 [ABNORMAL]

Note 2 - 2024-06-08 (LAB RESULTS:):
LAB RESULTS:
  • HbA1c: 9.2 [ABNORMAL]
  • Fasting Glucos

In [11]:
# MLOps artifact logging (with int64 fix)
mlops_log = {
    "notebook": "05_qa_generation",
    "execution_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model": {
        "llm": "TinyLlama-1.1B-Chat",  # Updated to reflect actual model
        "embedding": MODEL_NAME,
        "temperature": 0.7  # Updated to match actual config
    },
    "retrieval_config": {
        "top_k": 3,  # Updated to match actual config
        "similarity_threshold": 0.0  # Updated to match actual config
    },
    "testing": {
        "patients_tested": int(len(patients[:3])),  # Convert to Python int
        "query_types": int(len(test_queries)) if 'test_queries' in locals() else 5,
        "total_qa_generated": int(len(quality_df)) if 'quality_df' in locals() else 0
    },
    "quality_metrics": {
        "avg_chunks_per_answer": float(round(quality_df['chunks_retrieved'].mean(), 2)) if 'quality_df' in locals() else 0.0,
        "avg_citations_per_answer": float(round(quality_df['citations_count'].mean(), 2)) if 'quality_df' in locals() else 0.0,
        "answers_with_citations_pct": float(round((quality_df['has_citations'].sum() / len(quality_df) * 100), 1)) if 'quality_df' in locals() and len(quality_df) > 0 else 0.0,
        "avg_quality_score": float(round(quality_df['quality_score'].mean(), 3)) if 'quality_df' in locals() else 0.0
    },
    "faithfulness": {
        "tests_run": int(len(faithfulness_df)) if 'faithfulness_df' in locals() else 0,
        "correctly_refused": int(faithfulness_df['admitted_no_info'].sum()) if 'faithfulness_df' in locals() else 0,
        "hallucination_rate": float(round((1 - faithfulness_df['admitted_no_info'].sum() / len(faithfulness_df)) * 100, 1)) if 'faithfulness_df' in locals() and len(faithfulness_df) > 0 else 0.0
    },
    "validation": {
        "ground_truth_cases": int(len(gt_df)) if 'gt_df' in locals() else 0,
        "validation_passed": int(gt_df['validation_passed'].sum()) if 'gt_df' in locals() else 0
    }
}

mlops_log_path = f"{OUTPUT_DIR}/mlops_qa_generation_log.json"
with open(mlops_log_path, 'w') as f:
    json.dump(mlops_log, f, indent=2)

print("\n" + "="*70)
print("✅ NOTEBOOK 05 COMPLETE")
print("="*70)

# Safe printing with checks
avg_chunks = quality_df['chunks_retrieved'].mean() if 'quality_df' in locals() and len(quality_df) > 0 else 0
avg_citations = quality_df['citations_count'].mean() if 'quality_df' in locals() and len(quality_df) > 0 else 0
citation_pct = (quality_df['has_citations'].sum() / len(quality_df) * 100) if 'quality_df' in locals() and len(quality_df) > 0 else 0
quality_score = quality_df['quality_score'].mean() if 'quality_df' in locals() and len(quality_df) > 0 else 0
hallucination = mlops_log['faithfulness']['hallucination_rate']

print(f"""
📂 Outputs:
  • MLOps log: {mlops_log_path}
  • Quality metrics: {QA_DIR}/answer_quality_metrics.csv (if cells 5-8 were run)
  • Faithfulness tests: {QA_DIR}/faithfulness_tests.csv (if cells 6-7 were run)
  • Ground truth Q&A: {QA_DIR}/ground_truth_qa.csv (if cell 8 was run)

📊 Performance Summary:
  • Avg chunks per answer: {avg_chunks:.1f}
  • Avg citations per answer: {avg_citations:.1f}
  • Answers with citations: {citation_pct:.1f}%
  • Quality score: {quality_score:.2f}/1.00
  • Hallucination rate: {hallucination}%

✅ RAG System Status: READY FOR DEMO
""")


✅ NOTEBOOK 05 COMPLETE

📂 Outputs:
  • MLOps log: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/05_qa_generation/outputs/mlops_qa_generation_log.json
  • Quality metrics: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/05_qa_generation/outputs/qa_pairs_validation/answer_quality_metrics.csv (if cells 5-8 were run)
  • Faithfulness tests: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/05_qa_generation/outputs/qa_pairs_validation/faithfulness_tests.csv (if cells 6-7 were run)
  • Ground truth Q&A: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/05_qa_generation/outputs/qa_pairs_validation/ground_truth_qa.csv (if cell 8 was run)

📊 Performance Summary:
  • Avg chunks per answer: 1.1
  • Avg citations per answer: 0.0
  • Answers with citations: 0.0%
  • Quality score: 0.49/1.00
  • Hallucination rate: 0.0%

✅ RAG System Status: READY FOR DEMO

