In [1]:
!pip install -q google-generativeai faiss-cpu pandas numpy tqdm streamlit


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q faiss-cpu

In [3]:
import os
import json
import zipfile
import pandas as pd
from tqdm import tqdm
import numpy as np
import faiss
import google.generativeai as genai

In [5]:
zip_path = "/content/mimic-iv-ext-direct-1.0.0.zip"
extract_path = "/content/mimic-iv-ext/"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Dataset extracted to:", extract_path)


Dataset extracted to: /content/mimic-iv-ext/


In [43]:
for root, dirs, files in os.walk(extract_path):
    for f in files:
        if f.endswith(".json"):
            print(os.path.join(root, f))


/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/Finished/Migraine/Migraine With Aura/._17676552-DS-10.json
/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/Finished/Migraine/Migraine With Aura/._18427803-DS-5.json
/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/Finished/Migraine/Migraine With Aura/._18805216-DS-21.json
/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/Finished/Migraine/Migraine Without Aura/._17185323-DS-14.json
/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/Finished/Thyroid Disease/Hypothyroidism/._11697485-DS-10.json
/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/Finished/Thyroid Disease/Hypothyroidism/._14596661-DS-6.json
/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/Finished/Thyroid Disease/Hypothyroidism/._13561991-DS-10.json
/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/Finished/Thyroid Disease/Thyroid Nodules/._16973388-DS-20.json
/content/mimic-iv-ext/__MACOSX/mimic-iv-ext-direct-1.0.0/F

In [45]:
documents = []

for root, _, files in os.walk(extract_path):
    for f in files:
        if f.endswith(".json"):
            file_path = os.path.join(root, f)
            with open(file_path, "r") as file:
                try:
                    data = json.load(file)
                except:
                    continue

                # Add each JSON record as a document
                if isinstance(data, list):
                    for record in data:
                        # Convert JSON to readable text
                        readable_text = json.dumps(record, indent=0).replace('\n', ' ')
                        documents.append(readable_text)
                else:
                    readable_text = json.dumps(data, indent=0).replace('\n', ' ')
                    documents.append(readable_text)

print("Total Documents Loaded:", len(documents))

# Show sample without breaking loop
if documents:
    print("\n📄 Sample document (first 200 chars):")
    print(documents[0][:200] + "...")

Total Documents Loaded: 511

📄 Sample document (first 200 chars):
{ "Migraine With Aura$Intermedia_3": { "Difficulty expressing language may be associated with migraine, especially when migraine is accompanied by neurological symptoms$Cause_1": { "Difficulty produci...


In [46]:
print("\n🔧 Cleaning JSON documents for better readability...")

def clean_json_document(doc):
    """Clean JSON documents to remove clutter and improve readability"""
    import re

    if isinstance(doc, str):
        # Remove excessive JSON formatting
        doc = re.sub(r'\s*\{\s*"', '"', doc)
        doc = re.sub(r'"\s*:\s*\{', ': ', doc)
        doc = re.sub(r'\}\s*,?\s*\}', '}', doc)
        doc = re.sub(r'\$[A-Za-z0-9_]+', '', doc)  # Remove $Intermedia_5 etc
        doc = re.sub(r'\*?\$Input\d+', '', doc)
        doc = re.sub(r'([A-Z][a-z]+)\$\w+', r'\1', doc)

        # Keep it readable length
        if len(doc) > 800:
            doc = doc[:800] + "..."

    return doc

# Apply cleaning
documents = [clean_json_document(doc) for doc in documents]

print(f"✅ Documents cleaned! Now have {len(documents)} documents")
print("\n📄 Before cleaning preview:")
print("Raw: Migraine With Aura$Intermedia_3...")
print("\n📄 After cleaning preview:")
print(documents[0][:200] + "...")



🔧 Cleaning JSON documents for better readability...
✅ Documents cleaned! Now have 511 documents

📄 Before cleaning preview:
Raw: Migraine With Aura$Intermedia_3...

📄 After cleaning preview:
"Migraine With Aura":"Difficulty expressing language may be associated with migraine, especially when migraine is accompanied by neurological symptoms":"Difficulty producing speech: }, "Persistent hea...


In [20]:
!pip install -q sentence-transformers


In [21]:
from sentence_transformers import SentenceTransformer

In [48]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # 80MB, downloads in 5s

embeddings = model.encode(documents, batch_size=32, show_progress_bar=True)
print("First document embedding (first 10 dimensions):", embeddings[0][:10])
print(f"✅ Done! Shape: {embeddings.shape}")

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

First document embedding (first 10 dimensions): [ 0.05549387 -0.0644675   0.04017159  0.03721104 -0.05382313  0.06045882
  0.12579285  0.0084793   0.00986471 -0.02953939]
✅ Done! Shape: (511, 384)


In [49]:
import faiss
import numpy as np

# Convert to float32 FIRST
embeddings_f32 = embeddings.astype('float32')  # ⬅️ CRITICAL

d = embeddings_f32.shape[1]   # dimension
index = faiss.IndexFlatL2(d)
index.add(embeddings_f32)     # ⬅️ Use float32 version

print("✅ FAISS index created with", index.ntotal, "documents of dimension", d)

# Test it works
test_query = "migraine headache"
test_emb = model.encode([test_query]).astype('float32')
distances, indices = index.search(test_emb, 3)

print(f"\n🔍 Test query: '{test_query}'")
print(f"Retrieved documents: {indices[0]}")
print(f"Unique documents: {len(set(indices[0]))}")

✅ FAISS index created with 511 documents of dimension 384

🔍 Test query: 'migraine headache'
Retrieved documents: [1 0 3]
Unique documents: 3


In [50]:
def retrieve_with_scores(query, top_k=5):
    """Retrieve documents with similarity scores"""
    q_emb = model.encode([query]).astype('float32')
    distances, idx = index.search(q_emb, top_k)

    # Print what happened during retrieval
    print(f"Query: '{query}'")
    print(f"Query embedding shape: {q_emb.shape}")
    print(f"Retrieved indices: {idx[0]}")
    print(f"Distances: {distances[0]}")

    results = []
    for i, (doc_idx, distance) in enumerate(zip(idx[0], distances[0])):
        if doc_idx != -1:  # ⬅️ ADD THIS LINE to skip invalid indices
            similarity = 1 / (1 + distance)  # Simple conversion
            results.append({
                'rank': i+1,
                'document': documents[doc_idx],
                'similarity': round(float(similarity), 4)
            })

    print(f"Top result similarity score: {results[0]['similarity'] if results else 'No results'}")
    return results

In [51]:
genai.configure(api_key="AIzaSyCxL29qRgP96gaWKCU-p-yvFKgvq2J52gM")

In [53]:
generation_model = genai.GenerativeModel("gemini-2.5-flash")

def rag_answer_smart(query, top_k=3):
    """Smart RAG: Uses patient records when relevant, supplements with general knowledge"""

    print(f"\n{'='*60}")
    print(f"🤖 SMART RAG Pipeline Started")
    print(f"📝 Query: '{query}'")
    print(f"🔍 Retrieving top {top_k} most relevant documents...")

    # 1. Retrieve documents with similarity scores
    retrieved = retrieve_with_scores(query, top_k=top_k)

    print(f"📄 Documents retrieved: {len(retrieved)}")
    if retrieved:
        print(f"📊 Highest similarity: {retrieved[0]['similarity']:.4f}")
        print(f"📊 Lowest similarity: {retrieved[-1]['similarity']:.4f}")

    # 2. Check relevance threshold
    all_low_relevance = all(r['similarity'] < 0.3 for r in retrieved)

    if all_low_relevance and retrieved:
        print(f"⚠️  All documents have low relevance (< 0.3) - using general knowledge mode")
        # GENERAL KNOWLEDGE MODE
        prompt = f"""MEDICAL QUESTION: {query}

You are a medical expert. Provide accurate, evidence-based information.
Follow these guidelines:
1. Be comprehensive and factual
2. Cite sources/guidelines if applicable (e.g., ICHD-3, ACC/AHA)
3. Include diagnostic criteria, symptoms, treatments as relevant
4. Use clear formatting with bullet points or numbered lists
5. If query mentions specific patient terms, explain them generally

PROFESSIONAL ANSWER:"""

        mode = "general_knowledge"

    else:
        # RAG + SUPPLEMENT MODE
        print(f"✅ Found relevant documents - using RAG with supplementation")

        # Format context with relevance scores
        ctx_formatted = []
        for i, r in enumerate(retrieved):
            # Show relevance indicator
            relevance_indicator = "🟢" if r['similarity'] > 0.4 else "🟡" if r['similarity'] > 0.3 else "🔴"
            doc_preview = r['document'][:500] + "..." if len(r['document']) > 500 else r['document']
            ctx_formatted.append(f"{relevance_indicator} [Document {i+1}, Relevance: {r['similarity']:.3f}]:\n{doc_preview}")

        context_str = "\n\n".join(ctx_formatted)

        print(f"📋 Context prepared: {len(ctx_formatted)} documents")
        print(f"📏 Total context length: {len(context_str)} characters")

        prompt = f"""You are a medical analyst assistant. Analyze the query and provide the best answer.

USER QUESTION: {query}

RELEVANT PATIENT RECORDS (Most relevant first):
{context_str}

INSTRUCTIONS:
1. **FIRST** check if the patient records contain relevant information
2. If records ARE relevant:
   - Start with "Based on patient records:"
   - Extract and summarize information from the records
   - Cite specific documents (Document 1, Document 2, etc.)
3. If records ARE NOT sufficient:
   - Acknowledge what was found in records
   - Add "Supplementing with general medical knowledge:"
   - Provide comprehensive information from general knowledge
4. If records are completely irrelevant to the query:
   - State "No relevant patient records found"
   - Provide a complete general medical answer
5. Always maintain medical accuracy and clarity

STRUCTURED ANSWER:"""

        mode = "rag_with_supplement"

    print(f"💭 Generating response ({mode})...")

    try:
        response = generation_model.generate_content(prompt)
        print(f"✅ Response generated successfully ({len(response.text)} characters)")
        print(f"{'='*60}\n")

        # Add mode info to return
        return response.text, retrieved, mode

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        print(f"{'='*60}\n")
        return f"Error: {str(e)}", retrieved, "error"

In [54]:
# Test with your migraine query
query = "What are the diagnostic criteria for Migraine With Aura?"
answer, sources, mode = rag_answer_smart(query)

print("ANSWER:")
print(answer)

if sources:
    print(f"\nBased on {len(sources)} patient records")
else:
    print("\n⚠️ No relevant records found - using general medical knowledge")


🤖 SMART RAG Pipeline Started
📝 Query: 'What are the diagnostic criteria for Migraine With Aura?'
🔍 Retrieving top 3 most relevant documents...
Query: 'What are the diagnostic criteria for Migraine With Aura?'
Query embedding shape: (1, 384)
Retrieved indices: [0 2 3]
Distances: [0.5405734  0.6228178  0.62445223]
Top result similarity score: 0.6491
📄 Documents retrieved: 3
📊 Highest similarity: 0.6491
📊 Lowest similarity: 0.6156
✅ Found relevant documents - using RAG with supplementation
📋 Context prepared: 3 documents
📏 Total context length: 1615 characters
💭 Generating response (rag_with_supplement)...
✅ Response generated successfully (1740 characters)

ANSWER:
Based on patient records:
The records indicate that Migraine With Aura can be associated with "Difficulty expressing language," "Difficulty producing speech," and "Reduced speech output," which may present as impaired or altered quality of speech alongside persistent headaches (Document 1). Visual symptoms like "Blurred visio

In [55]:
# Test with queries that SHOULD match your documents
specific_queries = [
    "Show me Document 1 content",
    "What does the first patient record say?",
    "Extract information from Document 2",
    "What medications are mentioned in the records?"
]

print("\n🧪 Testing with document-specific queries:")
for query in specific_queries:
    print(f"\n🔍 Query: {query}")
    answer, sources, mode = rag_answer_smart(query)
    print(f"Mode: {mode} | Similarity: {sources[0]['similarity']:.3f}")
    print(f"Answer: {answer[:200]}...")


🧪 Testing with document-specific queries:

🔍 Query: Show me Document 1 content

🤖 SMART RAG Pipeline Started
📝 Query: 'Show me Document 1 content'
🔍 Retrieving top 3 most relevant documents...
Query: 'Show me Document 1 content'
Query embedding shape: (1, 384)
Retrieved indices: [467 167 456]
Distances: [1.7588716 1.7876325 1.8004951]
Top result similarity score: 0.3625
📄 Documents retrieved: 3
📊 Highest similarity: 0.3625
📊 Lowest similarity: 0.3571
✅ Found relevant documents - using RAG with supplementation
📋 Context prepared: 3 documents
📏 Total context length: 1615 characters
💭 Generating response (rag_with_supplement)...
✅ Response generated successfully (521 characters)

Mode: rag_with_supplement | Similarity: 0.362
Answer: Based on patient records:
Document 1 discusses "Ischemic Stroke." It describes evidence of multiple acute cerebral infarctions, indicating ischemic damage to brain tissue, specifically noting tiny acu...

🔍 Query: What does the first patient record say?

🤖 SM

In [58]:
import pickle
import numpy as np
import faiss

# Save documents
with open('documents.pkl', 'wb') as f:
    pickle.dump(documents, f)

# Save embeddings
np.save('embeddings.npy', embeddings)

# Save FAISS index
faiss.write_index(index, 'faiss_index.faiss')

print("✅ Documents, embeddings, and index saved!")

✅ Documents, embeddings, and index saved!


In [65]:
def test_retrieval(query, expected_keywords, top_k=3):
    """Test if retrieval finds documents with specific keywords"""
    results = retrieve_with_scores(query, top_k=top_k)

    print(f"\n🔍 Query: '{query}'")
    print(f"📊 Retrieved {len(results)} documents")

    for i, r in enumerate(results):
        doc_text = r['document'].lower()
        found_keywords = [kw for kw in expected_keywords if kw in doc_text]

        print(f"\nDocument {i+1} (Similarity: {r['similarity']:.3f}):")
        print(f"  Contains keywords: {found_keywords}")
        print(f"  Preview: {doc_text[:100]}...")

    return results

# Test with known queries
test_cases = [
    ("migraine", ["migraine", "aura", "headache"]),
    ("heart condition", ["heart", "cardiac", "hypertension"]),
    ("medication", ["aspirin", "drug", "prescription"])
]

for query, keywords in test_cases:
    test_retrieval(query, keywords)

Query: 'migraine'
Query embedding shape: (1, 384)
Retrieved indices: [0 1 2]
Distances: [0.59528196 0.6474576  0.6938983 ]
Top result similarity score: 0.6268

🔍 Query: 'migraine'
📊 Retrieved 3 documents

Document 1 (Similarity: 0.627):
  Contains keywords: ['migraine', 'aura', 'headache']
  Preview: "migraine with aura":"difficulty expressing language may be associated with migraine, especially whe...

Document 2 (Similarity: 0.607):
  Contains keywords: ['migraine', 'aura']
  Preview: "migraine with aura":"chills or rigors may be part of a migraine aura, reflecting changes in the aut...

Document 3 (Similarity: 0.590):
  Contains keywords: ['migraine', 'aura', 'headache']
  Preview: "migraine with aura":"blurred vision can be one of the visual symptoms associated with migraines, ca...
Query: 'heart condition'
Query embedding shape: (1, 384)
Retrieved indices: [102 428  85]
Distances: [0.8819189  0.9011301  0.90323806]
Top result similarity score: 0.5314

🔍 Query: 'heart condition'
📊 

In [66]:
# ================ 1. FIRST DEFINE EVALUATION FUNCTION ================
def evaluate_rag_triplet(query, answer, sources, ground_truth=None):
    """
    Evaluate RAG output on three metrics:
    1. Accuracy - Factual correctness
    2. Coherence - Logical flow and readability
    3. Relevance - Connection to query and sources
    """

    evaluation = {}

    # 1. ACCURACY SCORE (0-1)
    accuracy_score = 0.0

    # Rule-based accuracy checks
    accuracy_checks = []
    accuracy_checks.append(0.2 if "not a doctor" in answer.lower() or "information only" in answer.lower() else 0)
    accuracy_checks.append(0.2 if not any(word in answer.lower() for word in ["always", "never", "definitely", "guaranteed"]) else 0)
    accuracy_checks.append(0.2 if ("document" in answer.lower() or "source" in answer.lower()) else 0)
    accuracy_checks.append(0.2 if any(word in answer.lower() for word in ["may", "can", "sometimes", "often"]) else 0)
    accuracy_checks.append(0.2 if "cure" not in answer.lower() and "diagnose" not in answer.lower() else 0)

    accuracy_score = sum(accuracy_checks)

    # If ground truth provided, compare
    if ground_truth:
        answer_words = set(answer.lower().split())
        truth_words = set(ground_truth.lower().split())
        overlap = len(answer_words.intersection(truth_words)) / max(len(truth_words), 1)
        accuracy_score = max(accuracy_score, overlap)

    evaluation['accuracy'] = round(min(accuracy_score, 1.0), 3)

    # 2. COHERENCE SCORE (0-1)
    coherence_checks = []
    coherence_checks.append(0.25 if any(marker in answer for marker in ["•", "-", "1.", "2.", "**", "\n\n"]) else 0)
    coherence_checks.append(0.25 if 100 <= len(answer) <= 2000 else 0)
    coherence_checks.append(0.25 if answer.count('.') >= 2 else 0)
    coherence_checks.append(0.25 if answer.count('\n\n') >= 1 else 0)

    coherence_score = sum(coherence_checks)
    evaluation['coherence'] = round(min(coherence_score, 1.0), 3)

    # 3. RELEVANCE SCORE (0-1)
    relevance_score = 0.0

    if sources:
        top_sources = sources[:min(3, len(sources))]
        doc_relevance = sum(s['similarity'] for s in top_sources) / len(top_sources)

        query_words = set(query.lower().split())
        answer_words = set(answer.lower().split())
        word_overlap = len(query_words.intersection(answer_words)) / max(len(query_words), 1)

        relevance_score = 0.6 * doc_relevance + 0.4 * min(word_overlap, 1.0)
    else:
        query_keywords = ["what", "how", "when", "where", "why", "explain", "describe", "list"]
        if any(kw in query.lower() for kw in query_keywords) and len(answer) > 50:
            relevance_score = 0.5

    evaluation['relevance'] = round(min(relevance_score, 1.0), 3)

    # OVERALL SCORE
    evaluation['overall'] = round(
        0.4 * evaluation['accuracy'] +
        0.3 * evaluation['coherence'] +
        0.3 * evaluation['relevance'], 3
    )

    return evaluation


# ================ 2. THEN DEFINE BATCH EVALUATION ================
def batch_evaluate_rag_smart(test_queries, ground_truth_dict=None):
    """Evaluate multiple queries using SMART RAG"""

    results = []

    print("📊 SMART RAG EVALUATION RESULTS")
    print("="*70)
    print(f"{'Query':<40} {'Accuracy':<10} {'Coherence':<10} {'Relevance':<10} {'Overall':<10}")
    print("-"*70)

    for query in test_queries:
        # Get SMART RAG answer
        answer, sources, mode = rag_answer_smart(query)

        # Get ground truth if available
        ground_truth = ground_truth_dict.get(query) if ground_truth_dict else None

        # Evaluate using the function defined above
        eval_result = evaluate_rag_triplet(query, answer, sources, ground_truth)

        # Store results
        result = {
            'query': query,
            'answer': answer[:100] + "..." if len(answer) > 100 else answer,
            'sources_used': len(sources),
            'top_similarity': sources[0]['similarity'] if sources else 0,
            'mode': mode,
            **eval_result
        }
        results.append(result)

        # Print row
        print(f"{query[:37]:<37}... {eval_result['accuracy']:<10.3f} {eval_result['coherence']:<10.3f} "
              f"{eval_result['relevance']:<10.3f} {eval_result['overall']:<10.3f}")

    print("="*70)

    # Calculate averages
    if results:
        avg_accuracy = sum(r['accuracy'] for r in results) / len(results)
        avg_coherence = sum(r['coherence'] for r in results) / len(results)
        avg_relevance = sum(r['relevance'] for r in results) / len(results)
        avg_overall = sum(r['overall'] for r in results) / len(results)

        print(f"\n📈 AVERAGES:")
        print(f"Accuracy:  {avg_accuracy:.3f}")
        print(f"Coherence: {avg_coherence:.3f}")
        print(f"Relevance: {avg_relevance:.3f}")
        print(f"Overall:   {avg_overall:.3f}")

        # Show mode distribution
        modes = [r['mode'] for r in results]
        mode_counts = {mode: modes.count(mode) for mode in set(modes)}
        print(f"\n🎯 MODE USAGE:")
        for mode, count in mode_counts.items():
            print(f"  {mode}: {count} queries")

    return results


# ================ 3. TEST DATA ================
test_queries = [
    "What is migraine with aura?",
    "What heart conditions are mentioned in records?",
    "Describe patient symptoms",
    "What medications are used?"
]

ground_truth_examples = {
    "What is migraine with aura?": "Migraine with aura involves neurological symptoms before headache including visual disturbances.",
    "What heart conditions are mentioned in records?": "Heart conditions may include coronary artery disease, heart failure, or hypertension.",
}


# ================ 4. RUN EVALUATION ================
print("🧪 Evaluating SMART RAG System...")
evaluation_results = batch_evaluate_rag_smart(test_queries, ground_truth_examples)

# ================ 5. SHOW DETAILED RESULTS ================
print("\n🔍 DETAILED RESULTS:")
for result in evaluation_results:
    print(f"\nQuery: {result['query']}")
    print(f"Mode: {result['mode']}")
    print(f"Answer preview: {result['answer']}")
    print(f"Sources: {result['sources_used']}, Top similarity: {result['top_similarity']:.3f}")
    print(f"Scores - Accuracy: {result['accuracy']:.3f}, Coherence: {result['coherence']:.3f}, "
          f"Relevance: {result['relevance']:.3f}, Overall: {result['overall']:.3f}")

🧪 Evaluating SMART RAG System...
📊 SMART RAG EVALUATION RESULTS
Query                                    Accuracy   Coherence  Relevance  Overall   
----------------------------------------------------------------------

🤖 SMART RAG Pipeline Started
📝 Query: 'What is migraine with aura?'
🔍 Retrieving top 3 most relevant documents...
Query: 'What is migraine with aura?'
Query embedding shape: (1, 384)
Retrieved indices: [2 0 3]
Distances: [0.6261587  0.63675845 0.6846194 ]
Top result similarity score: 0.6149
📄 Documents retrieved: 3
📊 Highest similarity: 0.6149
📊 Lowest similarity: 0.5936
✅ Found relevant documents - using RAG with supplementation
📋 Context prepared: 3 documents
📏 Total context length: 1615 characters
💭 Generating response (rag_with_supplement)...
✅ Response generated successfully (775 characters)

What is migraine with aura?          ... 0.800      1.000      0.604      0.801     

🤖 SMART RAG Pipeline Started
📝 Query: 'What heart conditions are mentioned in records?'


In [71]:
def analyze_rag_errors_fixed(test_queries):
    """Deep dive into potential errors and limitations - FIXED VERSION"""

    print("🔍 RAG ERROR ANALYSIS (FIXED)")
    print("="*70)

    error_categories = {
        'retrieval_errors': [],
        'generation_errors': [],
        'medical_safety_errors': [],
        'formatting_issues': []
    }

    for query in test_queries:
        print(f"\n{'='*60}")
        print(f"Analyzing: '{query}'")

        # ✅ FIXED: Get all 3 values from rag_answer_smart
        answer, sources, mode = rag_answer_smart(query)

        print(f"Mode used: {mode}")

        # 1. RETRIEVAL ERROR ANALYSIS
        print("\n📊 RETRIEVAL ANALYSIS:")
        if not sources:
            print("❌ CRITICAL: No documents retrieved")
            error_categories['retrieval_errors'].append(query)
        else:
            print(f"✅ Documents retrieved: {len(sources)}")

            # Check similarity distribution
            sim_scores = [s['similarity'] for s in sources]
            print(f"   Similarity range: {min(sim_scores):.3f} - {max(sim_scores):.3f}")

            if max(sim_scores) < 0.3:
                print("⚠️ WARNING: Low similarity scores (< 0.3)")
                error_categories['retrieval_errors'].append(f"{query} - low similarity")

        # 2. GENERATION ERROR ANALYSIS
        print("\n🤖 GENERATION ANALYSIS:")

        # Check for hallucinations
        if sources and len(sources) > 0:
            # Extract document keywords
            doc_keywords = set()
            for s in sources[:2]:
                doc_text = s['document'].lower()
                doc_keywords.update(doc_text.split()[:20])

            # Check if answer introduces new concepts
            answer_words = set(answer.lower().split())
            new_words = answer_words - doc_keywords

            medical_terms = ["treat", "cure", "diagnose", "prescribe", "recommend"]
            if any(term in answer.lower() for term in medical_terms):
                print("⚠️ WARNING: Potential medical advice/hallucination")
                error_categories['medical_safety_errors'].append(query)

        # Check for "not found" patterns
        not_found_phrases = ["not found", "no information", "not available", "does not contain"]
        if any(phrase in answer.lower() for phrase in not_found_phrases):
            print("⚠️ NOTE: Answer indicates missing information")

        # 3. FORMATTING ANALYSIS
        print("\n📝 FORMATTING ANALYSIS:")

        # Check length
        if len(answer) < 100:
            print("⚠️ WARNING: Answer very short (< 100 chars)")
            error_categories['formatting_issues'].append(f"{query} - too short")
        elif len(answer) > 1000:
            print("⚠️ NOTE: Answer very long (> 1000 chars)")

        # Check citations
        if "document" not in answer.lower() and "source" not in answer.lower():
            print("⚠️ WARNING: No document citations found")
            error_categories['formatting_issues'].append(f"{query} - no citations")

        # Check structure
        if not any(marker in answer for marker in ["•", "-", "\n", "1.", "2."]):
            print("⚠️ NOTE: Answer lacks structured formatting")

    # SUMMARY
    print(f"\n{'='*70}")
    print("📋 ERROR SUMMARY")
    print("="*70)

    total_errors = sum(len(errors) for errors in error_categories.values())

    for category, errors in error_categories.items():
        print(f"\n{category.upper().replace('_', ' ')}:")
        if errors:
            for error in errors:
                print(f"  ⚠️ {error}")
        else:
            print("  ✅ No issues detected")

    print(f"\n📊 TOTAL ERRORS FOUND: {total_errors}")

    if total_errors == 0:
        print("🎉 EXCELLENT: No significant errors detected!")
    elif total_errors <= 3:
        print("✅ GOOD: Minor issues only")
    else:
        print("⚠️ NEEDS ATTENTION: Multiple issues detected")

    return error_categories

# Run the FIXED error analysis
print("🧪 Starting Fixed Error Analysis...")
errors = analyze_rag_errors_fixed(test_queries)

🧪 Starting Fixed Error Analysis...
🔍 RAG ERROR ANALYSIS (FIXED)

Analyzing: 'What is migraine with aura?'

🤖 SMART RAG Pipeline Started
📝 Query: 'What is migraine with aura?'
🔍 Retrieving top 3 most relevant documents...
Query: 'What is migraine with aura?'
Query embedding shape: (1, 384)
Retrieved indices: [2 0 3]
Distances: [0.6261587  0.63675845 0.6846194 ]
Top result similarity score: 0.6149
📄 Documents retrieved: 3
📊 Highest similarity: 0.6149
📊 Lowest similarity: 0.5936
✅ Found relevant documents - using RAG with supplementation
📋 Context prepared: 3 documents
📏 Total context length: 1615 characters
💭 Generating response (rag_with_supplement)...
✅ Response generated successfully (2007 characters)

Mode used: rag_with_supplement

📊 RETRIEVAL ANALYSIS:
✅ Documents retrieved: 3
   Similarity range: 0.594 - 0.615

🤖 GENERATION ANALYSIS:

📝 FORMATTING ANALYSIS:
⚠️ NOTE: Answer very long (> 1000 chars)

Analyzing: 'What heart conditions are mentioned in records?'

🤖 SMART RAG Pipeline 