In [2]:

# Evaluates three retrieval methods -- "BM25" (implemented here as a TF-IDF
# cosine-similarity baseline), dense, and hybrid -- using precision, recall,
# MRR, and coverage metrics on a fixed, hand-written query dataset.

import pandas as pd
import numpy as np
import pickle
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
import re
from typing import List, Dict, Tuple, Optional
from collections import Counter
import json
from pathlib import Path

# ================================================================
# Comprehensive Evaluator Class
# ================================================================
class ComprehensiveEvaluator:
    """Comprehensive evaluation framework for retrieval and answer generation.

    Typical use: ``load_retrieval_system()`` followed by
    ``run_comprehensive_evaluation()`` and ``save_evaluation_results()``.

    Attributes set by the methods below:
        chunks: list of chunk dicts read from the metadata pickle; each is
            accessed via ``['text']``, ``['chunk_type']``, ``['priority']``
            and ``['metadata']['title']``.
        embeddings: 2-D array loaded from ``<prefix>_embeddings.npy``.
        model: SentenceTransformer instance, or None when the stored model
            name is ``'tfidf_fallback'``.
        evaluation_queries / ground_truth: the hand-written query set.
        retrieval_results / evaluation_metrics: per-method outputs keyed by
            ``'bm25'``, ``'dense'`` and ``'hybrid'``.
    """

    def __init__(self):
        self.chunks = []
        self.embeddings = None
        self.model = None
        self.evaluation_queries = []
        self.ground_truth = {}
        self.retrieval_results = {}
        self.evaluation_metrics = {}

    # ------------------------------------------------------------
    # 🔍 Codespaces path auto-detection
    # ------------------------------------------------------------
    def _detect_data_path(self) -> Path:
        """Return the first candidate directory holding retrieval-system files.

        A directory qualifies when it exists and at least one direct child
        path contains the substring ``retrieval_system``. Falls back to the
        current directory when none qualifies.
        """
        possible_dirs = [
            Path("notebooks/data/processed"),
            Path("data/processed"),
            Path("processed"),
            Path("data")
        ]
        for d in possible_dirs:
            if d.exists() and any("retrieval_system" in str(p) for p in d.glob("*")):
                print(f"📂 Found retrieval system under: {d.resolve()}")
                return d
        print("⚠️ Retrieval system directory not found — defaulting to current path.")
        return Path(".")

    # ------------------------------------------------------------
    # 📦 Load retrieval system (embeddings + metadata)
    # ------------------------------------------------------------
    def load_retrieval_system(self, prefix: str = 'retrieval_system'):
        """Load embeddings, chunk metadata and (if applicable) the encoder.

        Args:
            prefix: file-name prefix; ``<prefix>_embeddings.npy`` and
                ``<prefix>_metadata.pkl`` are read from the detected data
                directory.

        Returns:
            True on success, False if anything failed (the error is printed).
        """
        data_dir = self._detect_data_path()
        prefix_path = data_dir / prefix
        print(f"📂 Loading retrieval system for evaluation from {prefix_path} ...")

        try:
            self.embeddings = np.load(f"{prefix_path}_embeddings.npy")
            # SECURITY NOTE: pickle.load executes arbitrary code embedded in
            # the file. Only load metadata files produced by this project.
            with open(f"{prefix_path}_metadata.pkl", 'rb') as f:
                metadata = pickle.load(f)

            self.chunks = metadata['chunks']
            model_name = metadata['model_name']

            # 'tfidf_fallback' marks a system built without a neural encoder,
            # in which case self.model stays None.
            if model_name != 'tfidf_fallback':
                print(f"🤖 Loading embedding model: {model_name}")
                self.model = SentenceTransformer(model_name)

            print(f"✅ Loaded {len(self.chunks)} chunks | {self.embeddings.shape[1]}D embeddings")
            return True

        except Exception as e:
            # Deliberate catch-all at the loading boundary: callers only
            # need a success flag, and the cause is reported to the user.
            print(f"❌ Error loading system: {e}")
            return False

    # ------------------------------------------------------------
    # 🧩 Create evaluation dataset
    # ------------------------------------------------------------
    def create_evaluation_dataset(self) -> Dict:
        """Build the fixed query set and store it on the instance.

        Returns:
            Mapping of query string to its ground truth: expected article
            titles, expected chunk types, query type and difficulty.
        """
        print(f"📋 Creating evaluation dataset...")

        evaluation_data = {
            "What is physics?": {"expected_articles": ["Physics"],
                "expected_chunk_types": ["title_beginning", "definitions"],
                "query_type": "definition", "difficulty": "easy"},
            "Define artificial intelligence": {"expected_articles": ["Artificial intelligence"],
                "expected_chunk_types": ["definitions", "title_beginning"],
                "query_type": "definition", "difficulty": "easy"},
            "What is quantum mechanics?": {"expected_articles": ["Quantum mechanics"],
                "expected_chunk_types": ["title_beginning", "definitions"],
                "query_type": "definition", "difficulty": "medium"},
            "Define machine learning": {"expected_articles": ["Machine learning"],
                "expected_chunk_types": ["definitions", "title_beginning"],
                "query_type": "definition", "difficulty": "medium"},
            "What is calculus?": {"expected_articles": ["Calculus"],
                "expected_chunk_types": ["title_beginning", "definitions"],
                "query_type": "definition", "difficulty": "easy"},
            "How does photosynthesis work?": {"expected_articles": ["Biology", "Photosynthesis"],
                "expected_chunk_types": ["content"], "query_type": "process", "difficulty": "hard"},
            "How does machine learning work?": {"expected_articles": ["Machine learning"],
                "expected_chunk_types": ["content", "definitions"],
                "query_type": "process", "difficulty": "medium"},
            "Explain DNA structure": {"expected_articles": ["Biology", "Genetics", "DNA"],
                "expected_chunk_types": ["content"],
                "query_type": "explanation", "difficulty": "hard"},
            "What is the theory of relativity?": {"expected_articles": ["Physics", "Theory of relativity", "Einstein"],
                "expected_chunk_types": ["content", "definitions"],
                "query_type": "concept", "difficulty": "hard"},
            "What is climate change?": {"expected_articles": ["Climate change"],
                "expected_chunk_types": ["title_beginning", "definitions"],
                "query_type": "definition", "difficulty": "medium"},
            "What is organic chemistry?": {"expected_articles": ["Chemistry", "Organic chemistry"],
                "expected_chunk_types": ["definitions", "title_beginning"],
                "query_type": "definition", "difficulty": "medium"},
            "Define neural networks": {"expected_articles": ["Machine learning", "Artificial intelligence", "Neural networks"],
                "expected_chunk_types": ["content", "definitions"],
                "query_type": "definition", "difficulty": "hard"},
            "What is bioinformatics?": {"expected_articles": ["Biology", "Computer science", "Bioinformatics"],
                "expected_chunk_types": ["content", "definitions"],
                "query_type": "definition", "difficulty": "hard"},
            "How does evolution work?": {"expected_articles": ["Biology", "Evolution"],
                "expected_chunk_types": ["content", "title_beginning"],
                "query_type": "process", "difficulty": "medium"},
            "What is linear algebra?": {"expected_articles": ["Mathematics", "Linear algebra"],
                "expected_chunk_types": ["definitions", "title_beginning"],
                "query_type": "definition", "difficulty": "medium"}
        }

        self.evaluation_queries = list(evaluation_data.keys())
        self.ground_truth = evaluation_data

        print(f"✅ Created evaluation dataset with {len(self.evaluation_queries)} queries")
        query_types = Counter(d['query_type'] for d in evaluation_data.values())
        difficulties = Counter(d['difficulty'] for d in evaluation_data.values())
        print(f"   🎯 Query types: {dict(query_types)}")
        print(f"   📈 Difficulties: {dict(difficulties)}")
        return evaluation_data

    # ------------------------------------------------------------
    # 🧰 Shared helpers for the retrieval methods
    # ------------------------------------------------------------
    def _make_result(self, idx: int, score: float) -> Dict:
        """Build the result record for chunk ``idx`` with the given score."""
        chunk = self.chunks[idx]
        return {
            'chunk_idx': int(idx), 'score': float(score),
            'title': chunk['metadata']['title'],
            'chunk_type': chunk['chunk_type'],
            'priority': chunk['priority'], 'text': chunk['text']
        }

    def _build_index(self):
        """Return an inner-product FAISS index over L2-normalised embeddings."""
        # np.array copies, so self.embeddings is never normalised in place;
        # the cast happens first because normalize_L2 needs float32 input.
        embs = np.array(self.embeddings, dtype='float32', order='C')
        faiss.normalize_L2(embs)
        index = faiss.IndexFlatIP(embs.shape[1])
        index.add(embs)
        return index

    def _encode_query(self, query: str):
        """Encode ``query`` as a normalised float32 row vector."""
        q_emb = np.array(self.model.encode([query]), dtype='float32', order='C')
        # normalize_L2 works in place, so it must receive the array that is
        # searched with -- not a temporary produced by .astype().
        faiss.normalize_L2(q_emb)
        return q_emb

    def _search(self, index, query: str, n: int) -> List[Tuple[int, float]]:
        """Return up to ``n`` (chunk index, score) pairs for ``query``."""
        scores, idxs = index.search(self._encode_query(query), n)
        # FAISS pads with -1 when fewer than n neighbours exist; a plain
        # upper-bound check would let -1 through and pick the last chunk.
        return [(int(i), float(s)) for i, s in zip(idxs[0], scores[0])
                if 0 <= i < len(self.chunks)]

    # ------------------------------------------------------------
    # 🔹 Evaluate BM25 (TF-IDF cosine)
    # ------------------------------------------------------------
    def evaluate_bm25_retrieval(self, k: int = 5) -> Dict:
        """Rank chunks per query by TF-IDF cosine similarity.

        Despite the name, this is a TF-IDF baseline rather than true BM25.

        Args:
            k: number of results kept per query.

        Returns:
            Mapping of query to its list of result records, best first.
        """
        print(f"🔍 Evaluating BM25 retrieval...")
        chunk_texts = [chunk['text'] for chunk in self.chunks]
        vectorizer = TfidfVectorizer(
            max_features=5000, stop_words='english',
            ngram_range=(1, 2), min_df=2, max_df=0.8)
        tfidf_matrix = vectorizer.fit_transform(chunk_texts)
        results = {}
        for query in self.evaluation_queries:
            qv = vectorizer.transform([query])
            sims = cosine_similarity(qv, tfidf_matrix).flatten()
            top_idx = np.argsort(sims)[::-1][:k]
            results[query] = [self._make_result(idx, sims[idx]) for idx in top_idx]
        print(f"✅ BM25 evaluation complete")
        return results

    # ------------------------------------------------------------
    # 🔹 Evaluate dense retrieval (Sentence Transformers)
    # ------------------------------------------------------------
    def evaluate_dense_retrieval(self, k: int = 5) -> Dict:
        """Rank chunks per query by embedding cosine similarity.

        Args:
            k: number of results requested per query.

        Returns:
            Mapping of query to its result records, or an empty dict when no
            embedding model is loaded.
        """
        print(f"🔍 Evaluating dense retrieval...")
        if self.model is None:
            print("❌ No embedding model available")
            return {}
        index = self._build_index()
        results = {}
        for query in self.evaluation_queries:
            results[query] = [self._make_result(i, s)
                              for i, s in self._search(index, query, k)]
        print(f"✅ Dense retrieval evaluation complete")
        return results

    # ------------------------------------------------------------
    # 🔹 Evaluate hybrid retrieval
    # ------------------------------------------------------------
    def evaluate_hybrid_retrieval(self, k: int = 5) -> Dict:
        """Dense retrieval re-ranked by priority and chunk-type boosts.

        The top ``3 * k`` dense candidates are rescored as
        ``score * priority_weight * type_weight`` and the best ``k`` kept.
        Queries containing "what is" or "define" use a stronger boost for
        definition-style chunks.

        Args:
            k: number of results kept per query.

        Returns:
            Mapping of query to its result records (each also carrying
            ``original_score``), or an empty dict when no embedding model is
            loaded.
        """
        print(f"🔍 Evaluating hybrid retrieval...")
        if self.model is None:
            # Same behaviour as the dense method: without an encoder there
            # is nothing to rank, so report it instead of crashing.
            print("❌ No embedding model available")
            return {}
        pri_w = {'HIGH': 1.5, 'MEDIUM': 1.0}
        type_w = {'title_beginning': 1.3, 'definitions': 1.4, 'content': 1.0}
        definition_w = {'definitions': 1.5, 'title_beginning': 1.3}
        index = self._build_index()
        init_k = min(k * 3, len(self.chunks))
        results = {}
        for query in self.evaluation_queries:
            # The weight table depends only on the query, so pick it once.
            ql = query.lower()
            weights = definition_w if any(p in ql for p in ('what is', 'define')) else type_w
            enhanced = []
            for i, s in self._search(index, query, init_k):
                c = self.chunks[i]
                pw = pri_w.get(c['priority'], 1.0)
                tw = weights.get(c['chunk_type'], 1.0)
                hit = self._make_result(i, s * pw * tw)
                hit['original_score'] = s
                enhanced.append(hit)
            enhanced.sort(key=lambda x: x['score'], reverse=True)
            results[query] = enhanced[:k]
        print(f"✅ Hybrid retrieval evaluation complete")
        return results

    # ------------------------------------------------------------
    # 📈 Compute metrics (Precision@5, Recall@5, MRR)
    # ------------------------------------------------------------
    def calculate_retrieval_metrics(self, results: Dict, method_name: str) -> Dict:
        """Score one method's results against the ground truth.

        Per query:
            precision_at_5: share of the first five results whose title is an
                expected article.
            recall_at_5: 1.0 if any expected article was retrieved, else 0.0
                (an any-hit measure, not a fraction of expected articles).
            mrr: reciprocal rank of the first result from an expected article.
            high_priority_coverage: share of results with priority 'HIGH'.
            beginning_content_coverage: 1.0 if any expected chunk type was
                retrieved, else 0.0.

        Args:
            results: mapping of query to result records; queries missing from
                the ground truth are skipped.
            method_name: label used in log output and the returned dict.

        Returns:
            Dict with 'method', 'avg_metrics', 'detailed_results' and
            'raw_metrics'. Averages are 0.0 when no query was scored.
        """
        print(f"📊 Calculating metrics for {method_name}...")
        m = {'precision_at_5': [], 'recall_at_5': [], 'mrr': [],
             'high_priority_coverage': [], 'beginning_content_coverage': []}
        detailed = []
        for q, res in results.items():
            gt = self.ground_truth.get(q)
            if gt is None:
                continue
            exp_articles = set(gt['expected_articles'])
            exp_types = set(gt['expected_chunk_types'])
            ret_articles = {r['title'] for r in res}
            ret_types = {r['chunk_type'] for r in res}
            found = exp_articles & ret_articles

            # Count relevant *results*, not distinct titles: several chunks
            # from the same expected article are all correct retrievals.
            top = res[:5]
            prec = sum(r['title'] in exp_articles for r in top) / len(top) if top else 0
            rec = 1.0 if found else 0.0
            mrr = next((1.0 / rank for rank, r in enumerate(res, start=1)
                        if r['title'] in exp_articles), 0)
            high_cov = sum(r['priority'] == 'HIGH' for r in res) / len(res) if res else 0
            begin_cov = 1.0 if exp_types & ret_types else 0.0

            scores = {'precision_at_5': prec, 'recall_at_5': rec, 'mrr': mrr,
                      'high_priority_coverage': high_cov,
                      'beginning_content_coverage': begin_cov}
            for key, val in scores.items():
                m[key].append(val)
            # sorted() keeps the saved report stable across runs.
            detailed.append({'query': q, 'query_type': gt['query_type'],
                             'difficulty': gt['difficulty'], **scores,
                             'expected_articles': sorted(exp_articles),
                             'retrieved_articles': sorted(ret_articles),
                             'relevant_found': sorted(found)})
        # np.mean([]) is NaN (with a warning); report 0.0 for a method that
        # produced nothing so the table and JSON output stay well-formed.
        avg = {k: float(np.mean(v)) if v else 0.0 for k, v in m.items()}
        print(f"✅ {method_name} metrics calculated")
        return {'method': method_name, 'avg_metrics': avg,
                'detailed_results': detailed, 'raw_metrics': m}

    # ------------------------------------------------------------
    # 🚀 Run comprehensive evaluation
    # ------------------------------------------------------------
    def run_comprehensive_evaluation(self) -> Dict:
        """Run all three retrieval methods, score them and print the table.

        Returns:
            Dict with 'results', 'metrics' and 'evaluation_dataset'.
        """
        print(f"🚀 COMPREHENSIVE RETRIEVAL EVALUATION")
        print("=" * 70)
        self.create_evaluation_dataset()
        print(f"\n📊 EVALUATING RETRIEVAL METHODS")
        print("-" * 50)
        bm25 = self.evaluate_bm25_retrieval()
        dense = self.evaluate_dense_retrieval()
        hybrid = self.evaluate_hybrid_retrieval()
        bm25_m = self.calculate_retrieval_metrics(bm25, "BM25")
        dense_m = self.calculate_retrieval_metrics(dense, "Dense")
        hybrid_m = self.calculate_retrieval_metrics(hybrid, "Hybrid")
        self.retrieval_results = {'bm25': bm25, 'dense': dense, 'hybrid': hybrid}
        self.evaluation_metrics = {'bm25': bm25_m, 'dense': dense_m, 'hybrid': hybrid_m}
        self.print_comparison_table()
        return {'results': self.retrieval_results,
                'metrics': self.evaluation_metrics,
                'evaluation_dataset': self.ground_truth}

    # ------------------------------------------------------------
    # 🧾 Display comparison table
    # ------------------------------------------------------------
    def print_comparison_table(self):
        """Print average metrics per method and name the one with best MRR."""
        if not self.evaluation_metrics:
            print("⚠️ No evaluation metrics available — run the evaluation first.")
            return
        print(f"\n📊 RETRIEVAL EVALUATION RESULTS")
        print("=" * 70)
        methods = ['BM25', 'Dense', 'Hybrid']
        print(f"{'Method':<12} {'Precision@5':<12} {'Recall@5':<10} {'MRR':<8} "
              f"{'HighPriority':<13} {'BeginContent':<13}")
        print("-" * 75)
        for key, name in zip(['bm25', 'dense', 'hybrid'], methods):
            a = self.evaluation_metrics[key]['avg_metrics']
            print(f"{name:<12} {a['precision_at_5']:.2f}        {a['recall_at_5']:.2f}      "
                  f"{a['mrr']:.2f}     {a['high_priority_coverage']:.2f}          "
                  f"{a['beginning_content_coverage']:.2f}")
        # "Best" is decided by mean reciprocal rank alone.
        best = max(self.evaluation_metrics.keys(),
                   key=lambda x: self.evaluation_metrics[x]['avg_metrics']['mrr'])
        print(f"\n🏆 Best overall method: {best.upper()}")

    # ------------------------------------------------------------
    # 💾 Save results to JSON
    # ------------------------------------------------------------
    def save_evaluation_results(self, filename: str = 'evaluation_results.json'):
        """Write the dataset, average metrics and per-query details as JSON.

        Args:
            filename: output path.

        Returns:
            The ``filename`` that was written.
        """
        res = {
            'evaluation_dataset': self.ground_truth,
            'metrics_summary': {m: d['avg_metrics'] for m, d in self.evaluation_metrics.items()},
            'detailed_results': {m: d['detailed_results'] for m, d in self.evaluation_metrics.items()}
        }
        # Explicit encoding so the file does not depend on the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(res, f, indent=2)
        print(f"💾 Evaluation results saved to {filename}")
        return filename


# ================================================================
# Main entrypoint
# ================================================================
def run_comprehensive_evaluation():
    """Load the retrieval system, evaluate it and save the results.

    Returns:
        ``(evaluator, results)`` on success, or ``None`` when the retrieval
        system could not be loaded.
    """
    print("🚀 STEP 4.5: COMPREHENSIVE EVALUATION FRAMEWORK (Codespaces Ready)")
    print("=" * 70)

    evaluator = ComprehensiveEvaluator()
    if evaluator.load_retrieval_system():
        results = evaluator.run_comprehensive_evaluation()
        evaluator.save_evaluation_results()
        # Closing summary, one line per message.
        for line in ("\n✅ STEP 4.5 COMPLETE!",
                     "📋 Evaluated BM25, Dense & Hybrid retrieval methods",
                     "📊 Results stored in evaluation_results.json"):
            print(line)
        return evaluator, results

    print("❌ Failed to load retrieval system")
    return None


if __name__ == "__main__":
    # run_comprehensive_evaluation() returns None when the retrieval system
    # cannot be loaded; unpacking that directly raises TypeError, so fall
    # back to a pair of Nones instead.
    outcome = run_comprehensive_evaluation()
    evaluator, results = outcome if outcome is not None else (None, None)


🚀 STEP 4.5: COMPREHENSIVE EVALUATION FRAMEWORK (Codespaces Ready)
📂 Found retrieval system under: /workspaces/Rag-Knowledge-Assiatant/notebooks/data/processed
📂 Loading retrieval system for evaluation from data/processed/retrieval_system ...
🤖 Loading embedding model: all-MiniLM-L6-v2
✅ Loaded 3022 chunks | 384D embeddings
🚀 COMPREHENSIVE RETRIEVAL EVALUATION
📋 Creating evaluation dataset...
✅ Created evaluation dataset with 15 queries
   🎯 Query types: {'definition': 10, 'process': 3, 'explanation': 1, 'concept': 1}
   📈 Difficulties: {'easy': 3, 'medium': 7, 'hard': 5}

📊 EVALUATING RETRIEVAL METHODS
--------------------------------------------------
🔍 Evaluating BM25 retrieval...
✅ BM25 evaluation complete
🔍 Evaluating dense retrieval...
✅ Dense retrieval evaluation complete
🔍 Evaluating hybrid retrieval...
✅ Hybrid retrieval evaluation complete
📊 Calculating metrics for BM25...
✅ BM25 metrics calculated
📊 Calculating metrics for Dense...
✅ Dense metrics calculated
📊 Calculating met