In [None]:
!pip install -U weaviate-client
!pip install -U llama-index
%pip install llama-index-embeddings-huggingface
!pip install llama_index.vector_stores.weaviate
%pip install llama-index-retrievers-bm25
!pip install rouge
!pip install -U transformers accelerate

from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core import SimpleDirectoryReader

In [None]:
from llama_index.core import Document
import torch
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.embeddings import BaseEmbedding
from sklearn.preprocessing import normalize
# 2. Embedding model: AfriBERTa embedding (HuggingFace style)
# ============================
class AfriBERTaEmbedding(BaseEmbedding):
    """LlamaIndex embedding backed by a HuggingFace encoder.

    Texts are tokenized (truncated to 512 tokens), passed through the model,
    mean-pooled over non-padding tokens and L2-normalized.
    """

    def __init__(self, model_name="castorini/afriberta_base", **kwargs):
        super().__init__(**kwargs)
        # Imported lazily so the class can be defined before transformers is loaded.
        from transformers import AutoTokenizer, AutoModel

        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._model = AutoModel.from_pretrained(model_name).to(self._device)
        # Inference only: switch off dropout and other training-time behaviour.
        self._model.eval()

    def _mean_pooling(self, token_embeddings, attention_mask):
        """Average token embeddings, weighting each token by its attention mask."""
        weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        weighted_sum = torch.sum(token_embeddings * weights, 1)
        # Clamp so a fully masked row cannot divide by zero.
        token_counts = torch.clamp(weights.sum(1), min=1e-9)
        return weighted_sum / token_counts

    def _embed(self, texts):
        """Embed one string or a list of strings; returns a list of float lists."""
        batch = [texts] if isinstance(texts, str) else texts
        encoded = self._tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(self._device)

        with torch.no_grad():
            model_output = self._model(**encoded)
            pooled = self._mean_pooling(
                model_output.last_hidden_state, encoded["attention_mask"]
            )
            unit_vectors = normalize(pooled.cpu().numpy(), norm="l2")
        return unit_vectors.tolist()

    # -----------------------------
    # Abstract methods required by BaseEmbedding
    # -----------------------------
    def _get_text_embedding(self, text: str):
        """Embedding for a single document text."""
        vectors = self._embed(text)
        return vectors[0]

    def _get_query_embedding(self, query: str):
        """Embedding for a single query string (same encoder as documents)."""
        vectors = self._embed(query)
        return vectors[0]

    async def _aget_text_embedding(self, text: str):
        """Async variant; delegates to the synchronous implementation."""
        return self._get_text_embedding(text)

    async def _aget_query_embedding(self, query: str):
        """Async variant; delegates to the synchronous implementation."""
        return self._get_query_embedding(query)

from llama_index.core.settings import Settings
# Register the embedder globally so indexes and retrievers pick it up by default.
Settings.embed_model = AfriBERTaEmbedding()

In [None]:
import getpass
import os

import weaviate
from weaviate.classes.init import Auth

# Credentials are taken from the environment instead of being written into the
# notebook. The cluster URL keeps its previous value as a default; the API key
# is requested interactively when WEAVIATE_API_KEY is not already set, so the
# secret never ends up in the saved notebook.
os.environ.setdefault(
    "WEAVIATE_URL",
    "https://eu3nsymbtcib2x8fy120yw.c0.europe-west3.gcp.weaviate.cloud",
)
if not os.environ.get("WEAVIATE_API_KEY"):
    os.environ["WEAVIATE_API_KEY"] = getpass.getpass("Weaviate API key: ")

weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

# Prints True when the cluster reports it is ready to serve requests.
print(client.is_ready())


In [None]:
# 3. Wrap the existing Weaviate collection as a LlamaIndex vector store
import pandas as pd
from tqdm import tqdm
from llama_index.core.node_parser import SentenceSplitter
vector_store = WeaviateVectorStore(
    weaviate_client=client,
    index_name="YorubaChunk",   # use your actual collection name

)
from llama_index.core import VectorStoreIndex, StorageContext
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# 6. Create the index on top of the Weaviate vector store (not Qdrant).
# Queries are embedded with the globally configured Settings.embed_model.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
    embed_model=Settings.embed_model
)


In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore
from collections import defaultdict

# Dense retriever (AfriBERTa embeddings over the Weaviate-backed index)
dense_retriever = index.as_retriever(similarity_top_k=5)

# Sparse retriever (BM25) built over every chunk stored in the collection.
# Each Document reuses the Weaviate object UUID as its id: without this every
# Document gets a fresh random id, and a chunk returned by both retrievers
# could never be recognised as the same node during hybrid fusion.
# NOTE(review): assumes dense results expose the Weaviate object UUID as
# node_id (true when the collection was written through WeaviateVectorStore) - confirm.
yoruba_collection = client.collections.get("YorubaChunk")
all_docs = [
    Document(text=obj.properties["text"], id_=str(obj.uuid))
    for obj in yoruba_collection.iterator()
]
sparse_retriever = BM25Retriever(all_docs, similarity_top_k=5)

# Hybrid Retriever
class HybridRetriever(BaseRetriever):
    """Fuse the results of a dense and a sparse retriever.

    Args:
        dense_retriever: retriever returning embedding-similarity results.
        sparse_retriever: retriever returning lexical (BM25) results.
        mode: "rrf" for reciprocal rank fusion, or "rsf" for relative score
            fusion (weighted sum of min-max normalised scores).
        alpha: weight of the dense score in "rsf" mode; the sparse score gets
            ``1 - alpha``.
        k: rank-smoothing constant used in "rrf" mode.

    Raises:
        ValueError: if ``mode`` is neither "rrf" nor "rsf".
    """

    def __init__(self, dense_retriever, sparse_retriever, mode="rrf", alpha=0.5, k=60):
        if mode not in ("rrf", "rsf"):
            raise ValueError(f"Unknown fusion mode {mode!r}; expected 'rrf' or 'rsf'.")
        self.dense_retriever = dense_retriever
        self.sparse_retriever = sparse_retriever
        self.mode = mode
        self.alpha = alpha
        self.k = k
        # BaseRetriever.__init__ sets up the callback manager and related state
        # that the public retrieve() entry point relies on.
        super().__init__()

    @staticmethod
    def _normalized_scores(results):
        """Map node_id -> score min-max scaled to [0, 1] within one result list."""
        raw = {r.node.node_id: (r.score or 0.0) for r in results}
        if not raw:
            return {}
        lowest, highest = min(raw.values()), max(raw.values())
        span = highest - lowest
        if span == 0:
            # All scores equal: every returned node is an equally good match.
            return {node_id: 1.0 for node_id in raw}
        return {node_id: (value - lowest) / span for node_id, value in raw.items()}

    def _retrieve(self, query, **kwargs):
        """Run both retrievers and return their nodes ordered by fused score."""
        dense_results = self.dense_retriever.retrieve(query, **kwargs)
        sparse_results = self.sparse_retriever.retrieve(query, **kwargs)

        # First occurrence wins (dense first), built once instead of rescanning
        # both result lists for every fused id.
        node_by_id = {}
        for result in dense_results + sparse_results:
            node_by_id.setdefault(result.node.node_id, result.node)

        fused_scores = dict.fromkeys(node_by_id, 0.0)

        if self.mode == "rrf":
            # Reciprocal rank fusion: each list contributes 1 / (k + rank).
            for results in (dense_results, sparse_results):
                ranks = {r.node.node_id: rank for rank, r in enumerate(results, start=1)}
                for node_id, rank in ranks.items():
                    fused_scores[node_id] += 1.0 / (self.k + rank)
        elif self.mode == "rsf":
            # Relative score fusion: BM25 scores are unbounded while similarity
            # scores are not, so each list is normalised before the weighted sum.
            dense_norm = self._normalized_scores(dense_results)
            sparse_norm = self._normalized_scores(sparse_results)
            for node_id in fused_scores:
                fused_scores[node_id] = (
                    self.alpha * dense_norm.get(node_id, 0.0)
                    + (1 - self.alpha) * sparse_norm.get(node_id, 0.0)
                )
        else:
            raise ValueError(f"Unknown fusion mode {self.mode!r}; expected 'rrf' or 'rsf'.")

        ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
        return [
            NodeWithScore(node=node_by_id[node_id], score=score)
            for node_id, score in ranked
        ]

hybrid_retriever = HybridRetriever(dense_retriever, sparse_retriever, mode="rrf")


In [None]:
# ====================================
# Suppress Warnings and Errors (Optional)
# ====================================
# Everything in this cell only reduces console noise; it does not change results.
import warnings
# Silences every Python warning raised after this point.
warnings.filterwarnings('ignore')

import logging
# Root logger: only ERROR and above are emitted.
logging.basicConfig(level=logging.ERROR)

# For transformers, suppress specific warnings if needed
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

In [None]:
from llama_index.core import PromptTemplate

# Prompt 1 - retrieve relevant context. Takes a single {topic} placeholder.
# NOTE(review): not referenced by any other cell visible here - confirm it is still needed.
prompt_retrieve = PromptTemplate(
    "·π¢e √†w√°r√≠ √†l√†y√© t√≥ y·∫π j√πl·ªç n√≠pa ak·ªçÃÅl√© y√¨√≠: {topic}\n"
)

# Prompt 2 - synthesize Yoruba answer. Takes a single {context} placeholder;
# the template itself has no slot for the user's question.
prompt_answer = PromptTemplate(
    """F√∫n un n√≠ √¨d√°h√πn n√≠ √®d√® Yor√πb√° t√≥ d√° l√≥r√≠ √†l√†y√© y√¨√≠.
J·ªçÃÄw·ªçÃÅ lo √®d√® t√≥ m·ªçÃÅ, ·π£√†l√†y√© d√°ad√°a, k√≠ o s√¨ t·ªçÃÅka s√≠ or√≠sun n√°√†.

√Äl√†y√©:
{context}

√åd√°h√πn:
"""
)



In [None]:


from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from llama_index.core import Settings
import torch
import cohere
from llama_index.llms.gemini import Gemini


# =============================
# 1. HuggingFace Model Loader
# =============================
def load_huggingface_llm(model_name: str):
    """
    Load a HuggingFace model with 4-bit BitsAndBytes quantization and wrap it
    in a LlamaIndex ``HuggingFaceLLM``.

    The architecture (seq2seq vs. causal) is read from the model's own config
    (``is_encoder_decoder``); the model-name heuristic is only used when the
    config cannot be loaded. If quantized loading fails, the model is loaded
    again without quantization (float16 when CUDA is available).

    Args:
        model_name: HuggingFace Hub id or local path of the model.

    Returns:
        A ``HuggingFaceLLM`` wrapping the loaded model and tokenizer.
    """
    from transformers import (
        AutoConfig,
        AutoTokenizer,
        AutoModelForCausalLM,
        AutoModelForSeq2SeqLM,
        BitsAndBytesConfig,
    )
    import torch

    print(f"üîÑ Loading {model_name}...")

    # Quantization configuration (NF4 with double quantization, bfloat16 compute)
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",              # recommended quantization type
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Detect model architecture from the config rather than from substrings of
    # the name, which misclassify models whose name merely contains e.g. "aya".
    try:
        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        is_seq2seq = bool(getattr(config, "is_encoder_decoder", False))
    except Exception:
        # Config unavailable: fall back to the original name-based guess.
        is_seq2seq = any(
            x in model_name.lower() for x in ["t5", "mbart", "aya", "afrolm", "bart"]
        )
    model_loader = AutoModelForSeq2SeqLM if is_seq2seq else AutoModelForCausalLM

    try:
        # Attempt 4-bit quantized loading
        model = model_loader.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        )
        print(f"‚úÖ Successfully loaded {model_name} in 4-bit quantized mode.")
    except Exception as e:
        # Deliberate best-effort: report the failure and retry unquantized.
        print(f"[WARN] 4-bit quantization failed for {model_name}: {e}")
        print("‚û°Ô∏è Falling back to full precision mode.")
        model = model_loader.from_pretrained(
            model_name,
            device_map="auto",
            # Despite the message above, CUDA loads use float16 to halve memory.
            torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
            trust_remote_code=True
        )

    # Wrap model in HuggingFaceLLM
    llm = HuggingFaceLLM(
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512
    )

    return llm

# =============================
# 2. Gemini Model Loader
# =============================
def load_gemini_llm(api_key: str, model: str = "models/gemini-2.5-flash"):
    """Create a Gemini LLM wrapper for ``model``, reporting progress on stdout.

    Args:
        api_key: API key passed to the Gemini wrapper.
        model: Gemini model identifier.

    Returns:
        The constructed ``Gemini`` instance.
    """
    print(f"üîÑ Loading Gemini: {model}...")
    gemini_llm = Gemini(api_key=api_key, model=model)
    print("‚úÖ Gemini loaded successfully!")
    return gemini_llm


# =============================
# 3. Cohere Setup
# =============================
def setup_cohere_client(api_key: str):
    """Create a Cohere V2 client, reporting progress on stdout.

    Args:
        api_key: API key passed to ``cohere.ClientV2``.

    Returns:
        The constructed client.
    """
    print("üîÑ Setting up Cohere client...")
    cohere_v2_client = cohere.ClientV2(api_key=api_key)
    print("‚úÖ Cohere client ready!")
    return cohere_v2_client


# =============================
# 3.5 Document Cleaning Functions
# =============================
def clean_retrieved_text(text: str) -> str:
    """
    Clean and normalize retrieved text before passing to generation.

    Collapses whitespace runs to a single space, removes control characters,
    collapses repeated sentence punctuation ("!!!" -> "!") and trims the ends.

    Args:
        text: Raw text of a retrieved chunk.

    Returns:
        The cleaned text, or "" when ``text`` is empty or not a string.
    """
    # Imported here because this cell does not import `re`; the only
    # module-level `import re` lives in a later cell, so the function would
    # otherwise depend on cell execution order.
    import re

    if not text or not isinstance(text, str):
        return ""

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove control characters that might confuse the model
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)

    # Remove URLs (optional - comment out if URLs are important)
    # text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Remove email addresses (optional)
    # text = re.sub(r'\S+@\S+', '', text)

    # Remove excessive punctuation
    text = re.sub(r'([.!?])\1+', r'\1', text)

    # Trim whitespace
    return text.strip()


def filter_and_clean_nodes(nodes, min_length: int = 50, max_length: int = 2000, score_threshold: float = 0.0):
    """
    Clean retrieved nodes and keep only those passing the quality checks.

    Args:
        nodes: Retrieved nodes from the retriever
        min_length: Nodes whose cleaned text is shorter than this are dropped
        max_length: Cleaned text longer than this is cut and suffixed with "..."
        score_threshold: Nodes with a score below this are dropped; nodes
            without a score are kept

    Returns:
        List of ``{'text': ..., 'score': ...}`` dicts, in input order
    """
    kept = []

    for candidate in nodes:
        # Fall back to the string form for objects that expose no `.text`.
        raw_text = candidate.text if hasattr(candidate, 'text') else str(candidate)
        body = clean_retrieved_text(raw_text)

        # Too short after cleaning: drop.
        if len(body) < min_length:
            continue

        # Below the relevance threshold (only when a score exists): drop.
        relevance = getattr(candidate, 'score', None)
        if relevance is not None and relevance < score_threshold:
            continue

        # Too long: truncate and mark the cut.
        if len(body) > max_length:
            body = body[:max_length] + "..."

        kept.append({'text': body, 'score': relevance})

    return kept


def deduplicate_contexts(nodes):
    """
    Drop contexts whose opening text repeats an earlier one.

    Two entries count as duplicates when the first 100 characters of their
    'text', lower-cased and stripped, are equal. The first occurrence is kept
    and input order is preserved.

    Args:
        nodes: list of dicts with a 'text' key.

    Returns:
        List containing the first entry of each duplicate group.
    """
    fingerprints = set()
    kept = []

    for entry in nodes:
        fingerprint = entry['text'][:100].lower().strip()
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        kept.append(entry)

    return kept


def _retrieve_context_nodes(topic: str, top_k: int, clean_context: bool):
    """Retrieve up to ``top_k`` nodes for ``topic`` and optionally clean/deduplicate them."""
    retrieved_nodes = dense_retriever.retrieve(topic)[:top_k]
    print(f"üìö Retrieved {len(retrieved_nodes)} documents")

    if clean_context:
        print("üßπ Cleaning retrieved documents...")
        retrieved_nodes = filter_and_clean_nodes(
            retrieved_nodes,
            min_length=50,
            max_length=2000,
            score_threshold=0.0
        )
        retrieved_nodes = deduplicate_contexts(retrieved_nodes)
        print(f"‚ú® After cleaning: {len(retrieved_nodes)} documents")

    return retrieved_nodes


def _join_context(nodes) -> str:
    """Concatenate node texts, accepting both cleaned dicts and raw retriever nodes."""
    texts = []
    for node in nodes:
        if isinstance(node, dict):
            texts.append(node['text'])
        else:
            # Raw nodes (clean_context=False) are objects, not dicts.
            texts.append(node.text if hasattr(node, 'text') else str(node))
    return "\n\n".join(texts)


def yoruba_rag_query_llamaindex(topic: str, top_k: int = 3, clean_context: bool = True):
    """
    RAG query using the LlamaIndex LLM configured in ``Settings.llm``
    (HuggingFace or Gemini).

    Args:
        topic: The user's question; used for retrieval and included in the prompt.
        top_k: Maximum number of retrieved documents to use as context.
        clean_context: When True, retrieved documents are cleaned, filtered and
            deduplicated before being placed in the prompt.

    Returns:
        The generated answer text.
    """
    retrieved_nodes = _retrieve_context_nodes(topic, top_k, clean_context)

    # Concatenate contexts
    context = _join_context(retrieved_nodes)
    print(f"üìù Context length: {len(context)} characters")

    # The answer template only carries {context}; put the question in front of
    # it so the model knows what it is being asked (mirrors the Cohere prompt).
    # The label is Yoruba for "Question", written with escapes to stay ASCII-safe.
    question_line = f"\u00ccb\u00e9\u00e8r\u00e8: {topic}\n\n"
    full_prompt = question_line + prompt_answer.format(context=context)
    print("--- Prompt sent to LLM ---")
    print(full_prompt[:500] + "..." if len(full_prompt) > 500 else full_prompt)

    # Generation configuration for better quality responses
    # NOTE(review): whether complete() forwards these kwargs to generation
    # depends on the LLM wrapper in use - confirm they take effect.
    generation_config = {
        "temperature": 0.7,             # Controls creativity
        "top_p": 0.9,                   # Nucleus sampling
        "top_k": 40,                    # Limits token search space
        "repetition_penalty": 1.15,     # Penalize repeating n-grams
        "max_new_tokens": 256,          # Limit response length
        "no_repeat_ngram_size": 3,      # Avoid loops
        "do_sample": True               # Enables random sampling
    }

    # Gemini is detected by class name and called without the HF-style kwargs.
    llm_class_name = type(Settings.llm).__name__.lower()
    if "gemini" in llm_class_name:
        response = Settings.llm.complete(full_prompt)
    else:
        response = Settings.llm.complete(full_prompt, **generation_config)

    return response.text


def yoruba_rag_query_cohere(topic: str, cohere_client, top_k: int = 5, clean_context: bool = True):
    """
    RAG query using the Cohere chat API.

    Args:
        topic: The user's question; used for retrieval and included in the prompt.
        cohere_client: Client exposing ``chat(model=..., messages=...)``.
        top_k: Maximum number of retrieved documents to use as context.
        clean_context: When True, retrieved documents are cleaned, filtered and
            deduplicated before being placed in the prompt.

    Returns:
        The text of the first content item of the chat response.
    """
    retrieved_nodes = _retrieve_context_nodes(topic, top_k, clean_context)

    # Concatenate contexts
    context = _join_context(retrieved_nodes)

    # Call Cohere Chat API
    response = cohere_client.chat(
        model="command-a-03-2025",
        messages=[
            {
                "role": "user",
                "content": f"""
                √åb√©√®r√®: {topic}

                √åt√†n √†k·ªçÃÅk·ªçÃÅ (context):
                {context}

                √åd√°h√πn n√≠ √®d√® Yor√πb√°:
                """
            }
        ]
    )

    return response.message.content[0].text

In [None]:
from sentence_transformers import SentenceTransformer, util
# Load your embedding model
# NOTE(review): a later cell rebinds `similarity_model` to
# 'sentence-transformers/all-mpnet-base-v2', so this bge-m3 instance is only
# in effect until that cell runs - confirm which model is intended.
similarity_model=SentenceTransformer("BAAI/bge-m3")
from bert_score import score

In [None]:
from typing import List, Union
from sentence_transformers import SentenceTransformer, util
from bert_score import score
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Initialize models
# `similarity_model` is the sentence encoder used by the cosine-similarity
# metrics below; `rouge` is the shared ROUGE scorer.
# NOTE(review): confirm this encoder is adequate for Yoruba text.
similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
rouge = Rouge()

def evaluate_generation_metrics(
    question: str,
    contexts: Union[str, List[str]],
    answer: str,
    reference: str
):
    """
    Score a generated answer against a reference answer using BLEU, ROUGE-L,
    BERTScore F1 and embedding cosine similarity.

    Args:
        question: Unused; kept so callers can pass the full evaluation record.
        contexts: Unused; kept so callers can pass the full evaluation record.
        answer: Generated answer.
        reference: Reference (ground-truth) answer.

    Returns:
        Dict with 'BLEU', 'ROUGE-L', 'BERTScore_F1', 'Cosine_Similarity' and
        'Composite_Gen' (mean of the four), each rounded to 3 decimals.
    """
    # --- BLEU ---
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu(
        [reference.split()],
        answer.split(),
        smoothing_function=smoothie
    )

    # --- ROUGE-L ---
    # The rouge package raises ValueError when the hypothesis or reference has
    # no sentences (e.g. an empty answer); score that case as 0 instead of
    # losing the whole sample.
    try:
        rouge_scores = rouge.get_scores(answer, reference, avg=True)
        rouge_l = rouge_scores['rouge-l']['f']
    except ValueError:
        rouge_l = 0.0

    # --- BERTScore ---
    P, R, F1 = score(
        [answer],
        [reference],
        lang='yo',
        model_type='xlm-roberta-large'
    )
    bert_f1 = F1.mean().item()

    # --- Cosine Similarity (semantic similarity) ---
    ans_emb = similarity_model.encode(answer, convert_to_tensor=True)
    ref_emb = similarity_model.encode(reference, convert_to_tensor=True)
    cosine_sim = util.cos_sim(ans_emb, ref_emb).item()

    # --- Composite Generation Score ---
    composite_gen = round(
        (bleu_score + rouge_l + bert_f1 + cosine_sim) / 4, 3
    )

    return {
        'BLEU': round(bleu_score, 3),
        'ROUGE-L': round(rouge_l, 3),
        'BERTScore_F1': round(bert_f1, 3),
        'Cosine_Similarity': round(cosine_sim, 3),
        'Composite_Gen': composite_gen   # key name the evaluation loop reads
    }


def evaluate_yoruba_rag(question: str, contexts: Union[str, List[str]], answer: str, reference: str):
    """
    Combined RAG evaluation: context-level and generation-level.

    Args:
        question: The user question.
        contexts: One context string or a list of context strings.
        answer: Generated answer.
        reference: Reference (ground-truth) answer.

    Returns:
        Dict with 'Context_Relevance', 'Faithfulness', every generation metric
        from ``evaluate_generation_metrics`` and 'Composite_RAG_Score' (mean of
        context relevance, faithfulness and the generation composite).
    """
    # Context relevance
    CR = evaluate_context_relevance(question, contexts)
    # Faithfulness
    FG = evaluate_faithfulness(answer, contexts)
    # Generation-level metrics
    gen_scores = evaluate_generation_metrics(question, contexts, answer, reference)

    # Composite: the generation composite is published under 'Composite_Gen'.
    composite = (CR + FG + gen_scores['Composite_Gen']) / 3

    return {
        'Context_Relevance': round(CR, 3),
        'Faithfulness': round(FG, 3),
        **gen_scores,
        'Composite_RAG_Score': round(composite, 3)
    }


# --- Supporting functions reused from earlier ---
def evaluate_context_relevance(arg1: Union[str, List[str]], arg2: Union[str, List[str]]):
    """Mean cosine similarity between a question and its context passages.

    The two arguments may be given in either order: when exactly one of them
    is a string, that one is the question. When both or neither are strings,
    ``arg1`` is the question and ``arg2`` the contexts.

    Returns:
        The mean similarity as a float, or 0.0 when there are no contexts.
    """
    # Swap only in the one case where the question was passed second.
    if isinstance(arg2, str) and not isinstance(arg1, str):
        question, contexts = arg2, arg1
    else:
        question, contexts = arg1, arg2

    passages = [contexts] if isinstance(contexts, str) else contexts
    if len(passages) == 0:
        return 0.0

    question_vec = similarity_model.encode(question, convert_to_tensor=True)
    passage_vecs = similarity_model.encode(passages, convert_to_tensor=True)
    similarities = util.cos_sim(question_vec, passage_vecs)
    return float(similarities.mean().item())


def evaluate_faithfulness(answer: str, context: Union[str, List[str]]):
    """Cosine similarity between the answer and the space-joined context.

    Args:
        answer: Generated answer.
        context: One context string or a list of context strings.

    Returns:
        The similarity as a float.
    """
    passages = [context] if isinstance(context, str) else context
    joined_context = ' '.join(passages)
    context_vec = similarity_model.encode(joined_context, convert_to_tensor=True)
    answer_vec = similarity_model.encode(answer, convert_to_tensor=True)
    return float(util.cos_sim(context_vec, answer_vec).item())

def evaluate_answer_relevance(question: str, answer: str):
    """BERTScore F1 of the answer (candidate) against the question (reference).

    Returns:
        The mean F1 as a float.
    """
    # bert-score works on parallel lists of candidates and references.
    candidates, references = [answer], [question]
    _, _, f1 = score(candidates, references, lang='yo', model_type='xlm-roberta-large')
    return float(f1.mean().item())

In [None]:
from sklearn.cluster import KMeans
def cluster_sample(df, domain_col="domain", n_per_domain=5):
    """
    Cluster-based sampling: pick up to ``n_per_domain`` diverse questions per domain.

    For each domain, rows with a missing "question" are dropped. Domains with
    at most ``n_per_domain`` rows are kept whole; larger domains have their
    questions embedded and split into ``n_per_domain`` KMeans clusters, and one
    row is drawn from each cluster (fixed seed).

    Args:
        df: Frame with a "question" column and the domain column.
        domain_col: Name of the column to group by.
        n_per_domain: Number of clusters (and hence samples) per large domain.

    Returns:
        A new frame with the sampled rows. Rows from clustered domains carry an
        extra "cluster" column.
    """
    embedder = None  # loaded lazily: small domains never need the model
    sampled = []

    for _, group in df.groupby(domain_col):
        group = group.dropna(subset=["question"])
        if len(group) <= n_per_domain:
            sampled.append(group)
            continue

        if embedder is None:
            embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        embeddings = embedder.encode(group["question"].tolist())
        kmeans = KMeans(n_clusters=n_per_domain, random_state=42)
        # assign() returns a new frame, so no column is set on a groupby slice.
        group = group.assign(cluster=kmeans.fit_predict(embeddings))

        cluster_sampled = (
            group.groupby("cluster")
            .apply(lambda x: x.sample(1, random_state=42))
            .reset_index(drop=True)
        )
        sampled.append(cluster_sampled)

    if not sampled:
        # pd.concat raises on an empty list; return an empty frame with df's columns.
        return df.iloc[0:0].copy()
    return pd.concat(sampled, ignore_index=True)

In [None]:
# Suppose you already have your context documents (from ground_truth.csv)
df = pd.read_csv("/content/ground_truth.csv")

In [None]:
 sampled_df = cluster_sample(df, n_per_domain=2)

In [None]:
 # ====================================
# ‚ö° Optimized Yoruba RAG Quantitative Evaluation Across 7 Models
# ====================================
import gc
import re
import time
from tqdm import tqdm
from functools import lru_cache
import pandas as pd
import torch


# ====================================
# üßπ Utility Functions
# ====================================
def clean_text(text, max_len=512):
    """Normalize Yoruba text and truncate to avoid long context embedding.

    Whitespace runs are collapsed to single spaces and the result is cut to
    ``max_len`` characters.

    Args:
        text: The value to clean. Missing values (None or float NaN, as
            produced by pandas for empty CSV cells) yield ""; other non-string
            values are converted with ``str``.
        max_len: Maximum number of characters to keep.

    Returns:
        The cleaned, truncated string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = re.sub(r'\s+', ' ', str(text).strip())
    return text[:max_len]


def free_memory():
    """Run the garbage collector, then empty the CUDA cache when a GPU is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def get_free_gpu_memory():
    """Return free GPU memory in GB (2 decimals), or 0 when CUDA is unavailable.

    The value comes from ``torch.cuda.mem_get_info()`` called without a device
    argument.
    """
    if not torch.cuda.is_available():
        return 0
    free_bytes = torch.cuda.mem_get_info()[0]
    return round(free_bytes / 1024**3, 2)

# ====================================
# ‚öôÔ∏è Model Setup (Cached)
# ====================================
@lru_cache(maxsize=8)
def _load_model_cached(model_name):
    """Build the LLM or API client for ``model_name`` once and reuse it afterwards.

    API keys are read from the environment (GOOGLE_API_KEY / COHERE_API_KEY)
    and requested interactively when missing, so no secret lives in the notebook.
    Cached HuggingFace models stay referenced by the cache (and therefore in
    memory) until ``_load_model_cached.cache_clear()`` is called.
    """
    import getpass
    import os

    if model_name == "gemini":
        api_key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: ")
        return load_gemini_llm(api_key=api_key)

    if model_name == "cohere":
        api_key = os.environ.get("COHERE_API_KEY") or getpass.getpass("Cohere API key: ")
        return setup_cohere_client(api_key=api_key)

    return load_huggingface_llm(model_name)


def set_model(model_name):
    """Activate ``model_name`` for generation, loading it on first use.

    Loading is cached, but activation runs on every call: ``Settings.llm`` (or
    the global ``cohere_client``) is always re-pointed at the requested model.
    Caching the whole function would skip that step on a cache hit and leave
    the previously selected model active.

    Args:
        model_name: "gemini", "cohere", or a HuggingFace model id.

    Returns:
        The LlamaIndex LLM, or None for "cohere" (the client is stored in the
        global ``cohere_client`` instead).
    """
    global cohere_client
    from llama_index.core import Settings

    free_mem = get_free_gpu_memory()
    print(f"üß† Detected free GPU memory: {free_mem} GB")

    loaded = _load_model_cached(model_name)

    if model_name == "cohere":
        cohere_client = loaded
        return None

    Settings.llm = loaded
    return loaded


# Keep the cache-management hooks the lru_cache-decorated version exposed.
set_model.cache_clear = _load_model_cached.cache_clear
set_model.cache_info = _load_model_cached.cache_info


# ====================================
# üìä Evaluation Configuration
# ====================================
# Models to evaluate. Every entry ends with a comma so that enabling one of the
# commented-out models cannot silently concatenate two adjacent string literals.
model_names = [
    "gemini",  # Gemini API
    "cohere",  # Cohere API
    "mistralai/Mistral-7B-Instruct-v0.3",
    # "meta-llama/Llama-3.2-1B-Instruct",
    # "Qwen/Qwen2.5-7B-Instruct",
    # "bigscience/bloomz-7b1",
    # "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
]

import os

from huggingface_hub import login

# Take the HuggingFace access token from the HF_TOKEN environment variable
# instead of pasting it into the notebook. When the variable is unset, `token`
# is None and `login` falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))



In [None]:
# ====================================
# üöÄ Evaluation Pipeline
# ====================================
all_results = []
start_all = time.time()

for model_name in model_names:
    print(f"\nüß† Evaluating model: {model_name}")
    model_start = time.time()

    # Load (or reuse) the model and make it the active one
    llm = set_model(model_name)
    free_memory()

    for _, row in tqdm(sampled_df.iterrows(), total=len(sampled_df), desc=f"{model_name}"):
        try:
            # Row cleaning sits inside the try so that one malformed row is
            # skipped like any other per-item failure instead of aborting the run.
            domain = row["domain"]
            question = clean_text(row["question"])
            reference = clean_text(row["reference_answer"])
            context_doc = clean_text(row["context_document"])

            # Generate answer
            if model_name == "cohere":
                answer = yoruba_rag_query_cohere(question, cohere_client, top_k=3)
            else:
                answer = yoruba_rag_query_llamaindex(question, top_k=3)

            # Metric computation (contexts are the ground-truth context document)
            context_score = evaluate_context_relevance(question, [context_doc])
            faith_score = evaluate_faithfulness(answer, [context_doc])
            relevance_score = evaluate_answer_relevance(question, answer)
            gen_metrics = evaluate_generation_metrics(question=question,
                contexts=[context_doc],
                answer=answer,
                reference=reference)

            # Mean of the three context/answer scores and the generation composite
            composite_gen = gen_metrics.get("Composite_Gen", 0)
            composite_rag = round((context_score + faith_score + relevance_score + composite_gen) / 4, 3)

            # Per-item trace: context relevance, faithfulness, answer relevance, generation metrics
            print(context_score)
            print(faith_score)
            print(relevance_score)
            print(gen_metrics)

            all_results.append({
                "model": model_name,
                "domain": domain,
                "question": question,
                "reference": reference,
                "generated_answer": answer,
                "context_relevance": context_score,
                "faithfulness": faith_score,
                "answer_relevance": relevance_score,
                **gen_metrics,
                "Composite_RAG_Score": composite_rag
            })

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next item.
            print(f"[WARN] Skipped item: {e}")
            continue

    print(f"‚úÖ Completed {model_name} in {round(time.time() - model_start, 2)} sec")
    free_memory()

# ====================================
# Aggregate Results
# ====================================
df_all = pd.DataFrame(all_results)

if df_all.empty:
    print("\n‚ö†Ô∏è No evaluation results were recorded ‚Äî check generation or metric functions.")
else:
    print("\nüìä Sample of Evaluation Results:")
    print(df_all.head())

    # Model-wise summary (average metrics), best composite score first
    summary = (
        df_all.groupby("model")[
            ["context_relevance", "faithfulness", "answer_relevance", "Composite_RAG_Score"]
        ]
        .mean()
        .sort_values("Composite_RAG_Score", ascending=False)
    )

    print("\nüìà Yoruba RAG Evaluation Summary:")
    print(summary)

print(f"\n‚è±Ô∏è Total Evaluation Time: {round(time.time() - start_all, 2)} sec")
