In [None]:
def _passage_text(passage):
    """Return the text of a passage record, or the value itself if it is not a dict."""
    # MIRACL passage records are dicts (docid/title/text per the dataset card),
    # which are unhashable; reduce them to their text so they can live in sets.
    if isinstance(passage, dict):
        return passage["text"]
    return passage


def _mean_ndcg(relevance):
    """Return the mean NDCG over rows of a relevance matrix whose columns are in ranked order."""
    relevance = np.asarray(relevance, dtype=float)
    if relevance.size == 0:
        return 0.0
    # Column j holds the relevance of the result at rank j + 1.
    discounts = 1.0 / np.log2(np.arange(2, relevance.shape[1] + 2))
    dcg = (relevance * discounts).sum(axis=1)
    # Ideal ordering of the same retrieved list (most relevant first).
    ideal = (np.sort(relevance, axis=1)[:, ::-1] * discounts).sum(axis=1)
    # Queries with no relevant result in the list score 0 instead of 0/0.
    ndcg = np.divide(dcg, ideal, out=np.zeros_like(dcg), where=ideal > 0)
    return float(ndcg.mean())


def evaluate_miracl(dataset, lang=None):
    """Evaluate dense retrieval on a MIRACL-style split.

    The corpus is built from the first positive and first negative passage of
    every row; the first 200 queries are embedded with the module-level
    ``get_embedding`` and searched against a FAISS L2 index of that corpus.

    Args:
        dataset: Mapping-like split exposing the columns ``"query"``,
            ``"positive_passages"`` and ``"negative_passages"``.
        lang: Optional label used in the printed messages. When omitted, a
            module-level ``lang`` variable is used if one exists, otherwise
            ``"unknown"``.

    Returns:
        ``(recall_100, ndcg_10)``. ``recall_100`` is the fraction of queries
        with at least one relevant passage among the top 100 results (a hit
        rate). ``ndcg_10`` is the mean NDCG over the top 10 results, normalised
        against the best ordering of those same results. ``(0, 0)`` is
        returned when there are no queries or no passages.
    """
    if lang is None:
        # The original cell relied on a global ``lang``; keep honouring it.
        lang = globals().get("lang", "unknown")
    print(f"\nEvaluating language: {lang}...")

    all_queries = dataset["query"]
    positives = dataset["positive_passages"]
    negatives = dataset["negative_passages"]

    queries = all_queries[:200]  # Take first 200 queries

    # Corpus = first positive + first negative passage of every row.
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    candidates = [
        _passage_text(group[0])
        for column in (positives, negatives)
        for group in column
        if group
    ]
    corpus = [text for text in dict.fromkeys(candidates) if text is not None]

    if not queries or not corpus:
        print(f"No valid data found for {lang}")
        return 0, 0

    # Query text -> set of texts of all its positive passages.
    query_to_positive = {
        query: {_passage_text(p) for p in passages}
        for query, passages in zip(all_queries, positives)
        if passages
    }

    # FAISS expects contiguous float32 matrices.
    query_embeddings = np.ascontiguousarray(get_embedding(queries), dtype=np.float32)
    corpus_embeddings = np.ascontiguousarray(get_embedding(corpus), dtype=np.float32)

    index = faiss.IndexFlatL2(corpus_embeddings.shape[1])
    index.add(corpus_embeddings)

    k_recall, k_ndcg = 100, 10
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the last corpus entry.
    top_k = min(max(k_recall, k_ndcg), len(corpus))
    _, neighbor_ids = index.search(query_embeddings, top_k)

    hits = 0
    true_relevance = np.zeros((len(queries), min(k_ndcg, top_k)))
    for row, query in enumerate(queries):
        relevant_docs = query_to_positive.get(query, set())
        ranked = [corpus[idx] for idx in neighbor_ids[row]]

        if not relevant_docs.isdisjoint(ranked[:k_recall]):
            hits += 1

        for col, text in enumerate(ranked[:k_ndcg]):
            if text in relevant_docs:
                true_relevance[row, col] = 1

    recall_100 = hits / len(queries)
    # Rows are already in retrieval order, so NDCG is computed from positions
    # rather than by treating corpus indices as scores.
    ndcg_10 = _mean_ndcg(true_relevance)

    print(f"{lang} - Recall@100: {recall_100:.4f}, NDCG@10: {ndcg_10:.4f}")
    return recall_100, ndcg_10

In [None]:
from datasets import load_dataset
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import ndcg_score

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# KaLM multilingual embedding model, loaded through sentence-transformers
# and then moved onto the selected device.
model_name = "HIT-TMG/KaLM-embedding-multilingual-mini-v1"
model = SentenceTransformer(model_name)
model = model.to(device)

# Define function to get sentence embeddings
def get_embedding(texts):
    """Encode ``texts`` with the module-level ``model`` and return the result.

    The encoder is asked for NumPy output and its progress bar is disabled.
    """
    encode_options = {"convert_to_numpy": True, "show_progress_bar": False}
    vectors = model.encode(texts, **encode_options)
    return vectors

# Load the MIRACL Yoruba (yo) corpus
miracl_corpus = load_dataset("miracl/miracl-corpus", "yo")  # Load Yoruba split

# Extract passage texts from the corpus (the FAISS index is built on these).
_corpus_split = miracl_corpus["train"]
# The miracl-corpus dataset card lists the columns as docid/title/text, so
# prefer "text"; fall back to the previously used "passage" name if present.
_text_column = "text" if "text" in _corpus_split.column_names else "passage"
# Materialise as a plain list so indexing and truthiness checks behave like
# ordinary Python sequences in the evaluation code.
corpus = list(_corpus_split[_text_column])  # The full set of passages in Yoruba

# Load queries and positive passages for Yoruba from MIRACL Query Dataset
miracl_queries = load_dataset("miracl/miracl", "yo")  # Load Yoruba query dataset

def _passage_text(passage):
    """Return the text of a passage record, or the value itself if it is not a dict."""
    # MIRACL passage records are dicts (docid/title/text per the dataset card),
    # which are unhashable; reduce them to their text so they can live in sets.
    if isinstance(passage, dict):
        return passage["text"]
    return passage


def _mean_ndcg(relevance):
    """Return the mean NDCG over rows of a relevance matrix whose columns are in ranked order."""
    relevance = np.asarray(relevance, dtype=float)
    if relevance.size == 0:
        return 0.0
    # Column j holds the relevance of the result at rank j + 1.
    discounts = 1.0 / np.log2(np.arange(2, relevance.shape[1] + 2))
    dcg = (relevance * discounts).sum(axis=1)
    # Ideal ordering of the same retrieved list (most relevant first).
    ideal = (np.sort(relevance, axis=1)[:, ::-1] * discounts).sum(axis=1)
    # Queries with no relevant result in the list score 0 instead of 0/0.
    ndcg = np.divide(dcg, ideal, out=np.zeros_like(dcg), where=ideal > 0)
    return float(ndcg.mean())


def evaluate_miracl(dataset):
    """Evaluate dense retrieval of Yoruba queries against the module-level ``corpus``.

    Up to 1000 queries from ``dataset`` and every passage in ``corpus`` are
    embedded with the module-level ``get_embedding``; passages are indexed in
    a FAISS L2 index and the nearest neighbours of each query are scored.

    Args:
        dataset: Mapping-like split exposing the columns ``"query"`` and
            ``"positive_passages"``.

    Returns:
        ``(recall_100, ndcg_10)``. ``recall_100`` is the fraction of queries
        with at least one relevant passage among the top 100 results (a hit
        rate). ``ndcg_10`` is the mean NDCG over the top 10 results, normalised
        against the best ordering of those same results. ``(0, 0)`` is
        returned when there are no queries or no passages.
    """
    print("\nEvaluating Yoruba (yo) Language...")

    # Read each column once; every access materialises the whole column.
    all_queries = dataset["query"]
    positives = dataset["positive_passages"]

    num_queries = min(1000, len(all_queries))  # Use up to 1000 queries
    queries = all_queries[:num_queries]

    # Query text -> set of texts of its positive passages. Relevance is matched
    # on passage text because ``corpus`` only holds texts.
    query_to_positive = {
        query: {_passage_text(p) for p in passages}
        for query, passages in zip(all_queries, positives)
        if passages
    }

    if not queries or not corpus:
        print("No valid data found for Yoruba (yo)")
        return 0, 0

    # FAISS expects contiguous float32 matrices.
    query_embeddings = np.ascontiguousarray(get_embedding(queries), dtype=np.float32)
    corpus_embeddings = np.ascontiguousarray(get_embedding(corpus), dtype=np.float32)

    index = faiss.IndexFlatL2(corpus_embeddings.shape[1])
    index.add(corpus_embeddings)

    k_recall, k_ndcg = 100, 10
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the last corpus entry.
    top_k = min(max(k_recall, k_ndcg), len(corpus))
    _, neighbor_ids = index.search(query_embeddings, top_k)

    hits = 0
    true_relevance = np.zeros((len(queries), min(k_ndcg, top_k)))
    for row, query in enumerate(queries):
        relevant_docs = query_to_positive.get(query, set())
        ranked = [corpus[idx] for idx in neighbor_ids[row]]

        # Count the query as a hit if any relevant passage is in the top 100.
        if not relevant_docs.isdisjoint(ranked[:k_recall]):
            hits += 1

        for col, text in enumerate(ranked[:k_ndcg]):
            if text in relevant_docs:
                true_relevance[row, col] = 1

    recall_100 = hits / len(queries)
    # Rows are already in retrieval order, so NDCG is computed from positions
    # rather than by treating corpus indices as scores.
    ndcg_10 = _mean_ndcg(true_relevance)

    print(f"Yoruba (yo) - Recall@100: {recall_100:.4f}, NDCG@10: {ndcg_10:.4f}")
    return recall_100, ndcg_10


# Run the evaluation on the "dev" split of the Yoruba query dataset. The
# function prints both metrics; its (recall_100, ndcg_10) return value is not
# captured here.
evaluate_miracl(miracl_queries["dev"])

from datasets import load_dataset
import torch
import faiss
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import ndcg_score

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# mContriever checkpoint from Hugging Face: tokenizer first, then the
# encoder, which is moved onto the selected device.
model_name = "facebook/mcontriever-msmarco"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

# Define function to get sentence embeddings using mContriever
def get_embedding(texts, batch_size=16):
    """Compute sentence embeddings with the module-level mContriever model.

    Texts are tokenized and encoded in batches of ``batch_size``. Each
    embedding is the attention-masked mean of the last hidden states, which is
    the pooling shown on the Contriever model card (not the CLS token).

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A 2-D NumPy array with one row per input text. Empty input yields an
        array with zero rows instead of an error.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)  # The tokenizer expects a real list of strings.
    if not texts:
        # np.vstack([]) would raise; return a well-shaped empty matrix instead.
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start : start + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            token_embeddings = model(**inputs).last_hidden_state
            # Zero out padding positions so they do not dilute the average.
            mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1)  # Guard against /0.
            batch_embeddings = (summed / token_counts).cpu().numpy()
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)  # Stack all batch embeddings together


# Load the MIRACL Yoruba (yo) corpus
miracl_corpus = load_dataset("miracl/miracl-corpus", "yo")  # Load Yoruba split

# Extract passage texts from the corpus (the FAISS index is built on these).
# The corpus is read from its "train" split, matching the KaLM cell in this
# file; the previous "dev" lookup was inconsistent with it.
_corpus_split = miracl_corpus["train"]
# The miracl-corpus dataset card lists the columns as docid/title/text, so
# prefer "text"; fall back to the previously used "passage" name if present.
_text_column = "text" if "text" in _corpus_split.column_names else "passage"
# Materialise as a plain list so indexing and truthiness checks behave like
# ordinary Python sequences in the evaluation code.
corpus = list(_corpus_split[_text_column])  # The full set of passages in Yoruba

# Load queries and positive passages for Yoruba from MIRACL Query Dataset
miracl_queries = load_dataset("miracl/miracl", "yo")  # Load Yoruba query dataset

def _passage_text(passage):
    """Return the text of a passage record, or the value itself if it is not a dict."""
    # MIRACL passage records are dicts (docid/title/text per the dataset card),
    # which are unhashable; reduce them to their text so they can live in sets.
    if isinstance(passage, dict):
        return passage["text"]
    return passage


def _mean_ndcg(relevance):
    """Return the mean NDCG over rows of a relevance matrix whose columns are in ranked order."""
    relevance = np.asarray(relevance, dtype=float)
    if relevance.size == 0:
        return 0.0
    # Column j holds the relevance of the result at rank j + 1.
    discounts = 1.0 / np.log2(np.arange(2, relevance.shape[1] + 2))
    dcg = (relevance * discounts).sum(axis=1)
    # Ideal ordering of the same retrieved list (most relevant first).
    ideal = (np.sort(relevance, axis=1)[:, ::-1] * discounts).sum(axis=1)
    # Queries with no relevant result in the list score 0 instead of 0/0.
    ndcg = np.divide(dcg, ideal, out=np.zeros_like(dcg), where=ideal > 0)
    return float(ndcg.mean())


def evaluate_miracl(dataset):
    """Evaluate mContriever retrieval of Yoruba queries against the module-level ``corpus``.

    Up to 1000 queries from ``dataset`` and every passage in ``corpus`` are
    embedded with the module-level ``get_embedding``; passages are indexed in
    a FAISS L2 index and the nearest neighbours of each query are scored.

    Args:
        dataset: Mapping-like split exposing the columns ``"query"`` and
            ``"positive_passages"``.

    Returns:
        ``(recall_100, ndcg_10)``. ``recall_100`` is the fraction of queries
        with at least one relevant passage among the top 100 results (a hit
        rate). ``ndcg_10`` is the mean NDCG over the top 10 results, normalised
        against the best ordering of those same results. ``(0, 0)`` is
        returned when there are no queries or no passages.
    """
    print("\nEvaluating Yoruba (yo) Language using mContriever...")

    # Read each column once; every access materialises the whole column.
    all_queries = dataset["query"]
    positives = dataset["positive_passages"]

    num_queries = min(1000, len(all_queries))  # Use up to 1000 queries
    queries = all_queries[:num_queries]

    # Query text -> set of texts of its positive passages. Relevance is matched
    # on passage text because ``corpus`` only holds texts.
    query_to_positive = {
        query: {_passage_text(p) for p in passages}
        for query, passages in zip(all_queries, positives)
        if passages
    }

    if not queries or not corpus:
        print("No valid data found for Yoruba (yo)")
        return 0, 0

    # FAISS expects contiguous float32 matrices.
    query_embeddings = np.ascontiguousarray(get_embedding(queries), dtype=np.float32)
    corpus_embeddings = np.ascontiguousarray(get_embedding(corpus), dtype=np.float32)

    # NOTE(review): the Contriever model card scores with a dot product, while
    # this index ranks by Euclidean distance; consider faiss.IndexFlatIP.
    index = faiss.IndexFlatL2(corpus_embeddings.shape[1])
    index.add(corpus_embeddings)

    k_recall, k_ndcg = 100, 10
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the last corpus entry.
    top_k = min(max(k_recall, k_ndcg), len(corpus))
    _, neighbor_ids = index.search(query_embeddings, top_k)

    hits = 0
    true_relevance = np.zeros((len(queries), min(k_ndcg, top_k)))
    for row, query in enumerate(queries):
        relevant_docs = query_to_positive.get(query, set())
        ranked = [corpus[idx] for idx in neighbor_ids[row]]

        # Count the query as a hit if any relevant passage is in the top 100.
        if not relevant_docs.isdisjoint(ranked[:k_recall]):
            hits += 1

        for col, text in enumerate(ranked[:k_ndcg]):
            if text in relevant_docs:
                true_relevance[row, col] = 1

    recall_100 = hits / len(queries)
    # Rows are already in retrieval order, so NDCG is computed from positions
    # rather than by treating corpus indices as scores.
    ndcg_10 = _mean_ndcg(true_relevance)

    print(f"Yoruba (yo) - Recall@100: {recall_100:.4f}, NDCG@10: {ndcg_10:.4f}")
    return recall_100, ndcg_10


# Run the mContriever evaluation on the "dev" split of the Yoruba query
# dataset. The function prints both metrics; its (recall_100, ndcg_10) return
# value is not captured here.
evaluate_miracl(miracl_queries["dev"])