In [None]:
# Import libraries
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import ndcg_score

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Instantiate the KaLM multilingual embedding model, then move it to `device`
model_name = "HIT-TMG/KaLM-embedding-multilingual-mini-v1"
model = SentenceTransformer(model_name)
model = model.to(device)
# Encode text with the module-level embedding model
def get_embedding(texts):
    """Return embeddings for `texts` produced by the module-level `model`.

    The input is forwarded unchanged to `model.encode`, which is asked for
    NumPy output and to keep its progress bar hidden.
    """
    embeddings = model.encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    return embeddings

# Helpers for evaluate_miracl
def _passage_text(passage):
    """Return the string to embed for one positive-passage record."""
    # Per the MIRACL dataset card, passages are dicts with "docid", "title"
    # and "text"; embed the text rather than handing the whole dict to the
    # encoder. Anything else (e.g. a plain string) is passed through as-is.
    if isinstance(passage, dict) and "text" in passage:
        return passage["text"]
    return passage


def _gold_ranks(neighbours):
    """Return, per query row i, the 0-based position of index i, or -1."""
    gold = np.arange(neighbours.shape[0])[:, None]
    hits = neighbours == gold
    return np.where(hits.any(axis=1), hits.argmax(axis=1), -1)


# Compute Recall@100 and NDCG@10 for a single language
def evaluate_miracl(dataset, lang=None):
    """Evaluate query -> positive-passage retrieval on one dataset split.

    The first 200 rows are used. Each query is paired with its first positive
    passage (rows without one are dropped), all pairs are embedded with
    `get_embedding`, and the passages are indexed in an exact L2 FAISS index.
    Query i is considered answered correctly when passage i is retrieved.

    Note that the candidate pool is only these paired passages (at most 200),
    not a full corpus, so the scores are not comparable to official numbers.

    Args:
        dataset: Object supporting `dataset["query"]` and
            `dataset["positive_passages"]` column access with slicing.
        lang: Label used in log messages. When omitted, the module-level
            `lang` variable is used (falling back to "unknown"), which keeps
            existing `evaluate_miracl(dataset)` calls working.

    Returns:
        Tuple `(recall_at_100, ndcg_at_10)`; `(0, 0)` when no row has a
        positive passage.
    """
    if lang is None:
        # Backward compatibility: earlier callers relied on a global loop
        # variable named `lang` instead of passing it in.
        lang = globals().get("lang", "unknown")

    print(f"\nEvaluating language: {lang}...")

    # Use the same row limit for both columns so pairs stay aligned.
    max_rows = 200
    queries = dataset["query"][:max_rows]
    passage_lists = dataset["positive_passages"][:max_rows]

    # Keep only rows that have at least one positive passage.
    pairs = [
        (query, _passage_text(passages[0]))
        for query, passages in zip(queries, passage_lists)
        if passages
    ]
    if not pairs:
        print(f"No valid data found for {lang}")
        return 0, 0

    queries, positive_passages = zip(*pairs)

    # Get embeddings
    query_embeddings = get_embedding(list(queries))
    passage_embeddings = get_embedding(list(positive_passages))

    # Build FAISS index (exact search, L2 distance)
    index = faiss.IndexFlatL2(query_embeddings.shape[1])
    index.add(passage_embeddings)

    # Retrieve top-k results; FAISS pads rows with -1 when the index holds
    # fewer than k vectors, which can never match a (non-negative) gold id.
    k_recall = 100
    k_ndcg = 10
    _, neighbours = index.search(query_embeddings, max(k_recall, k_ndcg))

    # Rank of each query's own passage in its result list (-1 if absent).
    # NOTE: if two queries share the same first positive passage, the
    # duplicate may be ranked ahead of the gold index.
    ranks = _gold_ranks(neighbours)

    # Recall@100: fraction of queries whose passage is in the top 100.
    recall_100 = float(np.mean((ranks >= 0) & (ranks < k_recall)))

    # NDCG@10 with exactly one relevant passage per query: the ideal DCG is
    # 1, so the score is 1 / log2(rank + 2) when the passage is in the top
    # 10 and 0 otherwise.
    in_top = (ranks >= 0) & (ranks < k_ndcg)
    gains = np.where(in_top, 1.0 / np.log2(np.maximum(ranks, 0) + 2.0), 0.0)
    ndcg_10 = float(np.mean(gains))

    print(f"{lang} - Recall@100: {recall_100:.4f}, NDCG@10: {ndcg_10:.4f}")
    return recall_100, ndcg_10

In [None]:
# Languages to evaluate: the 16 MIRACL language codes listed below
languages = ["ar", "bn", "en", "es", "fa", "fi", "fr", "hi", "id", "ja", "ko", "ru", "sw", "te", "th", "zh"]
# Dictionary to store results, keyed by language code; each value holds the
# "Recall@100" and "nDCG@10" scores returned by evaluate_miracl
results = {}
# Iterate over each language and evaluate.
# NOTE: the loop variable must stay named `lang` -- evaluate_miracl() reads the
# module-level `lang` for its log messages.
for lang in languages:
    print(f"\nProcessing language: {lang}")
    dataset = load_dataset("miracl/miracl", lang, split="dev")  # Load the dev split for this language
    recall, ndcg = evaluate_miracl(dataset)  # Evaluate
    results[lang] = {"Recall@100": recall, "nDCG@10": ndcg}