In [1]:
# Mount Google Drive so the cells below can read the data and model files
# they reference under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Install the third-party packages used by this notebook (Colab runtime).
# NOTE(review): apart from docarray none of these is version-pinned, so a fresh
# runtime may resolve different versions — consider pinning.
!pip install underthesea
# NOTE(review): faiss is not imported by any cell visible in this notebook —
# presumably a leftover; confirm before removing.
!pip install faiss-cpu
# --force-reinstall also reinstalls nltk's dependencies (the recorded output
# shows tqdm being uninstalled first).
!pip install nltk --upgrade --force-reinstall
!pip install sentence-transformers
!pip install transformers einops
# NOTE(review): docarray is not imported by any visible cell either; confirm it
# is still needed.
!pip install -U docarray==0.21.0

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.2.0-py3-none-any.whl.metadata (2.5 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (792 kB)
Using cached click-8.2.0-py3-none-any.whl (102 kB)
Using cached joblib-1.5.0-py3-none-any.whl (307 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, click, nltk
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.67.1
    Uninstalling tqdm-4.67.1:
      Successfully uninstalled tqdm-

BM25

In [3]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
import json
from collections import Counter
from underthesea import word_tokenize

# Load the BM25 artefacts: a sparse term-frequency matrix and the vocabulary
# (term -> column index) that goes with it.
tf_matrix = load_npz("/content/drive/MyDrive/Data-New/BM25/bm25_tf.npz")
# JSON is UTF-8 by specification; state the encoding explicitly so that loading
# the vocabulary does not depend on the runtime's locale default.
with open("/content/drive/MyDrive/Data-New/BM25/bm25_vocab_index.json", "r", encoding="utf-8") as f:
    word2idx = json.load(f)

# Number of rows of the matrix; used as the document count for IDF.
num_docs = tf_matrix.shape[0]

def compute_idf(tf_matrix, num_docs):
    """Return one IDF weight per column of ``tf_matrix``.

    The document frequency of a column is the number of non-zero entries it
    holds; the weight is ``log((num_docs - df + 0.5) / (df + 0.5) + 1)``.

    Args:
        tf_matrix: sparse matrix exposing ``nonzero()`` and ``shape``.
        num_docs: document count used in the formula.

    Returns:
        1-D array with ``tf_matrix.shape[1]`` entries.
    """
    _, term_columns = tf_matrix.nonzero()
    vocab_size = tf_matrix.shape[1]
    # Each non-zero entry contributes one document to its column's count.
    doc_freq = np.bincount(term_columns, minlength=vocab_size)
    odds = (num_docs - doc_freq + 0.5) / (doc_freq + 0.5)
    return np.log(odds + 1.0)

def compute_doc_lengths(tf_matrix):
    """Return the sum of every row of ``tf_matrix`` as a flat 1-D array."""
    row_totals = tf_matrix.sum(axis=1)
    # The row sum comes back two-dimensional; flatten it into a fresh 1-D array.
    return np.array(row_totals).reshape(-1)

def bm25_score(query_tokens, tf_matrix, idf, doc_lengths, avg_doc_len, word2idx, k1=1.5, b=0.75):
    """Score every row of ``tf_matrix`` against a tokenised query with BM25.

    Each distinct query term found in ``word2idx`` contributes
    ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len / avg_len))``.
    Repeated query terms are counted once, and terms missing from the
    vocabulary are ignored.

    Args:
        query_tokens: iterable of query terms.
        tf_matrix: sparse term-frequency matrix supporting column slicing.
        idf: per-column IDF weights, indexable by column index.
        doc_lengths: per-row document lengths.
        avg_doc_len: mean document length.
        word2idx: mapping from term to column index.
        k1: term-frequency saturation parameter.
        b: length-normalisation strength.

    Returns:
        1-D float array with one score per row. All zeros when the matrix has
        no rows or ``avg_doc_len`` is not a positive number, where the
        normalisation would otherwise divide by zero and yield NaN.
    """
    scores = np.zeros(tf_matrix.shape[0])
    # `not (x > 0)` is also true for NaN, which np.mean returns for an empty corpus.
    if tf_matrix.shape[0] == 0 or not avg_doc_len > 0:
        return scores

    # The length normalisation does not depend on the term: compute it once.
    length_norm = k1 * (1 - b + b * np.asarray(doc_lengths, dtype=float) / avg_doc_len)

    # dict.fromkeys de-duplicates while keeping first-seen order, so terms are
    # accumulated in the same order as before.
    for term in dict.fromkeys(query_tokens):
        term_idx = word2idx.get(term)
        if term_idx is None:
            continue
        term_tf = tf_matrix[:, term_idx].toarray().ravel()
        numerator = term_tf * (k1 + 1)
        scores += idf[term_idx] * numerator / (term_tf + length_norm)
    return scores

# Query-independent BM25 statistics, computed once.
idf = compute_idf(tf_matrix, num_docs)
doc_lengths = compute_doc_lengths(tf_matrix)
avg_doc_len = np.mean(doc_lengths)

# Load the corpus and build the row-index <-> cid and cid -> text lookups.
# NOTE(review): this assumes row i of tf_matrix corresponds to the i-th unique
# cid of the CSV — confirm against the code that built the matrix.
df_corpus = pd.read_csv(
    "/content/drive/MyDrive/Data-New/corpus_tokenized.csv",
    usecols=["cid", "context_tokenized"],
)
unique_docs = df_corpus.drop_duplicates("cid")  # first occurrence of each cid
index_to_cid = dict(enumerate(unique_docs["cid"]))
cid_to_index = {cid: idx for idx, cid in index_to_cid.items()}
cid_to_text = unique_docs.set_index("cid")["context_tokenized"].to_dict()
del df_corpus, unique_docs

# ---------- BM25 Inference Function ----------

def get_relevant_bm25(question_text, top_k=20):
    """Return the ``top_k`` best BM25 matches for a question.

    The question is tokenised with ``word_tokenize(..., format="text")`` and
    split on whitespace, then scored against the pre-loaded BM25 statistics.

    Args:
        question_text: raw question string.
        top_k: number of highest-scoring rows to consider.

    Returns:
        List of dicts with keys ``rank`` (1-based position in the score
        ordering), ``cid``, ``score`` and ``context``. Rows whose index has no
        cid mapping are skipped, so ranks may have gaps.
    """
    tokens = word_tokenize(question_text, format="text").split()
    scores = bm25_score(tokens, tf_matrix, idf, doc_lengths, avg_doc_len, word2idx)
    best_rows = np.argsort(-scores)[:top_k]

    hits = []
    for position, row in enumerate(best_rows, start=1):
        cid = index_to_cid.get(row, -1)
        if cid == -1:
            # No cid is known for this matrix row.
            continue
        hits.append({
            "rank": position,
            "cid": cid,
            "score": float(scores[row]),
            "context": cid_to_text.get(cid, "Not found"),
        })
    return hits


FAISS

In [4]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from underthesea import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# File paths (Google Drive layout)
MODEL_PATH = '/content/drive/MyDrive/Data-New/FAISS/finetuned_model'  # Use the final model or any epoch model
CORPUS_PATH = '/content/drive/MyDrive/Data-New/corpus_tokenized.csv'
EMBEDDINGS_PATH = '/content/drive/MyDrive/Data-New/FAISS/corpus_embeddings.npy'
ID_MAP_PATH = '/content/drive/MyDrive/Data-New/FAISS/corpus_id_mapping.csv'

# Load the fine-tuned model under names owned by this branch. Other cells in
# this notebook assign the generic globals `model` and `device` to a different
# encoder; get_relevant_faiss resolves its globals at call time, so it must not
# depend on those names.
print("Loading model...")
faiss_device = 'cuda' if torch.cuda.is_available() else 'cpu'
faiss_model = SentenceTransformer(MODEL_PATH).to(faiss_device)
# Backward-compatible aliases for code that still expects the original names.
model = faiss_model
device = faiss_device

# Load corpus
print("Loading corpus...")
corpus_df = pd.read_csv(CORPUS_PATH)
corpus_df['context_tokenized'] = corpus_df['context_tokenized'].astype(str)
# cid -> text lookup built once. Keeping the first row per cid matches the
# previous `.values[0]` selection without scanning the frame for every hit.
faiss_cid_to_text = (
    corpus_df.drop_duplicates('cid')
    .set_index('cid')['context_tokenized']
    .to_dict()
)

# Load pre-computed corpus embeddings and IDs
print("Loading embeddings and ID mapping...")
corpus_embeddings = np.load(EMBEDDINGS_PATH)
id_df = pd.read_csv(ID_MAP_PATH)
corpus_ids = id_df['cid'].tolist()
# Row i of the embedding matrix is reported as corpus_ids[i]; a length mismatch
# would silently attach results to the wrong documents.
if len(corpus_ids) != corpus_embeddings.shape[0]:
    raise ValueError(
        f"ID mapping has {len(corpus_ids)} rows but the embedding matrix has "
        f"{corpus_embeddings.shape[0]}"
    )


def get_relevant_faiss(query, top_k=20):
    """Return the ``top_k`` corpus passages most similar to ``query``.

    The query is embedded with the fine-tuned SentenceTransformer and compared
    with the pre-computed corpus embeddings by cosine similarity.

    Args:
        query: raw question string.
        top_k: maximum number of results; values <= 0 give an empty list.

    Returns:
        List of dicts with keys ``cid``, ``similarity`` (Python float) and
        ``text``, most similar first. ``text`` is ``"Not found"`` when the cid
        is absent from the corpus CSV.
    """
    # NOTE(review): word_tokenize without format="text" returns a list, so the
    # join below produces space-separated text without underscores (see the
    # recorded "Tokenized query" output), whereas the BM25 branch uses
    # format="text" and the corpus text is underscore-joined. Confirm which
    # form the fine-tuned model expects before changing it.
    tokenized_query = word_tokenize(query)
    if isinstance(tokenized_query, list):
        tokenized_query = ' '.join(tokenized_query)
    print(f"Tokenized query: {tokenized_query}")

    # Encode the tokenized query
    query_embedding = faiss_model.encode(tokenized_query, convert_to_numpy=True, device=faiss_device)

    # Calculate similarity with all corpus documents
    similarities = cosine_similarity([query_embedding], corpus_embeddings)[0]

    # Clamp k: a slice of `[-0:]` would otherwise return the whole corpus.
    k = max(0, min(int(top_k), len(similarities)))
    if k == 0:
        return []
    top_indices = similarities.argsort()[-k:][::-1]

    results = []
    for idx in top_indices:
        cid = corpus_ids[idx]
        results.append({
            'cid': cid,
            'similarity': float(similarities[idx]),
            'text': faiss_cid_to_text.get(cid, "Not found"),
        })

    return results


Loading model...
Loading corpus...
Loading embeddings and ID mapping...


Dense E5

In [None]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import re
import math

def clean_text(text):
    """Apply a fixed sequence of regex clean-ups to ``text`` and strip it.

    The rules run in order: NBSP/soft-hyphen runs become one space, runs of
    dots collapse to one dot, ellipsis characters and ``@#%^&*`` are removed,
    runs of spaces collapse, ``--`` and ``____`` runs are removed, and finally
    every colon is removed together with any dots directly after it.
    """
    # NOTE(review): the last rule `:\.*` also matches a bare colon, so all
    # colons are dropped — confirm that is intended.
    substitutions = (
        (r'[\xa0\xad]+', ' '),
        (r'\.{2,}', '.'),
        (r'\…+', ''),
        (r'[@#%^&*]+', ''),
        (r' {2,}', ' '),
        (r'-{2,}', ''),
        (r'_{4,}', ''),
        (r':\.*', ''),
    )
    cleaned_text = text
    for pattern, replacement in substitutions:
        cleaned_text = re.sub(pattern, replacement, cleaned_text)
    return cleaned_text.strip()

# Set up the device and load the data.
# NOTE(review): `device` and `model` are generic global names that other cells
# in this notebook also assign — confirm nothing relies on them holding a
# different value after this cell runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
df = pd.read_csv('/content/drive/MyDrive/Weight/Dataset/cleaned_corpus.csv')
texts = df['text'].astype(str).tolist()
# Every passage is cleaned and given the "passage: " prefix before encoding.
texts = [f"passage: {clean_text(t)}" for t in texts]

# Load the fine-tuned model
MODEL_PATH = 'Khue-0408/e5_full_hard_neg'
model = SentenceTransformer(MODEL_PATH, device=device)

# Encode in explicit batches so tqdm can report overall progress
batch_size = 8
num_texts = len(texts)
num_batches = math.ceil(num_texts / batch_size)
embedding_dim = model.get_sentence_embedding_dimension()

# Result buffer. float32 instead of the float64 that `dtype=float` allocates:
# this halves memory and disk usage.
# NOTE(review): assumes the encoder emits float32 or narrower — confirm.
all_embeddings = np.zeros((num_texts, embedding_dim), dtype=np.float32)

print(f"Encoding {num_texts} passages in {num_batches} batches (batch_size={batch_size})...")
for batch_idx in tqdm(range(num_batches), desc="Encoding batches"):
    start = batch_idx * batch_size
    end   = min(start + batch_size, num_texts)
    batch_texts = texts[start:end]
    # Encode one batch (inner progress bar disabled)
    batch_emb = model.encode(
        batch_texts,
        show_progress_bar=False,
        convert_to_numpy=True
    )
    all_embeddings[start:end] = batch_emb

# Save the new embeddings.
# NOTE(review): the retrieval and evaluation cells load
# '/content/drive/MyDrive/Data-New/corpus_embeddings_e5_new.npy', not this
# path — confirm the file is copied there or align the paths.
out_path = '/content/drive/MyDrive/Weight/Dataset/corpus_embeddings_e5_new.npy'
np.save(out_path, all_embeddings)
print(f"Saved embeddings to {out_path}")



In [None]:
import numpy as np
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
import re
import pandas as pd

# Text-cleaning function (kept identical to the one used when encoding the corpus)
def clean_text(text: str) -> str:
    """
    Normalise a string with a fixed, ordered list of regex substitutions.

    Non-string input is converted with ``str`` first. The rules replace
    NBSP/soft-hyphen runs with a space, collapse dot runs, drop ellipsis
    characters and ``@#%^&*``, collapse space runs, drop ``--`` and ``____``
    runs, and remove every colon with any dots directly after it. The result
    is stripped of surrounding whitespace.
    """
    result = text if isinstance(text, str) else str(text)
    rules = [
        (r'[\xa0\xad]+', ' '),
        (r'\.{2,}', '.'),
        (r'\…+', ''),
        (r'[@#%^&*]+', ''),
        (r' {2,}', ' '),
        (r'-{2,}', ''),
        (r'_{4,}', ''),
        (r':\.*', ''),
    ]
    for pattern, replacement in rules:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# Paths and module-level resources
MODEL_PATH = 'Khue-0408/e5_full_hard_neg'
# NOTE(review): the encoding cell saves its output under
# '/content/drive/MyDrive/Weight/Dataset/' while this loads from 'Data-New' —
# confirm both refer to the same embeddings.
EMBEDDINGS_NPY = '/content/drive/MyDrive/Data-New/corpus_embeddings_e5_new.npy'
CORPUS_META_CSV = '/content/drive/MyDrive/Weight/Dataset/corpus.csv'
CLEANED_CORPUS_CSV = '/content/drive/MyDrive/Weight/Dataset/cleaned_corpus.csv'

# Load the model, embeddings and metadata once
print("Loading E5 resources...")
e5_model = SentenceTransformer(MODEL_PATH, device="cuda" if torch.cuda.is_available() else "cpu")

# Load corpus embeddings and keep them on the model's device
print("Loading corpus embeddings...")
e5_emb = np.load(EMBEDDINGS_NPY)
e5_emb_tensor = torch.from_numpy(e5_emb).to(e5_model.device)

# Load corpus metadata; cids are handled as strings throughout this branch.
# NOTE(review): row i of the embeddings is reported as e5_cids[i], which
# assumes corpus.csv and the encoded CSV share row order — confirm.
print("Loading corpus metadata...")
e5_corpus_meta = pd.read_csv(CORPUS_META_CSV)
e5_cids = e5_corpus_meta['cid'].astype(str).tolist()

# Passage text, keyed by cid
print("Loading corpus text...")
e5_corpus_text = pd.read_csv(CLEANED_CORPUS_CSV)

if 'cid' in e5_corpus_text.columns:
    # Keys are cast to str to match e5_cids. With the raw column values (for
    # example integers) every lookup by string cid would miss and fall back to
    # "Text not found". Later duplicates overwrite earlier ones, as before.
    e5_cid_to_text = dict(zip(e5_corpus_text['cid'].astype(str), e5_corpus_text['text']))
else:
    # No cid column: pair cids and texts by position (zip stops at the shorter).
    e5_cid_to_text = dict(zip(e5_cids, e5_corpus_text['text']))

def get_relevant_e5(question_text, top_k=20):
    """Return the ``top_k`` passages closest to the question under the E5 encoder.

    The question is lower-cased, cleaned with ``clean_text`` and prefixed with
    ``"query: "`` before encoding; passages are ranked by cosine similarity
    against the pre-loaded embedding tensor.

    Args:
        question_text: raw question string.
        top_k: maximum number of results; clamped to the corpus size, and
            values <= 0 give an empty list.

    Returns:
        List of dicts with keys ``rank`` (1-based), ``cid``, ``similarity``
        (Python float) and ``text`` (``"Text not found"`` when no text is
        known for the cid), best match first.
    """
    cleaned_query = clean_text(question_text.lower())
    query_text = f"query: {cleaned_query}"

    print(f"E5 processing query: {query_text}")

    k = max(0, min(int(top_k), e5_emb_tensor.shape[0]))
    if k == 0:
        return []

    with torch.no_grad():
        query_embedding = e5_model.encode(
            query_text,
            batch_size=1,
            show_progress_bar=False,
            convert_to_tensor=True,
        )
        # Match the corpus tensor's device and dtype explicitly instead of
        # relying on implicit promotion, and force shape [1, D].
        query_embedding = query_embedding.to(
            device=e5_emb_tensor.device, dtype=e5_emb_tensor.dtype
        ).reshape(1, -1)

        similarities = F.cosine_similarity(query_embedding, e5_emb_tensor, dim=1)
        # topk returns the k largest values already sorted in descending order,
        # avoiding a full sort of the corpus.
        top_scores, top_indices = torch.topk(similarities, k)

    # One host transfer each for indices and scores rather than one per hit.
    results = []
    for rank, (idx, score) in enumerate(zip(top_indices.tolist(), top_scores.tolist()), start=1):
        cid = e5_cids[idx]
        results.append({
            'rank': rank,
            'cid': cid,
            'similarity': float(score),
            'text': e5_cid_to_text.get(cid, "Text not found"),
        })

    return results

In [None]:
#Dense

import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import re

def clean_text(text: str) -> str:
    """
    Clean a string with an ordered chain of regex substitutions, then strip it.

    Anything that is not already a ``str`` is converted with ``str`` first.
    """
    if not isinstance(text, str):
        text = str(text)
    # (pattern, replacement) pairs, applied top to bottom.
    steps = (
        (r'[\xa0\xad]+', ' '),   # NBSP / soft-hyphen runs -> one space
        (r'\.{2,}', '.'),        # runs of dots -> one dot
        (r'\…+', ''),            # ellipsis characters
        (r'[@#%^&*]+', ''),      # stray symbols
        (r' {2,}', ' '),         # runs of spaces -> one space
        (r'-{2,}', ''),          # dash runs
        (r'_{4,}', ''),          # underscore runs of four or more
        (r':\.*', ''),           # every colon, plus dots right after it
    )
    for regex, substitute in steps:
        text = re.sub(regex, substitute, text)
    return text.strip()

# 1) Path configuration
CLEANED_CORPUS_CSV = '/content/drive/MyDrive/Weight/Dataset/cleaned_corpus.csv'      # has a 'text' column
CORPUS_META_CSV   = '/content/drive/MyDrive/Weight/Dataset/corpus.csv'              # has a 'cid' column
EMBEDDINGS_NPY     = '/content/drive/MyDrive/Data-New/corpus_embeddings_e5_new.npy'
TEST_SET_CSV       = '/content/drive/MyDrive/Data-New/test_set.csv'           # has 'question' and 'cid' columns

# 2) Load SentenceTransformer
# NOTE(review): `device` and `model` are generic global names that other cells
# in this notebook also assign — confirm nothing relies on them holding a
# different value after this cell runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = 'Khue-0408/e5_full_hard_neg'
model = SentenceTransformer(MODEL_PATH, device=device)

# 3) Load & prepare corpus embeddings
print("Loading corpus embeddings...")
emb = np.load(EMBEDDINGS_NPY)                     # shape [N_docs, D]
emb_tensor = torch.from_numpy(emb).to(device)     # move to GPU if avail

# 4) Load corpus metadata (for mapping indices -> cid)
corpus_meta = pd.read_csv(CORPUS_META_CSV)
cids = corpus_meta['cid'].astype(str).tolist()

# 5) Load test set (with ground truth)
# NOTE(review): each row is treated as having exactly one ground-truth cid,
# compared as a string — confirm the 'cid' column is not multi-valued.
test_df = pd.read_csv(TEST_SET_CSV)
questions = test_df['question'].astype(str).tolist()
questions = [f"query: {clean_text(q.lower())}" for q in questions]
truth_cids = test_df['cid'].astype(str).tolist()

# 6) Encode all test queries
print("Encoding test queries...")
ques_emb = model.encode(
    questions,
    batch_size=4,
    show_progress_bar=True,
    convert_to_numpy=True
)   # shape [N_test, D]
# Use the corpus tensor's dtype so the similarity never mixes precisions.
ques_tensor = torch.from_numpy(ques_emb).to(device=device, dtype=emb_tensor.dtype)

# 7) Retrieval + Metrics
k_values = [5, 10, 20]
reciprocal_ranks = []
correct_at_k = {k: 0 for k in k_values}
total_q = len(questions)
# Only the best max(k_values) documents are inspected, so ask for exactly that
# many instead of sorting and copying the whole corpus ranking per query.
top_n = min(max(k_values), emb_tensor.shape[0])

print("Retrieving and computing metrics...")
for i in tqdm(range(total_q), desc="Eval E5"):
    q_vec = ques_tensor[i].unsqueeze(0)                # [1, D]
    sims = F.cosine_similarity(q_vec, emb_tensor, dim=1)  # [N_docs]
    ranked_idx = torch.topk(sims, top_n).indices.cpu().numpy()  # best first

    retrieved_cids = [cids[idx] for idx in ranked_idx]
    truth = truth_cids[i]

    if truth in retrieved_cids:
        rank = retrieved_cids.index(truth) + 1
        reciprocal_ranks.append(1.0 / rank)
        for k in k_values:
            if rank <= k:
                correct_at_k[k] += 1
    else:
        # Not in the top max(k_values): counted as reciprocal rank 0, so the
        # reported MRR is effectively MRR@max(k_values).
        reciprocal_ranks.append(0.0)

# 8) Compute and print the results (guarded against an empty test set)
mrr = float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0
acc_at_k = {k: (correct_at_k[k] / total_q * 100 if total_q else 0.0) for k in k_values}

print("\n=== E5 Dense Retrieval Evaluation ===")
print(f"Queries evaluated: {total_q}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
for k in k_values:
    print(f"Accuracy@{k}: {acc_at_k[k]:.2f}%")


HYBRID MODEL WITH RERANKER


In [6]:
# Placeholder strings the retrieval branches return when a passage text is unknown.
_MISSING_TEXT_MARKERS = frozenset({"Not found", "Text not found"})


def _has_usable_text(text) -> bool:
    """True when ``text`` is a non-blank string that is not a placeholder."""
    return isinstance(text, str) and bool(text.strip()) and text not in _MISSING_TEXT_MARKERS


def combine_top_answers(query: str, top_k_each: int = 20):
    """Merge the top results of the BM25, FAISS and E5 branches by cid.

    Args:
        query: raw question string passed to every branch.
        top_k_each: number of results requested from each branch.

    Returns:
        List of dicts, one per distinct cid, with keys ``cid`` (str), ``text``,
        ``scores`` (per-branch score, or None when the branch did not return
        the cid) and ``best_score`` (largest available branch score). The list
        is sorted by ``best_score`` descending.
    """
    # 1. Call each branch
    branch_results = [
        ('bm25',  get_relevant_bm25(query, top_k_each)),
        ('faiss', get_relevant_faiss(query, top_k_each)),
        ('e5',    get_relevant_e5(query, top_k_each)),
    ]

    # 2. Merge by cid
    merged = {}
    for branch_name, results in branch_results:
        for item in results:
            cid = str(item['cid'])
            score = item.get('score', item.get('similarity'))
            text  = item.get('context') or item.get('text')

            if cid not in merged:
                merged[cid] = {
                    'cid': cid,
                    'text': text,
                    'scores': {'bm25': None, 'faiss': None, 'e5': None}
                }
            elif not _has_usable_text(merged[cid]['text']) and _has_usable_text(text):
                # An earlier branch only had a placeholder or missing text for
                # this cid; prefer the real passage so downstream consumers do
                # not score the placeholder string.
                merged[cid]['text'] = text
            merged[cid]['scores'][branch_name] = score

    # 3. Compute best_score and flatten
    # NOTE(review): BM25 scores and cosine similarities are compared here
    # without normalisation — confirm this ordering is only a pre-sort.
    combined_list = []
    for entry in merged.values():
        valid_scores = [s for s in entry['scores'].values() if s is not None]
        entry['best_score'] = max(valid_scores) if valid_scores else 0.0
        combined_list.append(entry)

    # 4. Sort by best_score descending
    combined_list.sort(key=lambda x: x['best_score'], reverse=True)

    return combined_list


In [7]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the sequence-classification reranker and its tokenizer from the
# Hugging Face Hub, and move the model to the GPU when one is available.
# NOTE(review): trust_remote_code=True runs Python files shipped in the model
# repository and no `revision` is pinned — the recorded output below shows new
# code files being downloaded with a warning that suggests pinning a revision.
# Consider pinning both calls to a reviewed commit.
device = "cuda" if torch.cuda.is_available() else "cpu"
reranker = AutoModelForSequenceClassification.from_pretrained(
    "jinaai/jina-reranker-v2-base-multilingual",
    torch_dtype="auto",
    trust_remote_code=True,
).to(device)
tokenizer = AutoTokenizer.from_pretrained(
    "jinaai/jina-reranker-v2-base-multilingual",
    trust_remote_code=True,
)

def rerank(query: str, candidates: list, top_k: int = 20):
    """Score ``candidates`` against ``query`` with the reranker and keep the best.

    Args:
        query: the question text.
        candidates: list of dicts, each with a ``"text"`` entry.
        top_k: number of highest-scoring candidates to return.

    Returns:
        Shallow copies of the candidates with an added ``"rerank_score"``
        (float), sorted by that score descending and cut to ``top_k``. An empty
        candidate list returns ``[]`` without calling the model.

    Raises:
        ValueError: if the model does not produce exactly one logit per pair.
    """
    if not candidates:
        # Nothing to score; tokenising an empty batch would fail.
        return []

    texts = [c["text"] for c in candidates]
    # Tokenize query-passage pairs
    inputs = tokenizer(
        [query] * len(texts),
        texts,
        truncation=True,
        padding="longest",
        return_tensors="pt"
    ).to(device)

    # Compute relevance scores
    with torch.no_grad():
        logits = reranker(**inputs).logits

    if logits.numel() != len(texts):
        raise ValueError(
            f"Expected one relevance logit per candidate, got shape {tuple(logits.shape)} "
            f"for {len(texts)} candidates"
        )
    # Flatten to one score per pair. Unlike a bare squeeze(), this stays 1-D
    # for a single candidate, so tolist() always returns a list.
    scores = logits.reshape(-1).cpu().tolist()

    # Attach rerank scores to copies so the caller's dicts are not modified
    reranked = []
    for cand, score in zip(candidates, scores):
        entry = cand.copy()
        entry["rerank_score"] = float(score)
        reranked.append(entry)

    # Sort by rerank_score descending and return top_k
    reranked.sort(key=lambda x: x["rerank_score"], reverse=True)
    return reranked[:top_k]


config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

configuration_xlm_roberta.py:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_xlm_roberta.py:   0%|          | 0.00/43.8k [00:00<?, ?B/s]

embedding.py:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py:   0%|          | 0.00/9.82k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py:   0%|          | 0.00/28.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mlp.py:   0%|          | 0.00/6.21k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual:
- modeling_xlm_roberta.py
- embedding.py
- xlm_padding.py
- mha.py
- mlp.py
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/557M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [8]:
def example_query():
    """Run one hard-coded question through the hybrid pipeline and print the result.

    Candidates from the three retrieval branches are combined, reranked, and
    the top results are printed with their cid, rerank score and the first 200
    characters of their text.
    """
    k = 20
    query = "nội dung lồng ghép vấn đề bình đẳng giới trong xây dựng văn bản quy phạm pháp luật được quy định thế nào"

    # Stage 1: gather the candidates of every branch
    combined = combine_top_answers(query, top_k_each=k)
    print(f"Combined {len(combined)} candidates from three branches.\n")

    # Stage 2: rerank the combined candidates
    final_results = rerank(query, combined, top_k=k)
    print(f"Top {k} reranked results:\n")

    # Stage 3: show each result
    for res in final_results:
        cid, rerank_score = res["cid"], res["rerank_score"]
        print(f"CID: {cid} | Rerank Score: {rerank_score:.4f}")
        print(f"Text snippet: {res['text'][:200]}...\n")


example_query()


Tokenized query: nội dung lồng ghép vấn đề bình đẳng giới trong xây dựng văn bản quy phạm pháp luật được quy định thế nào
E5 processing query: query: nội dung lồng ghép vấn đề bình đẳng giới trong xây dựng văn bản quy phạm pháp luật được quy định thế nào
Combined 43 candidates from three branches.

Top 20 reranked results:

CID: 53914 | Rerank Score: 0.7734
Text snippet: điều lồng_ghép vấn_đề bình_đẳng giới trong xây_dựng văn_bản quy_phạm_pháp_luật lồng_ghép vấn_đề bình_đẳng giới trong xây_dựng văn_bản quy_phạm_pháp_luật bao_gồm xác_định vấn_đề giới và các biện_pháp g...

CID: 26108 | Rerank Score: 0.5508
Text snippet: mục_đích lồng_ghép vấn_đề bình_đẳng giới trong xây_dựng văn_bản quy_phạm_pháp_luật lồng_ghép vấn_đề bình_đẳng giới trong xây_dựng văn_bản quy_phạm_pháp_luật sau đây gọi tắt là văn_bản là một biện_pháp...

CID: 26109 | Rerank Score: 0.5234
Text snippet: nguyên_tắc lồng_ghép vấn_đề bình_đẳng giới trong xây_dựng văn_bản quy_phạm_pháp_luật lồng_ghép vấn_đề bình_đẳng giới đượ

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

# Evaluate the hybrid pipeline (three retrieval branches + reranker) on the
# test set: MRR and Accuracy@k over the reranked top max(k_values) results.
TEST_SET_CSV = '/content/drive/MyDrive/Data-New/test_set.csv'

test_df = pd.read_csv(TEST_SET_CSV)
questions = test_df['question'].astype(str).tolist()
truth_cids = test_df['cid'].astype(str).tolist()

k_values = [5, 10, 20]
reciprocal_ranks = []
correct_at_k = dict.fromkeys(k_values, 0)
total_q = len(questions)

print("Evaluating hybrid pipeline with reranker...")
for q, truth in tqdm(zip(questions, truth_cids), total=total_q):
    combined = combine_top_answers(q, top_k_each=max(k_values))

    final = rerank(q, combined, top_k=max(k_values))

    ranked_cids = [item['cid'] for item in final]

    try:
        rank = ranked_cids.index(truth) + 1
    except ValueError:
        # Ground truth missing from the reranked list: reciprocal rank 0.
        reciprocal_ranks.append(0.0)
        continue

    reciprocal_ranks.append(1.0 / rank)
    for k in k_values:
        if rank <= k:
            correct_at_k[k] += 1

mrr = float(np.mean(reciprocal_ranks))
acc_at_k = {k: correct_at_k[k] / total_q * 100 for k in k_values}

print("\n=== Hybrid Retrieval + Reranker Evaluation ===")
print(f"Queries evaluated: {total_q}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
for k in k_values:
    print(f"Accuracy@{k}: {acc_at_k[k]:.2f}%")
