# Stiven Saldaña

# Ejercicio 10: Re-ranking
**Objetivo:** Implementar y evaluar un pipeline de Recuperación de Información en dos etapas, y analizar el impacto del re-ranking en la calidad del ranking.

#Parte 1. Preparación del corpus
* Cargar el corpus (documentos/pasajes).
* Cargar las consultas (queries).
* Cargar qrels (relevancia).

In [1]:
!pip install beir sentence-transformers rank_bm25 lightgbm

Collecting beir
  Downloading beir-2.2.0-py3-none-any.whl.metadata (28 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pytrec-eval-terrier (from beir)
  Downloading pytrec_eval_terrier-0.5.10-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Downloading beir-2.2.0-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Downloading pytrec_eval_terrier-0.5.10-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (304 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.8/304.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rank_bm25, pytrec-eval-terrier, beir
Successfully installed beir-2.2.0 pytrec-eval-terrier-0.5.10 rank_bm25-0.2.2


In [2]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
import pandas as pd

  from tqdm.autonotebook import tqdm


In [None]:
DATASET_NAME = "scifact"
DATA_DIR = "../data/beir_datasets"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{DATASET_NAME}.zip"
util.download_and_unzip(url, DATA_DIR)

In [None]:
dataset_path = DATA_DIR + "/" + DATASET_NAME
corpus, queries, qrels = GenericDataLoader(dataset_path).load(split="test")

In [5]:
df_corpus = (
    pd.DataFrame.from_dict(corpus, orient="index")
      .reset_index()
      .rename(columns={"index": "doc_id"})
)

df_corpus


Unnamed: 0,doc_id,text,title
0,4983,Alterations of the architecture of cerebral wh...,Microstructural development of human newborn c...
1,5836,Myelodysplastic syndromes (MDS) are age-depend...,Induction of myelodysplasia by myeloid-derived...
2,7912,ID elements are short interspersed elements (S...,"BC1 RNA, the transcript from a master gene for..."
3,18670,DNA methylation plays an important role in bio...,The DNA Methylome of Human Peripheral Blood Mo...
4,19238,Two human Golli (for gene expressed in the oli...,The human myelin basic protein gene is include...
...,...,...,...
5178,195689316,BACKGROUND The main associations of body-mass ...,Body-mass index and cause-specific mortality i...
5179,195689757,A key aberrant biological difference between t...,Targeting metabolic remodeling in glioblastoma...
5180,196664003,A signaling pathway transmits information from...,Signaling architectures that transmit unidirec...
5181,198133135,AIMS Trabecular bone score (TBS) is a surrogat...,"Association between pre-diabetes, type 2 diabe..."


In [6]:
df_queries = (
    pd.DataFrame.from_dict(queries, orient="index", columns=["query"])
      .reset_index()
      .rename(columns={"index": "query_id"})
)

df_queries

Unnamed: 0,query_id,query
0,1,0-dimensional biomaterials show inductive prop...
1,3,"1,000 genomes project enables mapping of genet..."
2,5,1/2000 in UK have abnormal PrP positivity.
3,13,5% of perinatal mortality is due to low birth ...
4,36,A deficiency of vitamin B12 increases blood le...
...,...,...
295,1379,Women with a higher birth weight are more like...
296,1382,aPKCz causes tumour enhancement by affecting g...
297,1385,cSMAC formation enhances weak ligand signalling.
298,1389,mTORC2 regulates intracellular cysteine levels...


In [7]:
rows = []
for qid, docs in qrels.items():
    for doc_id, rel in docs.items():
        rows.append({
            "query_id": qid,
            "doc_id": doc_id,
            "relevance": rel
        })

df_qrels = pd.DataFrame(rows)
df_qrels

Unnamed: 0,query_id,doc_id,relevance
0,1,31715818,1
1,3,14717500,1
2,5,13734012,1
3,13,1606628,1
4,36,5152028,1
...,...,...,...
334,1379,17450673,1
335,1382,17755060,1
336,1385,306006,1
337,1389,23895668,1


In [21]:
# Elegimos una query cualquiera que tenga varios documentos relevantes
qid = "1379"

print("Query:")
print(df_queries.loc[df_queries["query_id"] == qid, "query"].values[0])

print("\nDocumentos relevantes para esta query:")
df_qrels[(df_qrels["query_id"] == qid) & (df_qrels["relevance"] > 0)]

Query:
Women with a higher birth weight are more likely to develop breast cancer later in life.

Documentos relevantes para esta query:


Unnamed: 0,query_id,doc_id,relevance
331,1379,16322674,1
332,1379,27123743,1
333,1379,23557241,1
334,1379,17450673,1


#Parte 2. Retrieval inicial (baseline)
* Implementar retrieval inicial con BM25
* Obtener métricas: Recall@10 nDCG@10

In [22]:
!pip install rank_bm25 sentence-transformers lightgbm scikit-learn



In [24]:
import numpy as np
from rank_bm25 import BM25Okapi
from tqdm import tqdm

# Implementacion de BM25
tokenized_corpus = [doc.split(" ") for doc in df_corpus["text"].tolist()]
bm25 = BM25Okapi(tokenized_corpus)

doc_id_map = {row['doc_id']: row['text'] for _, row in df_corpus.iterrows()}
doc_ids_list = df_corpus['doc_id'].tolist()

# funcion BM25
def get_bm25_candidates(queries_df, top_k=100):
    results = {}

    for _, row in tqdm(queries_df.iterrows(), total=len(queries_df), desc="BM25 Retrieval"):
        qid = row['query_id']
        query_text = row['query']

        tokenized_query = query_text.split(" ")
        scores = bm25.get_scores(tokenized_query)

        top_n_indices = np.argsort(scores)[::-1][:top_k]

        results[qid] = {}
        for idx in top_n_indices:
            doc_id = doc_ids_list[idx]
            results[qid][doc_id] = scores[idx]

    return results

# BM25 (Baseline)
bm25_results = get_bm25_candidates(df_queries, top_k=100)


BM25 Retrieval: 100%|██████████| 300/300 [00:06<00:00, 47.64it/s]


# Se obtiene las metricas

In [25]:
def evaluate_results(results_dict, qrels_df, k=10):
    ndcg_list = []
    recall_list = []

    qrels_dict = {}
    for _, row in qrels_df.iterrows():
        if row['relevance'] > 0:
            if row['query_id'] not in qrels_dict: qrels_dict[row['query_id']] = {}
            qrels_dict[row['query_id']][row['doc_id']] = row['relevance']

    for qid, doc_scores in results_dict.items():
        if qid not in qrels_dict: continue

        true_relevants = qrels_dict[qid]

        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:k]
        retrieved_ids = [d[0] for d in sorted_docs]

        #Recall@
        hits = sum([1 for doc in retrieved_ids if doc in true_relevants])
        recall = hits / len(true_relevants)
        recall_list.append(recall)

        #nDCG@K
        dcg = 0.0
        for i, doc in enumerate(retrieved_ids):
            if doc in true_relevants:
                rel = true_relevants[doc]
                dcg += rel / np.log2(i + 2)

        ideal_rels = sorted(true_relevants.values(), reverse=True)
        idcg = 0.0
        for i, rel in enumerate(ideal_rels[:k]):
            idcg += rel / np.log2(i + 2)

        ndcg = dcg / idcg if idcg > 0 else 0.0
        ndcg_list.append(ndcg)

    return {
        "Recall@10": np.mean(recall_list),
        "nDCG@10": np.mean(ndcg_list)
    }

metrics_bm25 = evaluate_results(bm25_results, df_qrels, k=10)
print("\n Resultados")
print(metrics_bm25)


 Resultados
{'Recall@10': np.float64(0.6247222222222222), 'nDCG@10': np.float64(0.5056177576488845)}


# Parte 3. Implementación del re-ranking cross-encoder
* Re-rankear los top-k candidatos para cada query.
* Identificar qué documentos cambian de posición en el top 10

In [33]:
from sentence_transformers import CrossEncoder

ce_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# funcion rank
def rerank_cross_encoder(initial_results, queries_df, corpus_map):
    ce_results = {}

    for _, row in tqdm(queries_df.iterrows(), total=len(queries_df), desc="CE Re-ranking"):
        qid = row['query_id']
        query_text = row['query']

        if qid not in initial_results: continue

        # se toma el resultado de BM25
        candidates = list(initial_results[qid].keys())

        pairs = [[query_text, corpus_map[doc_id]] for doc_id in candidates if doc_id in corpus_map]

        if not pairs: continue

        scores = ce_model.predict(pairs)

        ce_results[qid] = {candidates[i]: float(scores[i]) for i in range(len(scores))}

    return ce_results

# Re-ranking
ce_results = rerank_cross_encoder(bm25_results, df_queries, doc_id_map)

# se calcula las métricas
metrics_ce = evaluate_results(ce_results, df_qrels, k=10)
print("\n Resultados")
print(metrics_ce)

target_qid = "1379"
rank_bm25 = sorted(bm25_results[target_qid].items(), key=lambda x: x[1], reverse=True)
rank_ce = sorted(ce_results[target_qid].items(), key=lambda x: x[1], reverse=True)

ids_bm25 = [x[0] for x in rank_bm25]
ids_ce = [x[0] for x in rank_ce]


CE Re-ranking: 100%|██████████| 300/300 [02:43<00:00,  1.84it/s]


 Resultados
{'Recall@10': np.float64(0.7234999999999999), 'nDCG@10': np.float64(0.6261648106112503)}





In [35]:
print(f"{'Doc ID':<10} | {'Rank BM25':<10} | {'Rank CE':<10} | {'Cambio'}")
for i, doc_id in enumerate(ids_ce[:10]):
    rank_original = ids_bm25.index(doc_id) if doc_id in ids_bm25 else -1
    change = rank_original - i
    print(f"{doc_id:<10} | {rank_original:<10} | {i:<10} | {change:+d}")

Doc ID     | Rank BM25  | Rank CE    | Cambio
27123743   | 74         | 0          | +74
37480103   | 1          | 1          | +0
5487448    | 3          | 2          | +1
17450673   | 0          | 3          | -3
23557241   | 30         | 4          | +26
25973484   | 10         | 5          | +5
16322674   | 5          | 6          | -1
13765757   | 11         | 7          | +4
16098747   | 19         | 8          | +11
5864770    | 65         | 9          | +56


# Parte 4. Implementación del re-ranking LTR
* Re-rankear los top-k candidatos para cada query.
* Identificar qué documentos cambian de posición en el top 10

In [34]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
import matplotlib.pyplot as plt

# crear dataset
ltr_data = []
qrels_lookup = df_qrels.set_index(['query_id', 'doc_id'])['relevance'].to_dict()

for qid, docs in bm25_results.items():
    for doc_id, score in docs.items():
        feat_bm25 = score
        feat_len = len(doc_id_map[doc_id].split())
        # relevancia
        target = qrels_lookup.get((qid, doc_id), 0)

        ltr_data.append([qid, doc_id, feat_bm25, feat_len, target])

df_ltr = pd.DataFrame(ltr_data, columns=['qid', 'doc_id', 'bm25', 'doc_len', 'rel'])

# grupos de Queries
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(df_ltr, groups=df_ltr['qid']))

train_df = df_ltr.iloc[train_idx].sort_values('qid')
test_df = df_ltr.iloc[test_idx].sort_values('qid')

X_train = train_df[['bm25', 'doc_len']]
y_train = train_df['rel']
group_train = train_df.groupby('qid').size().to_list()

X_test = test_df[['bm25', 'doc_len']]

#Ranker
ranker = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=100
)
ranker.fit(X_train, y_train, group=group_train)

# Re-rankear
df_ltr['score_ltr'] = ranker.predict(df_ltr[['bm25', 'doc_len']])

ltr_results = {}
for qid, group in df_ltr.groupby('qid'):
    ltr_results[qid] = dict(zip(group['doc_id'], group['score_ltr']))

target_qid = "1379"
print(f"\n Cambios de posición para Query {target_qid}")

if target_qid in ltr_results and target_qid in bm25_results:
    rank_bm25 = sorted(bm25_results[target_qid].items(), key=lambda x: x[1], reverse=True)
    ids_bm25 = [x[0] for x in rank_bm25]

    rank_ltr = sorted(ltr_results[target_qid].items(), key=lambda x: x[1], reverse=True)
    ids_ltr = [x[0] for x in rank_ltr]

    print(f"{'Doc ID':<10} | {'Rank BM25':<10} | {'Rank LTR':<10} | {'Cambio'}")
    print("-" * 50)

    for i, doc_id in enumerate(ids_ltr[:10]):
        rank_original = ids_bm25.index(doc_id) if doc_id in ids_bm25 else -1
        change = rank_original - i
        print(f"{doc_id:<10} | {rank_original:<10} | {i:<10} | {change:+d}")
else:
    print(f"Error")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000481 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 2

 Cambios de posición para Query 1379
Doc ID     | Rank BM25  | Rank LTR   | Cambio
--------------------------------------------------
17450673   | 0          | 0          | +0
37480103   | 1          | 1          | +0
16322674   | 5          | 2          | +3
8842332    | 4          | 3          | +1
23557241   | 30         | 4          | +26
2140497    | 6          | 5          | +1
40949706   | 51         | 6          | +45
38784540   | 36         | 7          | +29
5649538    | 83         | 8          | +75
29253460   | 7          | 9          | -2


# Parte 5. Evaluación post re-ranking
Calcular métricas:

* nDCG@10
* MAP
* Recall@10

In [30]:
import numpy as np

def evaluate_complete(results_dict, qrels_df, k=10):
    ndcg_list = []
    recall_list = []
    ap_list = []

    qrels_dict = {}
    for _, row in qrels_df.iterrows():
        if row['relevance'] > 0:
            if row['query_id'] not in qrels_dict: qrels_dict[row['query_id']] = {}
            qrels_dict[row['query_id']][row['doc_id']] = row['relevance']

    for qid, doc_scores in results_dict.items():
        if qid not in qrels_dict: continue

        true_relevants = qrels_dict[qid]
        num_relevant_total = len(true_relevants)

        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        top_k_docs = [d[0] for d in sorted_docs[:k]]

        #Recall@
        hits = sum([1 for doc in top_k_docs if doc in true_relevants])
        recall = hits / num_relevant_total
        recall_list.append(recall)

        #nDCG@
        dcg = 0.0
        for i, doc in enumerate(top_k_docs):
            if doc in true_relevants:
                rel = true_relevants[doc]
                dcg += rel / np.log2(i + 2)

        ideal_rels = sorted(true_relevants.values(), reverse=True)
        idcg = 0.0
        for i, rel in enumerate(ideal_rels[:k]):
            idcg += rel / np.log2(i + 2)

        ndcg = dcg / idcg if idcg > 0 else 0.0
        ndcg_list.append(ndcg)
        cumulative_hits = 0
        precision_sum = 0.0

        all_retrieved_ids = [d[0] for d in sorted_docs]

        for i, doc in enumerate(all_retrieved_ids):
            if doc in true_relevants:
                cumulative_hits += 1
                precision_at_i = cumulative_hits / (i + 1)
                precision_sum += precision_at_i

        if num_relevant_total > 0:
            ap = precision_sum / num_relevant_total
        else:
            ap = 0.0
        ap_list.append(ap)

    return {
        "nDCG@10": np.mean(ndcg_list),
        "MAP": np.mean(ap_list),
        "Recall@10": np.mean(recall_list)
    }

metricas_bm25 = evaluate_complete(bm25_results, df_qrels, k=10)
metricas_ce = evaluate_complete(ce_results, df_qrels, k=10)
metricas_ltr = evaluate_complete(ltr_results, df_qrels, k=10)

print("1. Baseline (BM25):")
print(f"   nDCG@10:   {metricas_bm25['nDCG@10']:.4f}")
print(f"   MAP:       {metricas_bm25['MAP']:.4f}")
print(f"   Recall@10: {metricas_bm25['Recall@10']:.4f}")

print("\n2. Re-ranking Cross-Encoder:")
print(f"   nDCG@10:   {metricas_ce['nDCG@10']:.4f}")
print(f"   MAP:       {metricas_ce['MAP']:.4f}")
print(f"   Recall@10: {metricas_ce['Recall@10']:.4f}")

print("\n3. Re-ranking LTR:")
print(f"   nDCG@10:   {metricas_ltr['nDCG@10']:.4f}")
print(f"   MAP:       {metricas_ltr['MAP']:.4f}")
print(f"   Recall@10: {metricas_ltr['Recall@10']:.4f}")

1. Baseline (BM25):
   nDCG@10:   0.5056
   MAP:       0.4691
   Recall@10: 0.6247

2. Re-ranking Cross-Encoder:
   nDCG@10:   0.6262
   MAP:       0.5912
   Recall@10: 0.7235

3. Re-ranking LTR:
   nDCG@10:   0.6875
   MAP:       0.6741
   Recall@10: 0.7196
