## Installation

In [None]:
!pip install sentence-transformers



In [None]:
!pip install pytrec_eval



### Imports

In [None]:
"""
This example shows how to train a Cross-Encoder for the MS Marco dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).

The query and the passage are passed simultaneously to a Transformer network. The network then returns
a score between 0 and 1 indicating how relevant the passage is for a given query.

The resulting Cross-Encoder can then be used for passage re-ranking: You retrieve for example 100 passages
for a given query, for example with ElasticSearch, and pass the query+retrieved_passage to the CrossEncoder
for scoring. You sort the results then according to the output of the CrossEncoder.

This gives a significant boost compared to out-of-the-box ElasticSearch / BM25 ranking.
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
import logging
from collections import defaultdict
import numpy as np
import sys
import pytrec_eval
# Configure the root logger used by the logging.info() calls in this notebook.
# - INFO instead of DEBUG: DEBUG also surfaces third-party messages (e.g. the
#   urllib3 connection logs) in the cell outputs.
# - force=True: basicConfig() is a no-op when the root logger already has
#   handlers, in which case the format below would silently not be applied.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, force=True)

## Evaluation preparation

### Set up paths (mount Google Drive and define the project base path)

In [None]:
from google.colab import drive

# Mount Google Drive and derive the project folder from the mount point, so that
# base_path does not depend on the notebook's current working directory.
drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)
# The trailing slash is kept so that code appending a file name to base_path works.
base_path = os.path.join(drive_mount_point, "MyDrive/cross-encoder-reranker-ir-course-2023/")

Mounted at /content/gdrive


In [None]:
# Mounts Google Drive a second time, at '/content/drive'.
# NOTE(review): the paths visible in this notebook all go through the 'gdrive'
# mount point, so this cell looks redundant — confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create the project folder (no error if it already exists). Done in Python
# rather than through a shell command so the path is passed verbatim instead of
# being interpolated into a command line.
os.makedirs(base_path, exist_ok=True)

## Evaluate the model


### Load the fine-tuned model that you trained using the previous notebook. You need to set the path of your own fine-tuned model here.

In [None]:
# Directory of the fine-tuned model, exposed as a Colab form field via #@param.
# NOTE: each prediction cell further down reassigns model_save_path before
# loading its model, so those cells do not use the value set here.
model_save_path = "./gdrive/MyDrive/cross-encoder-reranker-ir-course-2023/finetuned_models/cross-encoder-cross-encoder-ms-marco-MiniLM-L-2-v2-2023-04-10_13-13-29/" #@param {type:"string"}

### Load data (For evaluation on TREC DL'19)

In [None]:
# Download the MS MARCO query archive and unpack it into the working directory.
# NOTE(review): the cells visible in this notebook read the TREC DL'19 queries
# from a different file (msmarco-test2019-queries.tsv.gz); confirm whether the
# files extracted here are still needed.
!wget https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz
!tar -xvzf  queries.tar.gz

--2024-05-05 13:21:09--  https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz
Resolving msmarco.z22.web.core.windows.net (msmarco.z22.web.core.windows.net)... 20.150.34.1
Connecting to msmarco.z22.web.core.windows.net (msmarco.z22.web.core.windows.net)|20.150.34.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18882551 (18M) [application/gzip]
Saving to: ‘queries.tar.gz’


2024-05-05 13:21:11 (13.9 MB/s) - ‘queries.tar.gz’ saved [18882551/18882551]

queries.dev.tsv
queries.eval.tsv
queries.train.tsv


In [None]:
"""
This file evaluates CrossEncoder on the TREC 2019 Deep Learning (DL) Track: https://arxiv.org/abs/2003.07820

TREC 2019 DL is based on the corpus of MS Marco. MS Marco provides a sparse annotation, i.e., usually only a single
passage is marked as relevant for a given query. Many other highly relevant passages are not annotated and hence are treated
as an error if a model ranks those high.

TREC DL instead annotated up to 200 passages per query for their relevance to a given query. It is better suited to estimate
the model performance for the task of reranking in Information Retrieval.

Run:
python eval_cross-encoder-trec-dl.py cross-encoder-model-name

"""


data_folder = 'trec2019-data'
os.makedirs(data_folder, exist_ok=True)


def _download_if_missing(url, filepath):
    """Fetch `url` into `filepath` with util.http_get unless the file already exists."""
    if not os.path.exists(filepath):
        logging.info("Download "+os.path.basename(filepath))
        util.http_get(url, filepath)


# Read test queries: one tab-separated (qid, query) pair per line.
queries = {}
queries_filepath = os.path.join(data_folder, 'msmarco-test2019-queries.tsv.gz')
_download_if_missing('https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz', queries_filepath)

with gzip.open(queries_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        queries[qid] = query

# Read which passages are relevant: relevant_docs[qid][pid] = relevance grade.
# Only grades > 0 are stored.
# NOTE(review): every positive grade counts as relevant here; confirm that this
# is the threshold intended for the binary metrics (MAP / Recall) reported later.
relevant_docs = defaultdict(lambda: defaultdict(int))
qrels_filepath = os.path.join(data_folder, '2019qrels-pass.txt')
_download_if_missing('https://trec.nist.gov/data/deep/2019qrels-pass.txt', qrels_filepath)

with open(qrels_filepath, encoding='utf8') as fIn:
    for line in fIn:
        # Whitespace-separated columns: qid, (unused), pid, grade.
        qid, _, pid, score = line.strip().split()
        score = int(score)
        if score > 0:
            relevant_docs[qid][pid] = score

# Only use queries that have at least one relevant passage.
# .get() is used on purpose: indexing the defaultdict with relevant_docs[qid]
# would insert an empty entry for every unjudged query as a side effect.
relevant_qid = [qid for qid in queries if relevant_docs.get(qid)]

# Read the top 1000 passages that are supposed to be re-ranked:
# passage_cand[qid] is a list of [pid, passage] pairs in file order.
passage_filepath = os.path.join(data_folder, 'msmarco-passagetest2019-top1000.tsv.gz')
_download_if_missing('https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz', passage_filepath)

passage_cand = {}
with gzip.open(passage_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        qid, pid, query, passage = line.strip().split("\t")
        passage_cand.setdefault(qid, []).append([pid, passage])

logging.info("Queries: {}".format(len(queries)))


INFO:root:Download msmarco-test2019-queries.tsv.gz
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): msmarco.z22.web.core.windows.net:443
DEBUG:urllib3.connectionpool:https://msmarco.z22.web.core.windows.net:443 "GET /msmarcoranking/msmarco-test2019-queries.tsv.gz HTTP/1.1" 200 4276


  0%|          | 0.00/4.28k [00:00<?, ?B/s]

INFO:root:Download 2019qrels-pass.txt
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): trec.nist.gov:443
DEBUG:urllib3.connectionpool:https://trec.nist.gov:443 "GET /data/deep/2019qrels-pass.txt HTTP/1.1" 200 187092


  0%|          | 0.00/187k [00:00<?, ?B/s]

INFO:root:Download msmarco-passagetest2019-top1000.tsv.gz
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): msmarco.z22.web.core.windows.net:443
DEBUG:urllib3.connectionpool:https://msmarco.z22.web.core.windows.net:443 "GET /msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz HTTP/1.1" 200 26634062


  0%|          | 0.00/26.6M [00:00<?, ?B/s]

INFO:root:Queries: 200


## Mini-LM

### Prediction

In [None]:
queries_result_list = []  # not populated or read by the cells visible in this notebook
model_save_path = "/content/gdrive/MyDrive/cross-encoder-reranker-ir-course-2023/finetuned_models/cross-encoder-cross-encoder-ms-marco-MiniLM-L-2-v2-2024-05-04_21-10-05"
model = CrossEncoder(model_save_path, max_length=512)

# run[qid] = {pid: score}: the nested dict handed to the pytrec_eval evaluator
# in the evaluation cell.
run = {}
for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    # Each candidate is a [pid, passage] pair.
    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    cross_inp = [[query, c[1]] for c in cand]

    # show_progress_bar=False: the outer tqdm bar already reports progress per
    # query; the inner per-call "Batches" bar only floods the cell output.
    if model.config.num_labels > 1:
        # Cross-Encoder that predicts more than one score: apply softmax and
        # take the value at index 1 as the relevance score.
        cross_scores = model.predict(cross_inp, apply_softmax=True, show_progress_bar=False)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp, show_progress_bar=False).tolist()

    # If a pid occurs more than once among the candidates, its last score wins.
    run[qid] = {pid: float(score) for pid, score in zip(pids, cross_scores)}

### Evaluation

In [None]:
# Score the run against the relevance judgments in relevant_docs.
# run must still map each qid to a {pid: score} dict at this point, i.e. this
# cell has to be executed before the sorting cell that turns them into lists.
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)


def _mean_percent(metric_key):
    """Average one per-query metric over all evaluated queries, scaled by 100."""
    return np.mean([per_query[metric_key] for per_query in scores.values()])*100


print("Queries:", len(relevant_qid))
print("NDCG@10: {:.2f}".format(_mean_percent("ndcg_cut_10")))
print("Recall@100: {:.2f}".format(_mean_percent("recall_100")))
print("MAP@1000: {:.2f}".format(_mean_percent("map_cut_1000")))

Queries: 43
NDCG@10: 60.55
Recall@100: 48.42
MAP@1000: 40.36


### Sorting candidate documents of each query based on their relevance score

In [None]:
import operator

# Replace each query's {pid: score} dict with a list of (pid, score) pairs,
# ordered from highest to lowest score.
# The isinstance check keeps the cell re-runnable: after a first execution
# run[qid] is already a list of pairs, which has no .items().
for qid in run.keys():
  scored = run[qid].items() if isinstance(run[qid], dict) else run[qid]
  run[qid] = sorted(scored, key=operator.itemgetter(1), reverse = True)

### Storing ranking run file

In [None]:
# Build one run-file line per (query, passage) pair:
#   <qid> Q0 <did> <rank> <score> STANDARD
# rank is the 0-based position of the passage in the query's sorted list.
# Expects run[qid] to be a list of (did, score) pairs, as produced by the
# sorting cell.
line_template = "{qid} Q0 {did} {rank} {pred_score} STANDARD"
ranking_lines = []
for qid, ranked in run.items():
  for rank, (did, pred_score) in enumerate(ranked):
    ranking_lines.append(
        line_template.format(qid=qid, did=did, rank=rank, pred_score=str(pred_score))
    )

In [None]:
# Store the run file inside the model directory. os.path.join inserts the path
# separator when model_save_path has no trailing slash; plain string
# concatenation would instead create "<model-dir>ranking.run" next to it.
ranking_run_file_path = os.path.join(model_save_path, "ranking.run")
# The context manager closes the file even if the write raises.
with open(ranking_run_file_path, "w") as f_w:
    f_w.write("\n".join(ranking_lines))

### Print the first three lines of the stored ranking run file

In [None]:
# Shell out to `head`; IPython substitutes the Python variable
# ranking_run_file_path into the command line.
!head -n 3 $ranking_run_file_path

156493 Q0 1960255 0 0.9419941902160645 STANDARD
156493 Q0 1960260 1 0.9321132302284241 STANDARD
156493 Q0 6139386 2 0.8972063064575195 STANDARD


## Tiny-BERT

### Prediction

In [None]:
queries_result_list = []  # not populated or read by the cells visible in this notebook
model_save_path = "/content/gdrive/MyDrive/cross-encoder-reranker-ir-course-2023/finetuned_models/cross-encoder-cross-encoder-ms-marco-TinyBERT-L-2-v2-2024-05-04_22-28-09"
model = CrossEncoder(model_save_path, max_length=512)

# run[qid] = {pid: score}: the nested dict handed to the pytrec_eval evaluator
# in the evaluation cell.
run = {}
for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    # Each candidate is a [pid, passage] pair.
    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    cross_inp = [[query, c[1]] for c in cand]

    # show_progress_bar=False: the outer tqdm bar already reports progress per
    # query; the inner per-call "Batches" bar only floods the cell output.
    if model.config.num_labels > 1:
        # Cross-Encoder that predicts more than one score: apply softmax and
        # take the value at index 1 as the relevance score.
        cross_scores = model.predict(cross_inp, apply_softmax=True, show_progress_bar=False)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp, show_progress_bar=False).tolist()

    # If a pid occurs more than once among the candidates, its last score wins.
    run[qid] = {pid: float(score) for pid, score in zip(pids, cross_scores)}

### Evaluation

In [None]:
# Score the run against the relevance judgments in relevant_docs.
# run must still map each qid to a {pid: score} dict at this point, i.e. this
# cell has to be executed before the sorting cell that turns them into lists.
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)


def _mean_percent(metric_key):
    """Average one per-query metric over all evaluated queries, scaled by 100."""
    return np.mean([per_query[metric_key] for per_query in scores.values()])*100


print("Queries:", len(relevant_qid))
print("NDCG@10: {:.2f}".format(_mean_percent("ndcg_cut_10")))
print("Recall@100: {:.2f}".format(_mean_percent("recall_100")))
print("MAP@1000: {:.2f}".format(_mean_percent("map_cut_1000")))

### Sorting candidate documents of each query based on their relevance score

In [None]:
import operator

# Replace each query's {pid: score} dict with a list of (pid, score) pairs,
# ordered from highest to lowest score.
# The isinstance check keeps the cell re-runnable: after a first execution
# run[qid] is already a list of pairs, which has no .items().
for qid in run.keys():
  scored = run[qid].items() if isinstance(run[qid], dict) else run[qid]
  run[qid] = sorted(scored, key=operator.itemgetter(1), reverse = True)

### Storing ranking run file

In [None]:
# Build one run-file line per (query, passage) pair:
#   <qid> Q0 <did> <rank> <score> STANDARD
# rank is the 0-based position of the passage in the query's sorted list.
# Expects run[qid] to be a list of (did, score) pairs, as produced by the
# sorting cell.
line_template = "{qid} Q0 {did} {rank} {pred_score} STANDARD"
ranking_lines = []
for qid, ranked in run.items():
  for rank, (did, pred_score) in enumerate(ranked):
    ranking_lines.append(
        line_template.format(qid=qid, did=did, rank=rank, pred_score=str(pred_score))
    )

In [None]:
# Store the run file inside the model directory. os.path.join inserts the path
# separator when model_save_path has no trailing slash; plain string
# concatenation would instead create "<model-dir>ranking.run" next to it.
ranking_run_file_path = os.path.join(model_save_path, "ranking.run")
# The context manager closes the file even if the write raises.
with open(ranking_run_file_path, "w") as f_w:
    f_w.write("\n".join(ranking_lines))

### Print the first three lines of the stored ranking run file

In [None]:
# Shell out to `head`; IPython substitutes the Python variable
# ranking_run_file_path into the command line.
!head -n 3 $ranking_run_file_path

## DistilRoBERTa

### Prediction

In [None]:
queries_result_list = []  # not populated or read by the cells visible in this notebook
model_save_path = "/content/gdrive/MyDrive/cross-encoder-reranker-ir-course-2023/finetuned_models/cross-encoder-distilbert-distilroberta-base-2024-05-05_10-29-29"
model = CrossEncoder(model_save_path, max_length=512)

# run[qid] = {pid: score}: the nested dict handed to the pytrec_eval evaluator
# in the evaluation cell.
run = {}
for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    # Each candidate is a [pid, passage] pair.
    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    cross_inp = [[query, c[1]] for c in cand]

    # show_progress_bar=False: the outer tqdm bar already reports progress per
    # query; the inner per-call "Batches" bar only floods the cell output.
    if model.config.num_labels > 1:
        # Cross-Encoder that predicts more than one score: apply softmax and
        # take the value at index 1 as the relevance score.
        cross_scores = model.predict(cross_inp, apply_softmax=True, show_progress_bar=False)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp, show_progress_bar=False).tolist()

    # If a pid occurs more than once among the candidates, its last score wins.
    run[qid] = {pid: float(score) for pid, score in zip(pids, cross_scores)}

INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: cpu
  0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  2%|▏         | 1/43 [05:25<3:48:11, 325.99s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  5%|▍         | 2/43 [11:06<3:48:40, 334.66s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  7%|▋         | 3/43 [17:31<3:58:24, 357.61s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  9%|▉         | 4/43 [22:11<3:32:29, 326.91s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 12%|█▏        | 5/43 [27:40<3:27:37, 327.82s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 14%|█▍        | 6/43 [32:23<3:12:34, 312.28s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 16%|█▋        | 7/43 [38:28<3:17:50, 329.74s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 19%|█▊        | 8/43 [43:26<3:06:26, 319.61s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 21%|██        | 9/43 [49:04<3:04:20, 325.31s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 23%|██▎       | 10/43 [55:04<3:04:51, 336.10s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 26%|██▌       | 11/43 [1:01:35<3:08:12, 352.91s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 28%|██▊       | 12/43 [1:07:14<3:00:04, 348.53s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 30%|███       | 13/43 [1:12:50<2:52:27, 344.93s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 33%|███▎      | 14/43 [1:17:54<2:40:41, 332.45s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 35%|███▍      | 15/43 [1:23:37<2:36:38, 335.66s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 37%|███▋      | 16/43 [1:29:46<2:35:34, 345.72s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 40%|███▉      | 17/43 [1:34:14<2:19:38, 322.25s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 42%|████▏     | 18/43 [1:40:53<2:23:53, 345.34s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 44%|████▍     | 19/43 [1:45:38<2:10:53, 327.22s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 47%|████▋     | 20/43 [1:51:28<2:08:00, 333.92s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 49%|████▉     | 21/43 [1:56:08<1:56:30, 317.75s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 51%|█████     | 22/43 [2:02:20<1:56:57, 334.18s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 53%|█████▎    | 23/43 [2:07:30<1:48:57, 326.89s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 56%|█████▌    | 24/43 [2:13:03<1:44:05, 328.72s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 58%|█████▊    | 25/43 [2:17:30<1:33:04, 310.25s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 60%|██████    | 26/43 [2:22:39<1:27:46, 309.81s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 63%|██████▎   | 27/43 [2:28:42<1:26:51, 325.70s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 65%|██████▌   | 28/43 [2:34:28<1:22:59, 331.97s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 67%|██████▋   | 29/43 [2:39:46<1:16:29, 327.81s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 70%|██████▉   | 30/43 [2:44:35<1:08:30, 316.20s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 72%|███████▏  | 31/43 [2:50:14<1:04:33, 322.80s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 74%|███████▍  | 32/43 [2:54:52<56:43, 309.45s/it]  

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 77%|███████▋  | 33/43 [2:55:04<36:41, 220.18s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 79%|███████▉  | 34/43 [2:59:54<36:09, 241.11s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 81%|████████▏ | 35/43 [2:59:55<22:32, 169.03s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 84%|████████▎ | 36/43 [3:04:33<23:33, 201.89s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 86%|████████▌ | 37/43 [3:09:01<22:09, 221.61s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 88%|████████▊ | 38/43 [3:14:45<21:32, 258.44s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 91%|█████████ | 39/43 [3:20:38<19:06, 286.66s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 93%|█████████▎| 40/43 [3:25:59<14:51, 297.20s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 95%|█████████▌| 41/43 [3:31:03<09:57, 298.96s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 98%|█████████▊| 42/43 [3:36:08<05:00, 300.85s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

100%|██████████| 43/43 [3:41:51<00:00, 309.57s/it]


### Evaluation

In [None]:
# Score the run against the relevance judgments in relevant_docs.
# run must still map each qid to a {pid: score} dict at this point, i.e. this
# cell has to be executed before the sorting cell that turns them into lists.
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)


def _mean_percent(metric_key):
    """Average one per-query metric over all evaluated queries, scaled by 100."""
    return np.mean([per_query[metric_key] for per_query in scores.values()])*100


print("Queries:", len(relevant_qid))
print("NDCG@10: {:.2f}".format(_mean_percent("ndcg_cut_10")))
print("Recall@100: {:.2f}".format(_mean_percent("recall_100")))
print("MAP@1000: {:.2f}".format(_mean_percent("map_cut_1000")))

### Sorting candidate documents of each query based on their relevance score

In [None]:
import operator

# Replace each query's {pid: score} dict with a list of (pid, score) pairs,
# ordered from highest to lowest score.
# The isinstance check keeps the cell re-runnable: after a first execution
# run[qid] is already a list of pairs, which has no .items().
for qid in run.keys():
  scored = run[qid].items() if isinstance(run[qid], dict) else run[qid]
  run[qid] = sorted(scored, key=operator.itemgetter(1), reverse = True)

### Storing ranking run file

In [None]:
# Build one run-file line per (query, passage) pair:
#   <qid> Q0 <did> <rank> <score> STANDARD
# rank is the 0-based position of the passage in the query's sorted list.
# Expects run[qid] to be a list of (did, score) pairs, as produced by the
# sorting cell.
line_template = "{qid} Q0 {did} {rank} {pred_score} STANDARD"
ranking_lines = []
for qid, ranked in run.items():
  for rank, (did, pred_score) in enumerate(ranked):
    ranking_lines.append(
        line_template.format(qid=qid, did=did, rank=rank, pred_score=str(pred_score))
    )

In [None]:
# Store the run file inside the model directory. os.path.join inserts the path
# separator when model_save_path has no trailing slash; plain string
# concatenation would instead create "<model-dir>ranking.run" next to it.
ranking_run_file_path = os.path.join(model_save_path, "ranking.run")
# The context manager closes the file even if the write raises.
with open(ranking_run_file_path, "w") as f_w:
    f_w.write("\n".join(ranking_lines))

### Print the first three lines of the stored ranking run file

In [None]:
# Shell out to `head`; IPython substitutes the Python variable
# ranking_run_file_path into the command line.
!head -n 3 $ranking_run_file_path