In [None]:
!pip install beir transformers sentence-transformers --quiet

Here we load the dataset in the standard BEIR format, i.e. it has three parts: corpus, queries, and qrels.

In [None]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader

# Download (if not already present) and unzip the BEIR HotpotQA archive, then
# load its test split. Plain string literals are used because nothing is
# interpolated into the URL or the output directory.
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/hotpotqa.zip"
out_dir = "./hotpotqa"
data_path = util.download_and_unzip(url, out_dir)
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

# Peek at one entry of each mapping. next(iter(...)) reads only the first
# value instead of copying every value into a temporary list.
print(f"Sample Query: {next(iter(queries.values()))}")
print(f"Sample Corpus: {next(iter(corpus.values()))}")
print(f"Sample qrel: {next(iter(qrels.values()))}")


Here we load the embedding models, generate the query embedding and the corpus embeddings, compute the cosine similarity between them, and then retrieve the top 10 passages that are most similar to the query.

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Bi-encoders used for first-stage dense retrieval: a small and a large
# checkpoint, so the two pipelines can be compared on the same query.
smallembmodel = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
largeembmodel = SentenceTransformer("Snowflake/snowflake-arctic-embed-l")
# Here I tried encoding the corpus in batches in case GPU memory runs out
# def encode_in_batches(corpus_texts, model, batch_size=64):
#     corpus_embeddings = []
#     for i in range(0, len(corpus_texts), batch_size):
#         batch_texts = corpus_texts[i:i + batch_size]
#         batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
#         corpus_embeddings.append(batch_embeddings)
#     return torch.cat(corpus_embeddings, dim=0)

def retrieve_candidates(query, model, top_k=10, chunk_size=20_000, batch_size=64):
    """Return the texts of the corpus passages most similar to ``query``.

    The module-level ``corpus`` is encoded chunk by chunk and only a running
    top-k is kept between chunks, so at most ``chunk_size`` passage embeddings
    are held in memory at any time. Encoding the entire corpus into a single
    tensor is what exhausts GPU memory on a large corpus.

    Args:
        query: Query string to search for.
        model: Embedding model exposing a SentenceTransformer-style
            ``encode(sentences, batch_size=..., convert_to_tensor=...,
            normalize_embeddings=...)`` method.
        top_k: Maximum number of passages to return.
        chunk_size: Number of passages embedded and scored per step.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        A list of passage texts ordered from most to least similar. It holds
        fewer than ``top_k`` items when the corpus is smaller than ``top_k``
        and is empty when the corpus is empty or ``top_k`` is not positive.

    Note:
        Every call re-encodes the whole corpus. When many queries are scored
        against the same corpus, the embeddings should be computed once and
        reused instead.
    """
    doc_texts = [doc['text'] for doc in corpus.values()]
    if not doc_texts or top_k <= 0:
        return []

    # Unit-length embeddings make the dot product below a cosine similarity.
    # reshape(-1) also accepts a (1, dim) embedding from a one-element list.
    query_embedding = model.encode(
        query, convert_to_tensor=True, normalize_embeddings=True
    ).reshape(-1)

    best_scores = None
    best_indices = None
    for start in range(0, len(doc_texts), chunk_size):
        chunk_embeddings = model.encode(
            doc_texts[start:start + chunk_size],
            batch_size=batch_size,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        scores = torch.matmul(chunk_embeddings, query_embedding)
        indices = torch.arange(start, start + scores.shape[0], device=scores.device)

        # Merge this chunk with the best candidates seen so far, then keep
        # only the top_k of the union.
        if best_scores is not None:
            scores = torch.cat([best_scores, scores])
            indices = torch.cat([best_indices, indices])
        kept = torch.topk(scores, k=min(top_k, scores.shape[0]))
        best_scores = kept.values
        best_indices = indices[kept.indices]

    return [doc_texts[i] for i in best_indices.tolist()]

# The first test query serves as the running example for the rest of the notebook.
query = list(queries.values())[0]

# First-stage retrieval with each bi-encoder (small model first, then large).
topkpassagessmall = retrieve_candidates(query=query, model=smallembmodel)
topkpassageslarge = retrieve_candidates(query=query, model=largeembmodel)

print("Top-k passages (small model):", topkpassagessmall)
print("Top-k passages (large model):", topkpassageslarge)

Here the reranker models are loaded; they rerank the top-k passages for a given query. Each query–passage pair is first tokenized and then passed through the reranker model, which produces a relevance score; the passages are then reordered according to these scores.

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Small cross-encoder reranker and its tokenizer (same checkpoint).
smallrerankmodel = AutoModelForSequenceClassification.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")
smallreranktokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")

# Large cross-encoder reranker and its tokenizer (same checkpoint).
largererankmodel = AutoModelForSequenceClassification.from_pretrained("mixedbread-ai/mxbai-rerank-large-v1")
largereranktokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-rerank-large-v1")

def rerank_passages(query, passages, model, tokenizer):
    """Reorder ``passages`` by the relevance score a cross-encoder assigns them.

    Each (query, passage) pair is tokenized together and scored in a single
    forward pass; column 0 of the returned logits is used as the score.

    Args:
        query: Query string paired with every passage.
        passages: Sequence of passage texts to rerank.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` tensor of shape
            (len(passages), num_labels).
        tokenizer: Callable that tokenizes parallel lists of queries and
            passages and returns a mapping of tensors.

    Returns:
        A new list with the passages sorted from highest to lowest score.
        An empty input yields an empty list.
    """
    passages = list(passages)
    if not passages:
        # Nothing to tokenize or score.
        return []

    inputs = tokenizer(
        [query] * len(passages),
        passages,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    # If the model reports the device it lives on (Hugging Face models expose
    # `.device`), move the inputs there so a GPU model does not receive CPU
    # tensors.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: without no_grad the forward pass keeps activations
    # alive for a backward pass that never happens.
    with torch.no_grad():
        outputs = model(**inputs)

    scores = outputs.logits[:, 0]
    ranked_indices = torch.argsort(scores, descending=True).tolist()
    return [passages[i] for i in ranked_indices]

# Rerank each retriever's candidates with the cross-encoder of matching size.
rankedpassagessmall = rerank_passages(
    query,
    topkpassagessmall,
    model=smallrerankmodel,
    tokenizer=smallreranktokenizer,
)
rankedpassageslarge = rerank_passages(
    query,
    topkpassageslarge,
    model=largererankmodel,
    tokenizer=largereranktokenizer,
)


Here the relevance labels are generated from the ground-truth data (qrels), and the ranking quality is measured against an ideal ranking using NDCG.

In [None]:
from sklearn.metrics import ndcg_score
import numpy as np

def create_relevance_labels(query_id, corpus_ids, qrels):
    """Build binary relevance labels for ``corpus_ids`` with respect to a query.

    Args:
        query_id: Key of the query in ``qrels``.
        corpus_ids: Iterable of document ids, in ranked order.
        qrels: Mapping ``query_id -> {doc_id: relevance score}``.

    Returns:
        A list parallel to ``corpus_ids`` holding 1 where the document has a
        positive relevance score for the query and 0 otherwise. A document
        that is unjudged, or judged with a score of 0, is labelled 0. A query
        missing from ``qrels`` yields all zeros.
    """
    # Look the query up once instead of once per document.
    judged = qrels.get(query_id, {})
    # Membership alone is not enough: an explicit judgement of 0 means
    # "not relevant", so require a strictly positive score.
    return [1 if judged.get(doc_id, 0) > 0 else 0 for doc_id in corpus_ids]

def compute_ndcg_at_k(query_id, passages, top_k_passages, model_name, qrels):
    """Compute and print the NDCG of a ranked passage list for one query.

    Passage texts are mapped back to document ids through the module-level
    ``corpus`` and labelled with ``create_relevance_labels``. DCG uses linear
    gain with a log2 rank discount. The ideal DCG is built from all documents
    judged for the query in ``qrels`` (truncated to the length of the ranked
    list), so relevant documents that were never retrieved lower the score.

    Args:
        query_id: Key of the query in ``qrels``.
        passages: Unused; kept so existing call sites keep working.
        top_k_passages: Passage texts ordered from best to worst.
        model_name: Label inserted into the printed line.
        qrels: Mapping ``query_id -> {doc_id: relevance score}``.

    Returns:
        The NDCG value as a float. It is 0.0 when the ranked list is empty or
        the query has no relevant documents.

    Raises:
        ValueError: If a passage text does not occur in ``corpus``.
    """
    import math

    # Resolve text -> doc id in a single pass over the corpus. Matching is on
    # the 'text' field only, because corpus entries may carry other fields
    # (such as a title) that a bare {'text': ...} dict would never equal.
    # NOTE(review): if two corpus documents share identical text, the first
    # one encountered is used — confirm this is acceptable for the dataset.
    wanted = set(top_k_passages)
    text_to_id = {}
    for doc_id, doc in corpus.items():
        if len(text_to_id) == len(wanted):
            break
        text = doc['text']
        if text in wanted and text not in text_to_id:
            text_to_id[text] = doc_id

    missing = [passage for passage in top_k_passages if passage not in text_to_id]
    if missing:
        raise ValueError(f"{len(missing)} passage(s) not found in corpus")

    top_k_doc_ids = [text_to_id[passage] for passage in top_k_passages]
    relevance_labels = create_relevance_labels(query_id, top_k_doc_ids, qrels)

    # Ideal ranking: every judged document for this query, best labels first,
    # cut to the same depth as the evaluated list.
    judged_ids = list(qrels.get(query_id, {}))
    ideal_labels = sorted(
        create_relevance_labels(query_id, judged_ids, qrels), reverse=True
    )[:len(relevance_labels)]

    def _dcg(labels):
        # rank is 0-based, hence log2(rank + 2) for the 1-based position + 1.
        return sum(label / math.log2(rank + 2) for rank, label in enumerate(labels))

    ideal_dcg = _dcg(ideal_labels)
    ndcg = _dcg(relevance_labels) / ideal_dcg if ideal_dcg > 0 else 0.0
    print(f"NDCG@10 ({model_name}): {ndcg}")
    return ndcg

# Score both retrieve-then-rerank pipelines on the first test query.
query_id = list(queries)[0]
compute_ndcg_at_k(query_id, topkpassagessmall, rankedpassagessmall, model_name="Small Model", qrels=qrels)
compute_ndcg_at_k(query_id, topkpassageslarge, rankedpassageslarge, model_name="Large Model", qrels=qrels)


**I tried to run the models, but about 2 hours into the run I got a CUDA out-of-memory error. I also tried running in batches with a batch size of 64, but the error persisted. Therefore, I was not able to generate the output.**