In [1]:
!pip install beir transformers sentence-transformers faiss-gpu peft --quiet

In [2]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader

# Download HotpotQA dataset
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/hotpotqa.zip"
out_dir = "./hotpotqa"
data_path = util.download_and_unzip(url, out_dir)
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

  from tqdm.autonotebook import tqdm


./hotpotqa/hotpotqa.zip:   0%|          | 0.00/624M [00:00<?, ?iB/s]

  0%|          | 0/5233329 [00:00<?, ?it/s]

In [3]:
from peft import LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer

model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
lora_config = LoraConfig(
    r=8,  
    lora_alpha=32,  
    lora_dropout=0.1, 
    target_modules=["dense", "query", "key", "value"]  
)

lora_model = get_peft_model(model, lora_config)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
import torch

def retrieve_candidates(query, top_k=10, model=lora_model):
    query_embedding = model.encode([query], convert_to_tensor=True)
    corpus_embeddings = model.encode([doc['text'] for doc_id, doc in corpus.items()], convert_to_tensor=True)
    scores = torch.matmul(query_embedding, corpus_embeddings.T).squeeze(0)
    top_k_indices = torch.topk(scores, k=top_k).indices
    return [list(corpus.values())[i]['text'] for i in top_k_indices]

query = list(queries.values())[5]
print("Query:", query)
top_k_passages = retrieve_candidates(query)
print("Top-k Passages:", top_k_passages)


Query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/163542 [00:00<?, ?it/s]

Top-k Passages: ["2014 S/S is the debut album of South Korean group WINNER. It was released on August 12, 2014 by the group's record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album's songs.", 'The discography of South Korean girl group S.E.S. consists of seven studio albums, four compilation albums, three video albums, and one remix album. The group debuted in 1997 under SM Entertainment and disbanded in 2002. The group then made their comeback in 2016.', 'This is the discography of the South Korean boy group Teen Top. The group have been in the Korean music business since July 2010, debuting with their single, "Clap". They have released a total of eleven albums consisting of two studio albums, seven extended plays, three single albums, and one compilation album.', "S.T 01 Now is South Korean boy band SS501's first full-length studio album, released on November 10, 2006 by DSP Media.", 'The discography of the South Korea

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model
import torch

def load_lora_rerank_model(model_name, lora_r=8, lora_alpha=16, lora_dropout=0.1):
    base_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=["query", "key", "value", "output.dense"]  
    )
    
    lora_model = get_peft_model(base_model, lora_config)
    return lora_model


tokenizer = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-MiniLM-L-12-v2')

def rerank_passages(query, top_k_passages):
    input_pairs = [[query, passage] for passage in top_k_passages]
    inputs = tokenizer(input_pairs, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = reranker_model(**inputs)
        rerank_scores = outputs.logits.squeeze(-1)  
        
    reranked_passages = sorted(zip(top_k_passages, rerank_scores), key=lambda x: x[1], reverse=True)
    return [passage for passage, score in reranked_passages]

reranker_model = load_lora_rerank_model('cross-encoder/ms-marco-MiniLM-L-12-v2')
reranked_passages = rerank_passages(query, top_k_passages)
print("Query:", query)
print("Reranked Passages:", reranked_passages)

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?
Reranked Passages: ["2014 S/S is the debut album of South Korean group WINNER. It was released on August 12, 2014 by the group's record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album's songs.", 'S (에스) is a South Korean project group consisting of three members: Kangta, Lee Ji-hoon and Shin Hye-sung. The group debuted in 2003, under the SM Entertainment label. After 11 years, they released and promoted another mini-album in 2014.', 'The discography of South Korean girl group S.E.S. consists of seven studio albums, four compilation albums, three video albums, and one remix album. The group debuted in 1997 under SM Entertainment and disbanded in 2002. The group then made their comeback in 2016.', 'The discography of South Korean boy band 24K, formed by Choeun Entertainment consists of one studio album, three extended plays (EPs), one si

In [8]:
      
from sklearn.metrics import ndcg_score

def create_relevance_labels(query_id, corpus_ids, qrels):
    relevance_labels = []
    for doc_id in corpus_ids:
        relevance_labels.append(1 if doc_id in qrels.get(query_id, {}) else 0)
    return relevance_labels

def compute_ndcg_at_k(query_id, top_k_passages, qrels):
    # Match top_k_passages with document IDs in the corpus
    top_k_doc_ids = []
    for passage in top_k_passages:
        for doc_id, doc in corpus.items():
            if doc['text'] == passage:
                top_k_doc_ids.append(doc_id)
                break
    relevance_labels = create_relevance_labels(query_id, top_k_doc_ids, qrels)
    ndcg = ndcg_score([relevance_labels], [list(range(len(top_k_doc_ids), 0, -1))])
    print(f"NDCG@10: {ndcg}")
    return ndcg

# Evaluate the model on a sample query
query_id = list(queries.keys())[5]
compute_ndcg_at_k(query_id, top_k_passages, qrels)

NDCG@10: 1.0


1.0