In [None]:
import requests
from entry_rag import (
    corpus_mapping,
    model_llm_mapping,
    model_retriever_mapping,
)


dataset = "msmarco"  # msmarco / hotpotqa / nq

# Download the targeted-attack results JSON for the selected dataset.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP failures (e.g. a mistyped dataset name giving
# a 404) directly instead of as an opaque JSON decoding error.
response = requests.get(
    f"https://raw.githubusercontent.com/sleeepeer/PoisonedRAG/refs/heads/main/results/adv_targeted_results/{dataset}.json",
    timeout=30,
)
response.raise_for_status()
data = response.json()
len(data)

In [None]:
# Split the downloaded records into four parallel lists (same order in each).
# The records are iterated under their own name rather than reusing `data`, so
# `data` still holds the full mapping afterwards and this cell can be re-run
# without restarting the kernel.
records = list(data.values())

questions = [record["question"] for record in records]
correct_answers = [record["correct answer"] for record in records]
incorrect_answers = [record["incorrect answer"] for record in records]
adv_texts = [record["adv_texts"] for record in records]
len(questions), len(correct_answers), len(incorrect_answers), len(adv_texts)

In [None]:
import json
from tqdm import trange
import torch
from torch.nn import functional as F
from utils import set_global_pooling_type, pooling


def _normalized_embedding(tokenizer, model, sentences, batch_size=256):
    """Embed ``sentences`` batch by batch and return L2-normalized vectors.

    Args:
        tokenizer: Callable applied to each batch of strings; its result is
            moved to ``model.device`` and unpacked into ``model``.
        model: Callable encoder exposing ``.device``; element ``0`` of its
            output is what gets pooled.
        sentences: The texts to embed. Must be a ``list`` (asserted).
        batch_size: Number of sentences sent through the model per step.

    Returns:
        The per-batch embeddings concatenated along dim 0, each row normalized
        to unit L2 norm.

    Inputs are padded per batch and truncated with ``max_length=256``.
    The pooling step is delegated to the external ``pooling`` helper
    (presumably configured via ``set_global_pooling_type`` -- TODO confirm).
    """
    assert isinstance(sentences, list), "Sentences must be a list of string"

    chunks = []
    for start in trange(0, len(sentences), batch_size):
        batch = sentences[start : start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(model.device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**inputs)

        pooled = pooling(outputs[0], inputs["attention_mask"])
        chunks.append(F.normalize(pooled, p=2, dim=1))

    return torch.concat(chunks, dim=0)


import os

from transformers import AutoTokenizer, AutoModel
from sentence_transformers import util

# Load the retriever registered under "contriever" and embed with mean pooling.
tokenizer = AutoTokenizer.from_pretrained(model_retriever_mapping["contriever"])
model = AutoModel.from_pretrained(model_retriever_mapping["contriever"]).to("cuda")

set_global_pooling_type("mean")

questions_embeddings = _normalized_embedding(tokenizer, model, questions)

# The corpus file is read as JSON Lines: one JSON object per line.
with open(corpus_mapping[dataset], "r") as f:
    corpus = [json.loads(line) for line in f]

corpus_embeddings = _normalized_embedding(tokenizer, model, [c["text"] for c in corpus])

top_k = 50

results = []

# For every question, keep the top_k corpus passages and their scores.
for i in trange(len(questions)):
    query = questions[i]
    query_embedding = questions_embeddings[i].unsqueeze(0)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)

    result = {
        "query": query,
        "similarities": [],
        "hits": [],
    }
    # Only one query is passed per call, so hits[0] is its hit list.
    for hit in hits[0]:
        result["similarities"].append(hit["score"])
        # Append to "hits": the dict has no "answers" key, so appending there
        # raised KeyError on the very first hit.
        result["hits"].append(corpus[hit["corpus_id"]]["text"])
    results.append(result)

# Create the output directory first so the write cannot fail with
# FileNotFoundError after the expensive embedding and search steps.
output_path = f"./data/poisoned_rag_{dataset}/{dataset}_contriever_ground-truth.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    json.dump(results, f)

In [None]:
from tqdm import trange
import torch
from torch.nn import functional as F
import json
from sentence_transformers import util


# NOTE(review): this re-defines the `_normalized_embedding` helper that the
# contriever cell earlier in the notebook already defines; the later definition
# silently shadows the earlier one. Consider keeping a single definition.
def _normalized_embedding(tokenizer, model, sentences, batch_size=256):
    """Embed ``sentences`` batch by batch and return L2-normalized vectors.

    Args:
        tokenizer: Callable applied to each batch of strings; its result is
            moved to ``model.device`` and unpacked into ``model``.
        model: Callable encoder exposing ``.device``; element ``0`` of its
            output is what gets pooled.
        sentences: The texts to embed. Must be a ``list`` (asserted).
        batch_size: Number of sentences sent through the model per step.

    Returns:
        The per-batch embeddings concatenated along dim 0, each row normalized
        to unit L2 norm.

    Inputs are padded per batch and truncated with ``max_length=256``.
    The pooling step is delegated to the external ``pooling`` helper
    (presumably configured via ``set_global_pooling_type`` -- TODO confirm).
    """
    assert isinstance(sentences, list), "Sentences must be a list of string"

    chunks = []
    for start in trange(0, len(sentences), batch_size):
        batch = sentences[start : start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(model.device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**inputs)

        pooled = pooling(outputs[0], inputs["attention_mask"])
        chunks.append(F.normalize(pooled, p=2, dim=1))

    return torch.concat(chunks, dim=0)


from transformers import AutoTokenizer, AutoModel

# Load the retriever registered under "bge"; this cell selects "cls" pooling.
bge_checkpoint = model_retriever_mapping["bge"]
tokenizer = AutoTokenizer.from_pretrained(bge_checkpoint)
model = AutoModel.from_pretrained(bge_checkpoint).to("cuda")

set_global_pooling_type("cls")

questions_embeddings = _normalized_embedding(tokenizer, model, questions)

# The corpus file is read as JSON Lines: one JSON object per line.
with open(corpus_mapping[dataset], "r") as f:
    corpus = [json.loads(line) for line in f]

corpus_texts = [c["text"] for c in corpus]
corpus_embeddings = _normalized_embedding(tokenizer, model, corpus_texts)

top_k = 50

results = []

# For every question, keep the top_k corpus passages and their scores.
for i in trange(len(questions)):
    query_embedding = questions_embeddings[i].unsqueeze(0)
    # Only one query is passed per call, so take element 0 of the result.
    query_hits = util.semantic_search(
        query_embedding, corpus_embeddings, top_k=top_k
    )[0]
    results.append(
        {
            "query": questions[i],
            "similarities": [hit["score"] for hit in query_hits],
            "hits": [corpus[hit["corpus_id"]]["text"] for hit in query_hits],
        }
    )

# NOTE(review): this writes to the current directory, whereas the contriever
# cell writes under ./data/poisoned_rag_{dataset}/ -- confirm which location
# downstream consumers expect.
with open(f"./{dataset}_bge_ground-truth.json", "w") as f:
    json.dump(results, f)