In [None]:
from entry_rag import (
    fake_corpus_mapping,
    model_llm_mapping,
    model_retriever_mapping,
)

import json


# Name of the dataset to process. It selects the input file below and is
# interpolated into the output file names written by later cells.
dataset = "msmarco"

# Look the path up with `dataset` rather than a repeated string literal, so
# changing the name above switches the input file too and cannot drift from
# the output file names.
with open(fake_corpus_mapping[dataset], "r") as f:
    all_data = json.load(f)
print(len(all_data))

# Peek at the first (query, corpus) entry only, to sanity-check the layout.
# The loop-and-break form is a no-op when the mapping is empty.
for query, corpus in all_data.items():
    print(query)
    print(corpus)
    break

In [None]:
from tqdm import trange
import torch
from torch.nn import functional as F
from utils import set_global_pooling_type, pooling


def _normalized_embedding(tokenizer, model, sentences, batch_size=256):
    """Encode ``sentences`` in batches and return L2-normalised embeddings.

    Each batch is tokenised (padded, truncated to 256 tokens), run through
    ``model`` without gradient tracking, reduced with the project's
    ``pooling`` helper, and normalised along dim 1. The per-batch results are
    concatenated along dim 0.

    Args:
        tokenizer: Callable tokenizer returning an object that supports
            ``.to(device)`` and ``["attention_mask"]``.
        model: Encoder exposing ``.device``; element 0 of its output is what
            gets pooled.
        sentences: List of strings to embed. Must be a ``list``.
        batch_size: Number of sentences encoded per forward pass.

    Returns:
        A tensor with one normalised embedding row per input sentence.

    Raises:
        AssertionError: If ``sentences`` is not a list.

    NOTE(review): with an empty list no batch is produced, so the final
    concatenation has nothing to join and is expected to fail; confirm
    callers never pass ``[]``.
    """
    assert isinstance(sentences, list), "Sentences must be a list of string"

    batch_embeddings = []
    for start in trange(0, len(sentences), batch_size):
        batch = sentences[start : start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        )
        # Put the token tensors on the same device as the model weights.
        inputs = inputs.to(model.device)

        with torch.no_grad():
            outputs = model(**inputs)

        # The reduction is delegated to `pooling`; presumably the mode set
        # through set_global_pooling_type decides how it reduces.
        pooled = pooling(outputs[0], inputs["attention_mask"])
        batch_embeddings.append(F.normalize(pooled, p=2, dim=1))

    return torch.concat(batch_embeddings, dim=0)


from transformers import AutoTokenizer, AutoModel

import os

from sentence_transformers import util

# Load the tokenizer and encoder registered under the "contriever" key.
tokenizer = AutoTokenizer.from_pretrained(model_retriever_mapping["contriever"])
model = AutoModel.from_pretrained(model_retriever_mapping["contriever"])

# Select "mean" mode for the project's pooling helper before embedding
# (presumably read by `pooling` inside _normalized_embedding).
set_global_pooling_type("mean")

# Number of best-scoring passages kept per query. Defined once, outside the
# loop, because it does not change between iterations.
top_k = 10

results = []
for query, corpus in all_data.items():
    query_embedding = _normalized_embedding(tokenizer, model, [query])
    corpus_embeddings = _normalized_embedding(tokenizer, model, corpus)

    # A single query is searched at a time, so its ranking is hits[0].
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    top_hits = hits[0]

    result = {
        "query": query,
        "similarities": [hit["score"] for hit in top_hits],
        # Map each hit's corpus index back to the passage text.
        "hits": [corpus[hit["corpus_id"]] for hit in top_hits],
    }
    results.append(result)

# NOTE(review): this cell writes under ./data/poisoned_rag_<dataset>/ while the
# bge cell writes directly under ./data/ -- confirm which layout is intended.
output_path = (
    f"./data/poisoned_rag_{dataset}/{dataset}_fakecorpus_contriever_ground-truth.json"
)
# Create the target directory first so the embedding work above is not lost
# to a missing-directory error at write time.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    json.dump(results, f)

In [None]:
from tqdm import trange
import torch
from torch.nn import functional as F
import json
from sentence_transformers import util
from utils import set_global_pooling_type, pooling


def _normalized_embedding(tokenizer, model, sentences, batch_size=256):
    """Return L2-normalised embeddings for a list of sentences.

    NOTE(review): this repeats the helper of the same name defined in an
    earlier cell; re-running this cell rebinds the name to this copy.

    The sentences are processed ``batch_size`` at a time: tokenised with
    padding and truncation to 256 tokens, passed through ``model`` under
    ``torch.no_grad()``, reduced by the project's ``pooling`` helper using the
    attention mask, then normalised along dim 1. Batch results are joined
    along dim 0.

    Args:
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and lookup of ``"attention_mask"``.
        model: Encoder with a ``.device`` attribute; item 0 of its output is
            pooled.
        sentences: The strings to embed, as a ``list``.
        batch_size: How many sentences go into each forward pass.

    Returns:
        A tensor holding one normalised embedding row per sentence.

    Raises:
        AssertionError: If ``sentences`` is not a list.

    NOTE(review): an empty list yields no batches, so the closing
    concatenation is expected to fail; verify callers never pass ``[]``.
    """
    assert isinstance(sentences, list), "Sentences must be a list of string"

    normalized_chunks = []
    for offset in trange(0, len(sentences), batch_size):
        chunk = sentences[offset : offset + batch_size]
        tokens = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(model.device)  # keep inputs on the model's device

        with torch.no_grad():
            forward_out = model(**tokens)

        # How the token states are reduced is decided inside `pooling`
        # (presumably by the globally configured pooling type).
        pooled = pooling(forward_out[0], tokens["attention_mask"])
        normalized_chunks.append(F.normalize(pooled, p=2, dim=1))

    return torch.concat(normalized_chunks, dim=0)


from transformers import AutoTokenizer, AutoModel

import os

# Load the tokenizer and encoder registered under the "bge" key.
tokenizer = AutoTokenizer.from_pretrained(model_retriever_mapping["bge"])
model = AutoModel.from_pretrained(model_retriever_mapping["bge"])

# Select "cls" mode for the project's pooling helper before embedding
# (presumably read by `pooling` inside _normalized_embedding).
set_global_pooling_type("cls")

# Number of best-scoring passages kept per query. Defined once, outside the
# loop, because it does not change between iterations.
top_k = 10

results = []
for query, corpus in all_data.items():
    query_embedding = _normalized_embedding(tokenizer, model, [query])
    corpus_embeddings = _normalized_embedding(tokenizer, model, corpus)

    # `util` is already imported at the top of this cell, so it is not
    # re-imported per iteration. One query is searched at a time, hence hits[0].
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    top_hits = hits[0]

    result = {
        "query": query,
        "similarities": [hit["score"] for hit in top_hits],
        # Map each hit's corpus index back to the passage text.
        "hits": [corpus[hit["corpus_id"]] for hit in top_hits],
    }
    results.append(result)

# NOTE(review): this cell writes directly under ./data/ while the contriever
# cell writes under ./data/poisoned_rag_<dataset>/ -- confirm which is intended.
output_path = f"./data/{dataset}_fakecorpus_bge_ground-truth.json"
# Create the target directory first so the embedding work above is not lost
# to a missing-directory error at write time.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    json.dump(results, f)