In [1]:
import os
import pyterrier as pt
import pyterrier_rag as ptr
from datasets import load_dataset
import ir_datasets
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np
import pyterrier_alpha as pta

In [2]:
# ----------------------------------------------------------------------------
# 2. Load the GPQA dataset
# ----------------------------------------------------------------------------
# Pull the "gpqa_diamond" configuration ("train" split) of Idavidrein/gpqa.
print("🔍 Carico GPQA da Hugging Face...")
# NOTE(review): the variable is called `mmlu` but it holds GPQA; the name is
# kept because the DataFrame-building cell reads it.
mmlu = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")

🔍 Carico GPQA da Hugging Face...


In [3]:
# Turn the loaded dataset into the query table used by the rest of the
# notebook: one row per question with its id, text and reference answer.
df = (
    pd.DataFrame(mmlu)
    .rename(
        columns={
            "Record ID": "qid",
            "Question": "query",
            "Correct Answer": "gold_answer",
        }
    )
    .loc[:, ["qid", "query", "gold_answer"]]
)

---

In [6]:
# Export the questions as a two-column (qid, text) TSV for external tools.
# Newlines inside a question are written as the two characters "\n" so that
# every query stays on a single TSV line.
data = (
    df[["qid", "query"]]
    .rename(columns={"query": "text"})
    # regex=False makes the replacement literal on every pandas version; the
    # default for `regex` changed between pandas 1.x and 2.x.
    .assign(text=lambda frame: frame["text"].str.replace("\n", "\\n", regex=False))
)
data.to_csv("../data/raw/rag/gpqa-queries.tsv", sep="\t", index=False)

In [4]:
# Open the MS MARCO v2.1 segment corpus through ir_datasets and through
# PyTerrier's "irds:" wrapper; the retrieval pipeline uses `pt_dataset`
# to look up segment text.
dataset = ir_datasets.load('msmarco-segment-v2.1')
pt_dataset = pt.get_dataset("irds:msmarco-segment-v2.1")

In [5]:
def rename_segment(run):
    """Return ``run`` with its ``segment`` column renamed to ``text``.

    The input frame is left untouched; a renamed copy is returned.
    """
    return run.rename(columns={"segment": "text"})
# Wrap rename_segment as a transformer so it can be chained with >>.
rename_pipe = pt.apply.generic(rename_segment)

In [6]:
# Fetch the prebuilt index artifact for the MS MARCO segment corpus from Hugging Face.
index = pta.Artifact.from_hf('namawho/msmarco-segment-v2.1.pisa')
# BM25 retrieval with a rank cutoff of 200 (% binds tighter than >>), then
# attach the "segment" field from the corpus and rename it to "text".
bm25_ret = index.bm25(verbose=True) % 200 >> pt.text.get_text(pt_dataset, "segment") >> rename_pipe

In [7]:
# Run the BM25 pipeline over every question in df (slow: the recorded run
# took about 7 minutes for 198 queries).
retrieved = bm25_ret.transform(df)

PISA bm25: 100%|██████████| 198/198 [07:17<00:00,  2.21s/query]


In [8]:
# Add the two constant columns and keep the six run-file fields in order.
retrieved = retrieved.assign(Q0="Q0", tag="bm25")[
    ["qid", "Q0", "docno", "rank", "score", "tag"]
]

# Write to file: space-separated, no header and no index column.
retrieved.to_csv(
    "../data/raw/rag/__bm25__msmarco-segment-gpqa.run",
    sep=" ",
    index=False,
    header=False,
)

---

In [5]:
def _read_run(path):
    """Read a headerless, tab-separated ranking file into a DataFrame.

    The file is expected to have seven columns, named here as
    qid, Q0, doc_id, rank, score, run_name and text.
    """
    return pd.read_csv(
        path,
        sep="\t",
        names=["qid", "Q0", "doc_id", "rank", "score", "run_name", "text"],
    )


print("🔍 Carico ranking esistente...")
# Both re-ranked runs share the same layout, so they go through one reader.
df_run_base = _read_run(
    "../data/processed/rag/__setencoder-novelty-base__msmarco-segment-gpqa.tsv"
)
df_run_ea = _read_run(
    "../data/processed/rag/__setencoder-novelty-ea__msmarco-segment-gpqa.tsv"
)

🔍 Carico ranking esistente...


In [6]:
# Open the MS MARCO v2.1 segment corpus and read every document into a list.
# NOTE(review): the recorded run iterated 113,520,750 documents in about
# 50 minutes, and the whole corpus is then held in memory. ir_datasets'
# docs_store() may allow lookup by doc_id without materialising everything —
# confirm before switching.
dataset = ir_datasets.load('msmarco-segment-v2.1')
pt_dataset = pt.get_dataset("irds:msmarco-segment-v2.1")
total_docs = dataset.docs_count() 
all_docs = list(tqdm(dataset.docs_iter(), total=total_docs))

100%|██████████| 113520750/113520750 [50:15<00:00, 37645.17it/s] 


In [7]:
# Index the loaded documents by their doc_id.
# NOTE(review): no use of doc_dict is visible in this part of the notebook —
# confirm it is still needed before paying for the extra memory.
doc_dict = {doc.doc_id: doc for doc in all_docs}

In [8]:
from pyterrier_rag.backend import OpenAIBackend
from pyterrier_rag.prompt import Concatenator
from pyterrier_rag.readers import Reader
from pyterrier_rag.prompt import PromptTransformer, prompt
from fastchat.model import get_conversation_template

In [9]:
# Name of the model requested from the backend; the commented line is the
# smaller 8B alternative.
#model_name = "llama-3-8b-instruct"
model_name = "llama-3.3-70b-instruct"

In [26]:
# System prompt: answer from the supplied context only and never mention it.
# NOTE(review): the indentation inside both triple-quoted strings is part of
# the string contents, so it is included in whatever is built from them.
system_message = r"""You are an expert Q&A system that is trusted around the world. 
        Always answer the query using the provided context information, and not prior knowledge.

        Some rules to follow:
        1. Never directly reference the given context in your answer
        2. Avoid statements like 'Based on the context, ...' or 
        'The context information ...' or anything along those lines."""
# Instruction template with two placeholders: {{ qcontext }} and {{ query }}.
prompt_text = """Context information is below.
            ---------------------
            {{ qcontext }}
            ---------------------
            Given the context information, answer to the given query.
            
            Query: {{ query }}

            Do not include any explanation or additional text.

            Answer: """

# Build the prompt transformer from a FastChat conversation template, the
# system message and the instruction template.
# NOTE(review): this assignment rebinds `prompt`, which is also imported from
# pyterrier_rag.prompt in the import cell; that import is shadowed from here on.
template = get_conversation_template("meta-llama-3.1-sp")
prompt = PromptTransformer(
    conversation_template=template,
    system_message=system_message,
    instruction=prompt_text,
    input_fields=["query", "qcontext"],
    api_type="openai"
)

In [32]:
def get_rank(df_queries, run=None):
    """Attach a stored ranking to each query row.

    Parameters
    ----------
    df_queries : pandas.DataFrame
        Query frame with a ``qid`` column.
    run : pandas.DataFrame, optional
        Ranking frame with a ``qid`` column. When omitted, the notebook-level
        ``df_run_ea`` is used, which is the previous hard-wired behaviour.

    Returns
    -------
    pandas.DataFrame
        Left join of ``df_queries`` with ``run`` on ``qid``; queries without
        any ranked document are kept with missing values in the run columns.
    """
    if run is None:
        # Looked up at call time so reloading df_run_ea is picked up.
        run = df_run_ea
    return df_queries.merge(run, on="qid", how="left")
# Wrap get_rank as a transformer so it can start the generation pipeline.
get_rank_pipe = pt.apply.generic(get_rank)

In [28]:
from transformers import AutoTokenizer

# Tokenizer for the Llama 3.3 70B instruct (AWQ) checkpoint; the 8B variant
# is kept commented out.
#tokenizer = AutoTokenizer.from_pretrained("casperhansen/llama-3-8b-instruct-awq")
tokenizer = AutoTokenizer.from_pretrained("casperhansen/llama-3.3-70b-instruct-awq")

# Generation settings handed to the backend.
# NOTE(review): temperature is non-zero, so answers presumably vary between
# runs — confirm whether the reported scores are expected to be reproducible.
generation_args={
    "temperature": 0.6,
    "max_tokens": 140,
}

# This could equally be a real OpenAI model.
# The API key is read from the IDA_LLM_API_KEY environment variable; the
# lookup raises KeyError when the variable is not set.
llama = OpenAIBackend(model_name, 
                      api_key=os.environ['IDA_LLM_API_KEY'],
                      generation_args=generation_args,
                      base_url="http://api.llm.apps.os.dcs.gla.ac.uk/v1", 
                      verbose=True, 
                      parallel=64)

In [33]:
# Reader that sends the built prompt to the backend.
llama_reader = Reader(llama, prompt=prompt)
# Full pipeline: stored ranking with a rank cutoff of 5 -> concatenate the
# passages into one context -> generate the answer.
# NOTE(review): max_length / max_per_context are presumably token budgets
# measured with `tokenizer` — confirm against the Concatenator documentation.
set_encoder_llama = get_rank_pipe % 5 >> Concatenator(tokenizer=tokenizer, max_length=8191,max_per_context=1638) >> llama_reader

In [None]:
from tqdm import tqdm

# Number of processed queries between two checkpoints, and the checkpoint file.
save_every = 100
partial_save_path = "../data/processed/rag/ea_gpqa_rag_output_cut_5.tsv"


def _write_checkpoint(frames):
    """Concatenate ``frames``, write them to ``partial_save_path`` and return the result."""
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(partial_save_path, sep="\t", index=False)
    print(f"💾 Salvati {len(combined)} risultati su {partial_save_path}")
    return combined


# Resume from an earlier checkpoint when one exists, otherwise start empty.
try:
    df_partial = pd.read_csv(partial_save_path, sep="\t")
    done_qids = set(df_partial["qid"])
    print(f"✅ Ripresi {len(done_qids)} risultati da salvataggio parziale.")
    results = [df_partial]
except FileNotFoundError:
    print("🚨 Nessun salvataggio trovato, si parte da zero.")
    done_qids = set()
    results = []

# Only queries that are not in the checkpoint yet; the index is reset so that
# idx + 1 counts the queries processed in this session.
remaining = df[~df["qid"].isin(done_qids)].reset_index(drop=True)
print(f"🧠 Da processare: {len(remaining)} esempi.")

# Number of results appended since the last write to disk.
unsaved = 0
try:
    # NOTE(review): the progress label says MMLU although the dataset loaded
    # in this notebook is GPQA; the label is left as is to match recorded output.
    # The loop variable is named query_row (not `data`) so it does not
    # overwrite the notebook-level `data` DataFrame.
    for idx, query_row in tqdm(remaining.iterrows(), total=len(remaining), desc="🔁 RAG on MMLU"):
        result = set_encoder_llama.transform(pd.DataFrame([query_row]))

        result_merged = result.merge(df[["qid", "gold_answer"]], on="qid", how="left")
        results.append(result_merged)
        unsaved += 1

        # Intermediate checkpoint every `save_every` queries and after the last one.
        if (idx + 1) % save_every == 0 or (idx + 1) == len(remaining):
            df_save = _write_checkpoint(results)
            results = [df_save]
            unsaved = 0
finally:
    # If the loop is interrupted or a request fails, keep what was generated
    # since the last checkpoint instead of losing it.
    if unsaved:
        df_save = _write_checkpoint(results)
        results = [df_save]

🚨 Nessun salvataggio trovato, si parte da zero.
🧠 Da processare: 198 esempi.


🔁 RAG on MMLU:  51%|█████     | 100/198 [03:54<01:55,  1.17s/it]

💾 Salvati 100 risultati su ../data/processed/rag/ea_gpqa_rag_output_cut_5.tsv


🔁 RAG on MMLU:  84%|████████▍ | 166/198 [07:10<02:14,  4.20s/it]

In [34]:
# Evaluate the generation pipeline with F1 and EM against the gold answers,
# sending the queries through in batches of 25.
pt.Experiment(
    [set_encoder_llama],
    df[['qid', 'query']], # NB: every question in df is evaluated (there is no .head() subsetting here)
    df[['qid', 'gold_answer']],
    [ptr.measures.F1, ptr.measures.EM],
    batch_size=25,
    verbose=True,
    names=['set encoder'],
)

pt.Experiment: 100%|██████████| 8/8 [05:35<00:00, 41.99s/batches]


Unnamed: 0,name,F1,EM
0,set encoder,0.127162,0.035354


# Score@3

In [None]:
# name	F1	EM
# set encoder base	0.134688	0.040404

In [None]:
# name	F1	EM
# 0	set encoder ea	0.133429	0.040404

# Score@5

In [None]:
# 	name	F1	EM
# 0	set encoder base	0.128475	0.035354


In [None]:
# name	F1	EM
# 0	set encoder ea	0.127162	0.035354