In [1]:
import os
import pyterrier as pt
import pyterrier_rag as ptr
from datasets import load_dataset
import ir_datasets
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np

In [2]:
# ----------------------------------------------------------------------------
# 2. Caricamento dataset MMLU
# ----------------------------------------------------------------------------
print("🔍 Carico MMLU da Hugging Face...")
mmlu = load_dataset("cais/mmlu", "all", split="test")  # HF loader :contentReference[oaicite:7]{index=7}
# Rinomina e prepara DataFrame
#df_mmlu = pd.DataFrame(mmlu) \
#    .rename(columns={"question": "query", "answer": "gold_answer"}) \
#    .assign(qid=lambda df: df.index.astype(str))


# Costruisci il DataFrame e imposta il nuovo qid = subject + "_" + index
df_mmlu = (
    pd.DataFrame(mmlu)
      # crea qid unendo subject e index
      .assign(qid=lambda df: df["subject"] + "_" + df.index.astype(str))
      # rinomina le colonne per lo script RAG
      .rename(columns={
          "question": "query",
          "answer": "gold_answer"
      })
)

print("🔍 Carico ranking esistente...")
df_run_base = pd.read_csv(
    "../data/processed/rag/__setencoder-novelty-base__msmarco-segment-mmlu.tsv", sep="\t",
    names=["qid", "Q0", "doc_id", "rank", "score", "run_name", "text"]
)
df_run_ea = pd.read_csv(
    "../data/processed/rag/__setencoder-novelty-ea__msmarco-segment-mmlu.tsv", sep="\t",
    names=["qid", "Q0", "doc_id", "rank", "score", "run_name", "text"]
)

🔍 Carico MMLU da Hugging Face...
🔍 Carico ranking esistente...


In [3]:
dataset = ir_datasets.load('msmarco-segment-v2.1')
pt_dataset = pt.get_dataset("irds:msmarco-segment-v2.1")
total_docs = dataset.docs_count() 
all_docs = list(tqdm(dataset.docs_iter(), total=total_docs))

100%|██████████| 113520750/113520750 [43:27<00:00, 43539.88it/s] 


In [4]:
doc_dict = {doc.doc_id: doc for doc in all_docs}

In [5]:
from pyterrier_rag.backend import OpenAIBackend
from pyterrier_rag.prompt import Concatenator
from pyterrier_rag.readers import Reader
from pyterrier_rag.prompt import PromptTransformer, prompt
from fastchat.model import get_conversation_template

In [6]:
#model_name = "llama-3-8b-instruct"
model_name = "llama-3.3-70b-instruct"

In [7]:
system_message = r"""You are an expert Q&A system that is trusted around the world. 
        Always answer the query using the provided context information,
        and not prior knowledge.

        Some rules to follow:
        1. Never directly reference the given context in your answer
        2. Avoid statements like 'Based on the context, ...' or 
        'The context information ...' or anything along those lines.
        3. Output must be a single uppercase letter: A, B, C, or D — nothing else."""
prompt_text = """Context information is below.
            ---------------------
            {{ qcontext }}
            ---------------------
            Given the context information and a multiple-choice question, choose the correct answer.
            
            Query: {{ query }}

            Choices: {{ choices }}

            Answer with only the letter ["A", "B", "C", "D"] corresponding to the correct choice, with no mention of "". Do not include any explanation or additional text.

            Answer: """

template = get_conversation_template("meta-llama-3.1-sp")
prompt = PromptTransformer(
    conversation_template=template,
    system_message=system_message,
    instruction=prompt_text,
    input_fields=["query", "qcontext", 'choices'],
    api_type="openai"
)

In [8]:
def get_rank(df_queries):
    run = df_queries.merge(df_run_base, on="qid", how="left")
    return run
get_rank_pipe = pt.apply.generic(get_rank)

def get_rank_common_docs(df_queries, k=5):
    rows = []

    for _, row in df_queries.iterrows():
        qid = row["qid"]
        docs_base = set(df_run_base[df_run_base["qid"] == qid].sort_values("rank").head(k)["doc_id"])
        docs_ea = set(df_run_ea[df_run_ea["qid"] == qid].sort_values("rank").head(k)["doc_id"])

        common_docs = docs_base & docs_ea

        if len(common_docs) == 0:
            final_docs = list(docs_ea)
        else:
            n_missing = k - len(common_docs)
    
            random_docs = []
            while len(random_docs) < n_missing:
                random_doc = all_docs[np.random.randint(0, len(all_docs))]
                if random_doc.doc_id not in docs_base and random_doc.doc_id not in docs_ea:
                    print(random_doc.doc_id)
                    random_docs.append(random_doc.doc_id)
    
            final_docs = list(common_docs) + random_docs
        
        for rank, doc_id in enumerate(final_docs):
            rows.append({
                "query": row['query'],
                "qid": qid,
                "doc_id": doc_id,
                "rank": rank,
                "score": 1.0 - 0.01 * rank,  # dummy score
                "text": doc_dict.get(doc_id).segment, 
                "choices": row['choices']
            })

    return pd.DataFrame(rows)

get_rank_commondocs_pipe = pt.apply.generic(get_rank_common_docs)

In [9]:
from transformers import AutoTokenizer

#tokenizer = AutoTokenizer.from_pretrained("casperhansen/llama-3-8b-instruct-awq")
tokenizer = AutoTokenizer.from_pretrained("casperhansen/llama-3.3-70b-instruct-awq")

generation_args={
    "temperature": 0.01,
    "max_tokens": 1,
}

# this could equally be a real OpenAI models
llama = OpenAIBackend(model_name, 
                      api_key=os.environ['IDA_LLM_API_KEY'],
                      generation_args=generation_args,
                      base_url="http://api.llm.apps.os.dcs.gla.ac.uk/v1", 
                      verbose=True, 
                      parallel=64)

In [10]:
llama_reader = Reader(llama, prompt=prompt)
#set_encoder_llama = get_rank_pipe % 3 >> Concatenator(tokenizer=tokenizer, max_length=8191,max_per_context=819,additional_fields=["choices"]) >> llama_reader
set_encoder_llama = get_rank_commondocs_pipe % 5 >> Concatenator(tokenizer=tokenizer, max_length=8191,max_per_context=2730,additional_fields=["choices"]) >> llama_reader

In [11]:
#print("⚙️ Esecuzione pipeline RAG su MMLU…")
#results = set_encoder_llama.transform(df_mmlu.head(1000))

from tqdm import tqdm

save_every=100
partial_save_path="../data/processed/rag/commondocs_mmlu_rag_output_cut_5.tsv"

try:
    df_partial = pd.read_csv(partial_save_path, sep="\t")
    done_qids = set(df_partial["qid"])
    print(f"✅ Ripresi {len(done_qids)} risultati da salvataggio parziale.")
    results = [df_partial]
except FileNotFoundError:
    print("🚨 Nessun salvataggio trovato, si parte da zero.")
    done_qids = set()
    results = []

remaining = df_mmlu[~df_mmlu["qid"].isin(done_qids)].reset_index(drop=True)
print(f"🧠 Da processare: {len(remaining)} esempi.")
    
for row in tqdm(remaining.iterrows(), total=len(remaining), desc="🔁 RAG on MMLU"):
    idx, data = row
    result = set_encoder_llama.transform(pd.DataFrame([data]))

    result_merged = result.merge(df_mmlu[["qid", "gold_answer"]], on="qid", how="left")
    results.append(result_merged)

    # ✅ Salvataggio intermedio
    if (idx + 1) % save_every == 0 or (idx + 1) == len(remaining):
        df_save = pd.concat(results, ignore_index=True)
        df_save.to_csv(partial_save_path, sep="\t", index=False)
        print(f"💾 Salvati {len(df_save)} risultati su {partial_save_path}")
        results = [df_save]

🚨 Nessun salvataggio trovato, si parte da zero.
🧠 Da processare: 14042 esempi.


🔁 RAG on MMLU:   0%|          | 1/14042 [00:51<201:08:34, 51.57s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}
msmarco_v2.1_doc_23_784101806#11_1731332762
msmarco_v2.1_doc_50_2286475756#1_3087351484


🔁 RAG on MMLU:   0%|          | 2/14042 [01:42<198:45:31, 50.96s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}
msmarco_v2.1_doc_17_851997906#11_937656270


🔁 RAG on MMLU:   0%|          | 3/14042 [02:29<191:32:43, 49.12s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}
msmarco_v2.1_doc_14_172303228#14_393715007


🔁 RAG on MMLU:   0%|          | 4/14042 [03:19<192:52:14, 49.46s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}
msmarco_v2.1_doc_31_1421966689#8_2886213402
msmarco_v2.1_doc_34_893571898#3_1837576901


🔁 RAG on MMLU:   0%|          | 5/14042 [04:08<192:32:59, 49.38s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}
msmarco_v2.1_doc_57_250153360#10_542239871
msmarco_v2.1_doc_21_1495833963#8_3276351120
msmarco_v2.1_doc_13_1085699448#2_2409530484
msmarco_v2.1_doc_32_547801233#3_1043329466


🔁 RAG on MMLU:   0%|          | 6/14042 [04:54<188:17:05, 48.29s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}


🔁 RAG on MMLU:   0%|          | 7/14042 [05:41<187:01:14, 47.97s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}


🔁 RAG on MMLU:   0%|          | 8/14042 [06:31<188:43:10, 48.41s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}


🔁 RAG on MMLU:   0%|          | 9/14042 [07:18<187:46:21, 48.17s/it]

Error code: 503 - {'error': 'llama-3.3-70b-instruct is starting up. Please wait a minute and try again.'}


🔁 RAG on MMLU:   0%|          | 10/14042 [07:56<175:11:55, 44.95s/it]

msmarco_v2.1_doc_11_1296760589#2_2705118181


🔁 RAG on MMLU:   0%|          | 11/14042 [07:58<123:50:26, 31.77s/it]

msmarco_v2.1_doc_05_1445117423#9_2773625560


🔁 RAG on MMLU:   0%|          | 11/14042 [07:58<169:38:32, 43.53s/it]

Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 8192 tokens. However, you requested 8385 tokens (8384 in the messages, 1 in the completion). Please reduce the length of the messages or completion. None", 'type': 'BadRequestError', 'param': None, 'code': 400}





AttributeError: 'dict' object has no attribute 'num_responses'

In [None]:
# ----------------------------------------------------------------------------
# 6. Valutazione
# ----------------------------------------------------------------------------

if isinstance(results, list):
    results = pd.concat(results, ignore_index=True)
    
# 🔁 Converte i valori numerici in lettere
index_to_choice = {0: "A", 1: "B", 2: "C", 3: "D"}
results["gold_answer"] = results["gold_answer"].map(index_to_choice)

def evaluate(preds, golds):
    preds = [str(p).strip().lower() for p in preds]
    golds = [str(g).strip().lower() for g in golds]
    return accuracy_score(golds, preds)

acc = evaluate(results["qanswer"], results["gold_answer"])
print(f"\n📊 MMLU Accuracy (EM): {acc:.4f}")

In [None]:
# Estrai il subject da qid
results["subject"] = results["qid"].apply(lambda x: "_".join(x.split("_")[:-1]))

# Calcola accuracy per subject e raccoglila in una lista di dizionari
subject_stats = []

for subject in sorted(results["subject"].unique()):
    sub_df = results[results["subject"] == subject]
    acc_sub = evaluate(sub_df["qanswer"], sub_df["gold_answer"])
    subject_stats.append({
        "subject": subject,
        "num_questions": len(sub_df),
        "accuracy": acc_sub
    })

# Crea un DataFrame
subject_df = pd.DataFrame(subject_stats).sort_values(by="accuracy", ascending=False).reset_index(drop=True)

# Mostra
print("\n📊 Accuracy per subject:")
subject_df