In [6]:
import os
import pyterrier as pt
import pyterrier_rag as ptr
from datasets import load_dataset
import ir_datasets
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np

In [7]:
# ----------------------------------------------------------------------------
# 2. Caricamento dataset MMLU
# ----------------------------------------------------------------------------
print("üîç Carico MMLU da Hugging Face...")
mmlu = load_dataset("cais/mmlu", "all", split="test")  # HF loader :contentReference[oaicite:7]{index=7}
# Rinomina e prepara DataFrame
#df_mmlu = pd.DataFrame(mmlu) \
#    .rename(columns={"question": "query", "answer": "gold_answer"}) \
#    .assign(qid=lambda df: df.index.astype(str))


# Costruisci il DataFrame e imposta il nuovo qid = subject + "_" + index
df_mmlu = (
    pd.DataFrame(mmlu)
      # crea qid unendo subject e index
      .assign(qid=lambda df: df["subject"] + "_" + df.index.astype(str))
      # rinomina le colonne per lo script RAG
      .rename(columns={
          "question": "query",
          "answer": "gold_answer"
      })
)

print("üîç Carico ranking esistente...")
df_run_base = pd.read_csv(
    "../data/processed/rag/__setencoder-novelty-base__msmarco-segment-mmlu.tsv", sep="\t",
    names=["qid", "Q0", "doc_id", "rank", "score", "run_name", "text"]
)
df_run_ea = pd.read_csv(
    "../data/processed/rag/__setencoder-novelty-ea__msmarco-segment-mmlu.tsv", sep="\t",
    names=["qid", "Q0", "doc_id", "rank", "score", "run_name", "text"]
)

üîç Carico MMLU da Hugging Face...
üîç Carico ranking esistente...


In [None]:
# Open the 'msmarco-segment-v2.1' corpus through ir_datasets and, separately,
# through PyTerrier's "irds:" dataset wrapper (pt_dataset is not referenced by
# any other visible cell).
dataset = ir_datasets.load('msmarco-segment-v2.1')
pt_dataset = pt.get_dataset("irds:msmarco-segment-v2.1")
total_docs = dataset.docs_count() 
# Materialise every document into a Python list; tqdm reports progress against
# total_docs.
# NOTE(review): this keeps the whole corpus in memory - the recorded progress
# bar shows a total of 113,520,750 documents.
all_docs = list(tqdm(dataset.docs_iter(), total=total_docs))

  1%|          | 897996/113520750 [00:33<1:00:15, 31147.07it/s]

In [None]:
# Index the loaded documents by doc_id for direct lookup of their content.
# If a doc_id occurs more than once in all_docs, the last occurrence wins.
doc_dict = {doc.doc_id: doc for doc in all_docs}

In [None]:
from pyterrier_rag.backend import OpenAIBackend
from pyterrier_rag.prompt import Concatenator
from pyterrier_rag.readers import Reader
from pyterrier_rag.prompt import PromptTransformer, prompt
from fastchat.model import get_conversation_template

In [None]:
# Model identifier handed to OpenAIBackend when the backend is created.
# The commented-out line is an alternative model choice.
#model_name = "llama-3-8b-instruct"
model_name = "llama-3.3-70b-instruct"

In [None]:
# System message for the reader model: answer from the supplied context only
# and output a single letter.
# NOTE(review): the literal is not dedented, so the leading spaces of the
# continuation lines are part of the string sent to the model.
system_message = r"""You are an expert Q&A system that is trusted around the world. 
        Always answer the query using the provided context information,
        and not prior knowledge.

        Some rules to follow:
        1. Never directly reference the given context in your answer
        2. Avoid statements like 'Based on the context, ...' or 
        'The context information ...' or anything along those lines.
        3. Output must be a single uppercase letter: A, B, C, or D ‚Äî nothing else."""
# Instruction template with {{ qcontext }}, {{ query }} and {{ choices }}
# placeholders; presumably filled from the fields listed in input_fields
# below - TODO confirm against PromptTransformer.
prompt_text = """Context information is below.
            ---------------------
            {{ qcontext }}
            ---------------------
            Given the context information and a multiple-choice question, choose the correct answer.
            
            Query: {{ query }}

            Choices: {{ choices }}

            Answer with only the letter ["A", "B", "C", "D"] corresponding to the correct choice, with no mention of "". Do not include any explanation or additional text.

            Answer: """

# Conversation template fetched from fastchat by name.
template = get_conversation_template("meta-llama-3.1-sp")
# NOTE(review): this assignment rebinds the name `prompt`, which the import
# cell also imports from pyterrier_rag.prompt; after this line `prompt` refers
# to the PromptTransformer instance.
prompt = PromptTransformer(
    conversation_template=template,
    system_message=system_message,
    instruction=prompt_text,
    input_fields=["query", "qcontext", 'choices'],
    api_type="openai"
)

In [None]:
def get_rank(df_queries):
    """Attach the baseline ranking to every query row.

    Left-joins ``df_queries`` with the module-level ``df_run_base`` on ``qid``.
    Queries without a ranked document are kept, with missing values in the
    ranking columns.
    """
    return pd.merge(df_queries, df_run_base, on="qid", how="left")
# Wrap get_rank with pt.apply.generic so it can be used as a pipeline stage
# (it only appears in a commented-out pipeline variant in the visible cells).
get_rank_pipe = pt.apply.generic(get_rank)

def get_rank_common_docs(df_queries, k=5):
    """Build a k-document context per query from the documents both runs share.

    For every row of ``df_queries`` the top-``k`` documents (lowest ``rank``)
    of the module-level ``df_run_base`` and ``df_run_ea`` are intersected:

    * Non-empty intersection: the shared documents are kept in EA rank order
      and padded up to ``k`` with documents drawn uniformly at random from
      ``all_docs`` that are in neither top-``k`` list and were not already
      drawn for this query.
    * Empty intersection: the EA top-``k`` is returned as is, in rank order.

    Ordering is derived from the EA ranking rather than from set iteration, so
    the assigned ``rank`` values are reproducible between runs.

    Parameters
    ----------
    df_queries : pandas.DataFrame
        Needs the columns ``qid``, ``query`` and ``choices``.
    k : int, default 5
        Number of top documents compared per run and target context size.

    Returns
    -------
    pandas.DataFrame
        One row per (query, document) with the columns ``query``, ``qid``,
        ``doc_id``, ``rank`` (0-based), ``score`` (``1.0 - 0.01 * rank``),
        ``text`` and ``choices``. An empty input yields an empty frame that
        still carries these columns.

    Raises
    ------
    KeyError
        If a selected ``doc_id`` is not present in ``doc_dict``.

    Notes
    -----
    Sampling uses the global ``np.random`` state; seed it for reproducible
    padding. The padding loop assumes ``all_docs`` holds enough documents
    outside both top-``k`` lists.
    """
    columns = ["query", "qid", "doc_id", "rank", "score", "text", "choices"]
    rows = []

    for _, row in df_queries.iterrows():
        qid = row["qid"]
        top_base = df_run_base[df_run_base["qid"] == qid].sort_values("rank").head(k)["doc_id"]
        top_ea = df_run_ea[df_run_ea["qid"] == qid].sort_values("rank").head(k)["doc_id"]

        docs_base = set(top_base)
        # dict.fromkeys removes duplicates while keeping the rank order.
        ranked_ea = list(dict.fromkeys(top_ea))

        common_docs = [doc_id for doc_id in ranked_ea if doc_id in docs_base]

        if not common_docs:
            final_docs = ranked_ea
        else:
            n_missing = k - len(common_docs)
            # Ids that may not be used as padding; grows as padding is drawn so
            # the same random document is never added twice.
            excluded = docs_base.union(ranked_ea)

            random_docs = []
            while len(random_docs) < n_missing:
                random_doc = all_docs[np.random.randint(0, len(all_docs))]
                if random_doc.doc_id not in excluded:
                    excluded.add(random_doc.doc_id)
                    random_docs.append(random_doc.doc_id)

            final_docs = common_docs + random_docs

        for rank, doc_id in enumerate(final_docs):
            rows.append({
                "query": row["query"],
                "qid": qid,
                "doc_id": doc_id,
                "rank": rank,
                "score": 1.0 - 0.01 * rank,  # dummy, strictly decreasing score
                # Direct indexing: a missing id raises KeyError naming the id.
                "text": doc_dict[doc_id].segment,
                "choices": row["choices"],
            })

    return pd.DataFrame(rows, columns=columns)

# Wrap get_rank_common_docs with pt.apply.generic so it can be used as a
# pipeline stage; the function is called with its default k.
get_rank_commondocs_pipe = pt.apply.generic(get_rank_common_docs)

def get_top1_plus_random(df_queries, k=3):
    """Build a context of the top-1 EA document plus random corpus documents.

    For every row of ``df_queries`` the best-ranked document (lowest ``rank``)
    of the module-level ``df_run_ea`` is taken, then documents drawn uniformly
    at random from ``all_docs`` are appended, skipping ids already selected,
    until the context holds ``k`` documents.

    Parameters
    ----------
    df_queries : pandas.DataFrame
        Needs the columns ``qid``, ``query`` and ``choices``.
    k : int, default 3
        Total number of documents per query (1 ranked + ``k - 1`` random).

    Returns
    -------
    pandas.DataFrame
        One row per (query, document) with the columns ``query``, ``qid``,
        ``doc_id``, ``rank`` (0-based, top-1 first), ``score``
        (``1.0 - 0.01 * rank``), ``text`` and ``choices``. An empty input
        yields an empty frame that still carries these columns.

    Raises
    ------
    IndexError
        If ``df_run_ea`` has no row for a query's ``qid``.
    KeyError
        If a selected ``doc_id`` is not present in ``doc_dict``.

    Notes
    -----
    Sampling uses the global ``np.random`` state; seed it for reproducible
    contexts.
    """
    columns = ["query", "qid", "doc_id", "rank", "score", "text", "choices"]
    rows = []

    for _, row in df_queries.iterrows():
        qid = row["qid"]

        # 1. Best-ranked document for this query.
        ranked = df_run_ea[df_run_ea["qid"] == qid].sort_values("rank")
        if ranked.empty:
            # Same exception type as the bare .iloc[0] would raise, but with a
            # message that identifies the offending query.
            raise IndexError(f"no ranked documents for qid {qid!r} in df_run_ea")
        final_docs = [ranked.iloc[0]["doc_id"]]

        # 2. Pad with random corpus documents, never repeating an id.
        while len(final_docs) < k:
            random_doc = all_docs[np.random.randint(0, len(all_docs))]
            if random_doc.doc_id not in final_docs:
                final_docs.append(random_doc.doc_id)

        # 3. Emit one row per selected document.
        for rank, doc_id in enumerate(final_docs):
            rows.append({
                "query": row["query"],
                "qid": qid,
                "doc_id": doc_id,
                "rank": rank,
                "score": 1.0 - 0.01 * rank,  # dummy, strictly decreasing score
                # Direct indexing: a missing id raises KeyError naming the id.
                "text": doc_dict[doc_id].segment,
                "choices": row["choices"],
            })

    return pd.DataFrame(rows, columns=columns)

# Wrap get_top1_plus_random with pt.apply.generic so it can be used as the
# first stage of the RAG pipeline; the function is called with its default k.
get_rank_top1_plus_random_pipe = pt.apply.generic(get_top1_plus_random)

In [None]:
from transformers import AutoTokenizer

# Tokenizer that is later passed to the Concatenator (presumably used there to
# enforce its token limits - TODO confirm). The commented-out line is the
# tokenizer for the alternative 8B model.
#tokenizer = AutoTokenizer.from_pretrained("casperhansen/llama-3-8b-instruct-awq")
tokenizer = AutoTokenizer.from_pretrained("casperhansen/llama-3.3-70b-instruct-awq")

# Generation settings forwarded to the backend. max_tokens=1 matches the
# single-letter answer format requested by the prompt.
generation_args={
    "temperature": 0.01,
    "max_tokens": 1,
}

# OpenAI-compatible backend pointed at a custom base_url; the same class could
# equally target a real OpenAI model. The API key is read from the
# IDA_LLM_API_KEY environment variable (KeyError if it is not set).
llama = OpenAIBackend(model_name, 
                      api_key=os.environ['IDA_LLM_API_KEY'],
                      generation_args=generation_args,
                      base_url="http://api.llm.apps.os.dcs.gla.ac.uk/v1", 
                      verbose=True, 
                      parallel=64)

In [None]:
# Reader that combines the backend with the prompt transformer defined earlier.
llama_reader = Reader(llama, prompt=prompt)
# Three pipeline variants, one active at a time. Each one is
#   <context selection> % <cutoff> >> Concatenator(...) >> llama_reader
# and they differ in the selection stage, the cutoff and max_per_context.
#set_encoder_llama = get_rank_pipe % 3 >> Concatenator(tokenizer=tokenizer, max_length=8191,max_per_context=819,additional_fields=["choices"]) >> llama_reader
#set_encoder_llama = get_rank_commondocs_pipe % 5 >> Concatenator(tokenizer=tokenizer, max_length=8191,max_per_context=2730,additional_fields=["choices"]) >> llama_reader
# Active variant: top-1 EA document plus random documents, cut at 3.
set_encoder_llama = get_rank_top1_plus_random_pipe % 3 >> Concatenator(tokenizer=tokenizer, max_length=8191,max_per_context=2730,additional_fields=["choices"]) >> llama_reader

In [None]:
#print("‚öôÔ∏è Esecuzione pipeline RAG su MMLU‚Ä¶")
#results = set_encoder_llama.transform(df_mmlu.head(1000))

from tqdm import tqdm

# Resumable RAG run over MMLU.
# Questions are sent through the pipeline in batches of `save_every` rows and
# the accumulated output is written to `partial_save_path` after every batch.
# Passing a whole batch (instead of a one-row frame per call) lets the backend
# work on many queries per call, which is what its `parallel` setting is for.
save_every = 100
partial_save_path = "../data/processed/rag/ea_top1plusrandom_mmlu_rag_output_cut_3.tsv"

# Resume from an earlier checkpoint if one exists.
try:
    df_partial = pd.read_csv(partial_save_path, sep="\t")
except FileNotFoundError:
    print("üö® Nessun salvataggio trovato, si parte da zero.")
    done_qids = set()
    results = []
else:
    done_qids = set(df_partial["qid"])
    print(f"‚úÖ Ripresi {len(done_qids)} risultati da salvataggio parziale.")
    results = [df_partial]

# Only questions whose qid is not in the checkpoint still need processing.
remaining = df_mmlu[~df_mmlu["qid"].isin(done_qids)].reset_index(drop=True)
print(f"üß† Da processare: {len(remaining)} esempi.")

gold_lookup = df_mmlu[["qid", "gold_answer"]]

with tqdm(total=len(remaining), desc="üîÅ RAG on MMLU") as progress:
    for start in range(0, len(remaining), save_every):
        batch = remaining.iloc[start:start + save_every]
        result = set_encoder_llama.transform(batch)

        # Attach the gold label so the checkpoint file is self-contained.
        results.append(result.merge(gold_lookup, on="qid", how="left"))

        # Checkpoint: rewrite the file with everything produced so far and
        # collapse `results` to a single frame to keep the list short.
        df_save = pd.concat(results, ignore_index=True)
        df_save.to_csv(partial_save_path, sep="\t", index=False)
        print(f"üíæ Salvati {len(df_save)} risultati su {partial_save_path}")
        results = [df_save]

        progress.update(len(batch))

In [None]:
# ----------------------------------------------------------------------------
# 6. Valutazione
# ----------------------------------------------------------------------------

# Collapse the list of partial result frames produced by the RAG loop into a
# single frame (no-op when the cell is re-run and `results` is already one).
if isinstance(results, list):
    results = pd.concat(results, ignore_index=True)

# Convert numeric gold labels (0-3) to letters. Values that are not keys of
# the mapping - in particular letters left by an earlier run of this cell -
# are passed through unchanged, so re-running the cell does not turn the
# column into NaN the way a plain `Series.map(dict)` would.
index_to_choice = {0: "A", 1: "B", 2: "C", 3: "D"}
results["gold_answer"] = results["gold_answer"].map(
    lambda label: index_to_choice.get(label, label)
)

def evaluate(preds, golds):
    """Return the accuracy of ``preds`` against ``golds``.

    Both sequences are compared as strings after trimming surrounding
    whitespace and lower-casing, then scored with ``accuracy_score``.
    """
    def _canon(value):
        return str(value).strip().lower()

    norm_preds = list(map(_canon, preds))
    norm_golds = list(map(_canon, golds))
    return accuracy_score(norm_golds, norm_preds)

# Overall accuracy of the generated answers (qanswer) against the gold letters.
acc = evaluate(results["qanswer"], results["gold_answer"])
print(f"\nüìä MMLU Accuracy (EM): {acc:.4f}")

In [None]:
# Recover the subject from the qid by dropping everything from its last "_" on.
results["subject"] = results["qid"].map(lambda qid: qid.rpartition("_")[0])

# One record per subject, in alphabetical subject order, holding the number of
# questions and the accuracy on them.
subject_stats = [
    {
        "subject": name,
        "num_questions": len(group),
        "accuracy": evaluate(group["qanswer"], group["gold_answer"]),
    }
    for name, group in results.groupby("subject")
]

# Tabulate, best subject first.
subject_df = pd.DataFrame(subject_stats)
subject_df = subject_df.sort_values(by="accuracy", ascending=False)
subject_df = subject_df.reset_index(drop=True)

# Display the table as the cell's output.
print("\nüìä Accuracy per subject:")
subject_df

---

In [4]:
# Statistical significance test (McNemar) comparing the BASE and EA runs

import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar

# 1) Load the RAG outputs of the two runs (tab-separated, with a header row).
# NOTE(review): the names df_run_base / df_run_ea are also assigned by the
# cell that loads the ranking TSVs. If both cells share a kernel, running this
# one rebinds them to the RAG output frames - confirm this is intended.
df_run_base = pd.read_csv("../data/processed/rag/base_mmlu_rag_output_cut_5.tsv", sep="\t")
df_run_ea   = pd.read_csv("../data/processed/rag/ea_mmlu_rag_output_cut_5.tsv",   sep="\t")

# 2) Normalise the answer columns and choose the alignment key
def norm_ans(x):
    """Normalise an answer to an upper-case string.

    The value is stringified, stripped and upper-cased. If the result is a
    digit string equal to 0, 1, 2 or 3 it is replaced by "A", "B", "C" or "D";
    anything else is returned as the cleaned string.
    """
    letters = {0: "A", 1: "B", 2: "C", 3: "D"}
    text = str(x).strip().upper()
    if not text.isdigit():
        return text
    return letters.get(int(text), text)

# Bring predictions and gold labels of both runs to the same letter encoding.
for run_df in (df_run_base, df_run_ea):
    run_df["qanswer"] = run_df["qanswer"].map(norm_ans)
    run_df["gold_answer"] = run_df["gold_answer"].map(norm_ans)

# Pick the first identifier column that both runs share. "qid" comes first
# because it is the identifier this notebook writes into its RAG output files;
# without it the pairing would silently fall back to row position, which is
# only correct when both files list the questions in the same order.
key = next(
    (
        col
        for col in ("qid", "question_id", "question")
        if col in df_run_base.columns and col in df_run_ea.columns
    ),
    None,
)
if key is None:
    # Last resort: align by row position.
    df_run_base = df_run_base.reset_index().rename(columns={"index": "row_id"})
    df_run_ea = df_run_ea.reset_index().rename(columns={"index": "row_id"})
    key = "row_id"

# Pair the two runs question by question. McNemar's test needs exactly one
# BASE and one EA prediction per question, so duplicate keys raise a
# MergeError instead of silently multiplying rows.
df = df_run_base[[key, "qanswer", "gold_answer"]].merge(
    df_run_ea[[key, "qanswer"]],
    on=key,
    suffixes=("_base", "_ea"),
    how="inner",
    validate="one_to_one",
)

# 3) Per-question correctness of each run, and the resulting accuracies
base_ok = df["qanswer_base"].eq(df["gold_answer"])
ea_ok = df["qanswer_ea"].eq(df["gold_answer"])

acc_base = base_ok.mean()
acc_ea = ea_ok.mean()
delta = acc_ea - acc_base
print(f"Accuracy  BASE: {acc_base:.4f} | EA: {acc_ea:.4f} | Œî (EA-BASE): {delta:.4f}")

# 4) 2x2 contingency table for McNemar
base_wrong = ~base_ok
ea_wrong = ~ea_ok

a = int((base_ok & ea_ok).sum())        # both correct
b = int((base_ok & ea_wrong).sum())     # only BASE correct
c = int((base_wrong & ea_ok).sum())     # only EA correct
d = int((base_wrong & ea_wrong).sum())  # both wrong
table = [[a, b], [c, d]]
print(f"Contingency: [[a={a}, b={b}], [c={c}, d={d}]]")

# 5) McNemar: exact test when there are few discordant pairs (b + c <= 25),
#    otherwise the chi-square version with continuity correction
use_exact = (b + c) <= 25
if use_exact:
    res = mcnemar(table, exact=True, correction=False)
    variant = 'exact'
else:
    res = mcnemar(table, exact=False, correction=True)
    variant = 'chi2+cc'
print(f"McNemar p-value: {res.pvalue:.6g}  ({variant})")


Accuracy  BASE: 0.7716 | EA: 0.7702 | Œî (EA-BASE): -0.0014
Contingency: [[a=10421, b=414], [c=394, d=2813]]
McNemar p-value: 0.503867  (chi2+cc)
