# Nugget creation and scoring

In [1]:
from pyterrier_rag.backend import OpenAIBackend
from transformers import AutoTokenizer
import os

# Model requested from the endpoint configured below.
# Alternative: model_name = "qwen-2.5-72b-instruct"
model_name = "llama-3.3-70b-instruct"

# Tokenizer checkpoint, presumably chosen to match the selected model.
# Alternative: tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-72B-Instruct-AWQ")
tokenizer = AutoTokenizer.from_pretrained("casperhansen/llama-3.3-70b-instruct-awq")

# Generation settings handed to the backend as `generation_args`.
generation_args = dict(temperature=0.6, max_tokens=256)

# The same backend class could equally point at a real OpenAI model.
# NOTE(review): base_url is plain http, so the API key presumably travels
# unencrypted — confirm this is acceptable on this network.
backend = OpenAIBackend(
    model_name,
    base_url="http://api.llm.apps.os.dcs.gla.ac.uk/v1",
    api_key=os.environ["IDA_LLM_API_KEY"],  # KeyError if the variable is unset
    generation_args=generation_args,
    parallel=64,
    verbose=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

def save_csv(path, content):
    """Write ``content`` to ``path`` as CSV, leaving the index out of the file.

    Args:
        path: Destination handed straight to ``to_csv``.
        content: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
    """
    content.to_csv(path_or_buf=path, index=False)

def load_csv(path):
    """Read the CSV at ``path``, or return ``None`` when reading fails.

    Every ``Exception`` raised by ``pd.read_csv`` (missing file, empty file,
    parse error, ...) is swallowed on purpose: callers in this notebook treat
    ``None`` as a cache miss and recompute the data.

    Args:
        path: Location handed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, or ``None`` if it could not be read.
    """
    try:
        return pd.read_csv(path)
    except Exception:
        return None

In [3]:
from open_nuggetizer.nuggetizer import Nuggetizer
from open_nuggetizer._types import NuggetAssignMode
from fastchat.model import get_conversation_template

# Conversation template looked up by name; the same object is reused when the
# RAG prompt is built further down.
conv_template = get_conversation_template("meta-llama-3.1-sp")

# Nuggetizer running on the shared LLM backend with 3-grade support assignment.
nuggetizer = Nuggetizer(
    assigner_mode=NuggetAssignMode.SUPPORT_GRADE_3,
    backend=backend,
    conversation_template=conv_template,
    verbose=True,
)

# Nugget creation is disabled; the commented code is kept for reference only.
# NOTE(review): neither `nuggets` nor `baseline` is defined in any visible cell,
# so the scoring fallback below raises NameError on a cache miss unless another
# cell provides them — confirm before relying on it.
#nuggets = load_csv("../data/processed/rag/nuggets.jsonl")
#if nuggets is None:
#    nuggets = nuggetizer.create(baseline)
#    save_csv("nuggets.csv", nuggets)

# Load the cached scored nuggets. On a miss, score them and write the cache to
# the SAME path it is read from; previously the file was saved to the working
# directory and therefore never found again on the next run.
SCORED_NUGGETS_PATH = "../data/processed/rag/scored_nuggets.csv"
scored_nuggets = load_csv(SCORED_NUGGETS_PATH)
if scored_nuggets is None:
    scored_nuggets = nuggetizer.score(nuggets)
    save_csv(SCORED_NUGGETS_PATH, scored_nuggets)

# Evaluation

In [35]:
import pyterrier as pt

print("🔍 Carico ranking esistente...")

# Both run files are tab-separated without a header row; the six column names
# are supplied explicitly.
df_run_base = pd.read_csv(
    "../data/processed/rag/__setencoder-novelty-base__msmarco-segment-trecrag24.tsv",
    names=["qid", "Q0", "docno", "rank", "score", "run_name"],
    sep="\t",
)
# NOTE(review): df_run_ea is loaded but not referenced by any visible cell
# (get_rank only reads df_run_base) — presumably swapped in by hand.
df_run_ea = pd.read_csv(
    "../data/processed/rag/__setencoder-novelty-ea__msmarco-segment-trecrag24.tsv",
    names=["qid", "Q0", "docno", "rank", "score", "run_name"],
    sep="\t",
)

# Queries file: tab-separated, no header row, one (qid, query) pair per line.
df_queries = pd.read_csv("../data/raw/rag/trecrag24-queries.tsv", sep="\t", names=["qid", "query"])

# Keep only the queries that have scored nuggets. The id column of
# scored_nuggets is renamed from "query_id" to "qid" only in a later cell, so
# accept either name here to keep a fresh top-to-bottom run working
# (DataFrame.rename ignores a missing "query_id" by default).
nugget_qids = scored_nuggets.rename(columns={"query_id": "qid"})["qid"]
df_queries = df_queries[df_queries["qid"].isin(nugget_qids)]

def get_rank(df_queries):
    """Attach the precomputed baseline ranking to every query.

    Left-joins ``df_queries`` with the module-level ``df_run_base`` on ``qid``,
    so a query without ranking rows is kept with missing ranking columns.

    Args:
        df_queries: DataFrame with at least a ``qid`` column.

    Returns:
        The merged DataFrame.
    """
    return df_queries.merge(df_run_base, how="left", on="qid")


# Wrap the lookup with pt.apply.generic so it can be composed with `>>` below.
get_rank_pipe = pt.apply.generic(get_rank)

def rename_segment(run):
    """Return a copy of ``run`` whose ``segment`` column is called ``text``.

    A frame without a ``segment`` column comes back with its columns unchanged;
    the input frame itself is never modified.
    """
    return run.rename(columns={"segment": "text"})
# Wrap rename_segment with pt.apply.generic so it can be composed with `>>` in the RAG pipeline.
rename_pipe = pt.apply.generic(rename_segment)

🔍 Carico ranking esistente...


In [5]:
import ir_datasets 
# Corpus handle obtained directly from ir_datasets.
# NOTE(review): `dataset` is not referenced by any visible cell — confirm it is still needed.
dataset = ir_datasets.load('msmarco-segment-v2.1')
# Same corpus id requested through PyTerrier; used later by pt.text.get_text to fetch the "segment" field.
pt_dataset = pt.get_dataset("irds:msmarco-segment-v2.1")

In [6]:
import pyterrier_alpha as pta
# Load the prebuilt index artifact via Artifact.from_hf (presumably a PISA index on the Hugging Face Hub, judging by the name).
# NOTE(review): `index` is not referenced by any visible cell — confirm it is still needed.
index = pta.Artifact.from_hf('namawho/msmarco-segment-v2.1.pisa')

In [36]:
from pyterrier_rag.prompt import Concatenator
from pyterrier_rag.readers import Reader
from pyterrier_rag.prompt import PromptTransformer
from jinja2 import Template

def make_callable_template(template: Template):
    """Adapt a jinja2 ``Template`` into a plain keyword-argument callable.

    Args:
        template: The template to render.

    Returns:
        A function accepting keyword arguments only and returning
        ``template.render(**kwargs)``.
    """
    def render_with(**fields):
        # Forward every keyword argument to the template unchanged.
        return template.render(**fields)

    return render_with

# Instruction template filled in for every query.
# NOTE(review): the template reads `{{ context }}`, while the prompt transformer
# below is configured with the input field "qcontext". If the field is passed
# to the template under that name, jinja2 would presumably render the
# undefined `context` as an empty string — confirm against PromptTransformer.
GENERIC_PROMPT = Template(
    "Use the context information to answer the Question: \n Context: {{ context }} \n Question: {{ query }} \n Answer:"
)

# Builds the chat prompt from the instruction template and a fixed system message.
prompt = PromptTransformer(
            instruction=make_callable_template(GENERIC_PROMPT),
            # NOTE(review): "an helpful" is a typo for "a helpful"; left as is because
            # changing it would alter the prompt used for the recorded results.
            system_message="You are an helpful assistant.",
            conversation_template=conv_template,
            input_fields=[
                "qcontext",
                "query",
            ],
        )

# Reader that sends the built prompt to the LLM backend.
reader = Reader(backend, prompt)
# `%` binds tighter than `>>`, so `get_rank_pipe % 10` is evaluated first and the
# remaining stages are chained left to right: fetch the "segment" text, rename
# it to "text", concatenate, then generate the answer.
# NOTE(review): `% 10` is presumably PyTerrier's rank cutoff (keeps rank < 10);
# if the ranks in the TSV are 1-based this keeps only 9 documents — confirm.
rag_pipeline = get_rank_pipe % 10 >>  pt.text.get_text(pt_dataset, "segment") >> rename_pipe >> Concatenator() >> reader

In [37]:
# Run the whole RAG pipeline over the filtered queries, then display the frame.
results = rag_pipeline(df_queries)
results

Unnamed: 0,prompt,qid,query_0,qanswer
0,"[{'role': 'system', 'content': 'You are an hel...",2024-145979,what is vicarious trauma and how can it be cop...,"Vicarious trauma, also known as secondary trau..."
1,"[{'role': 'system', 'content': 'You are an hel...",2024-216592,why disability insurance is a smart investment,Disability insurance is a smart investment for...
2,"[{'role': 'system', 'content': 'You are an hel...",2024-32912,how bad did the vietnam war devastate the econ...,The Vietnam War had a significant impact on th...
3,"[{'role': 'system', 'content': 'You are an hel...",2024-153051,what target stors's policies for shoplifting,"Based on the context, I'll provide an answer r..."
4,"[{'role': 'system', 'content': 'You are an hel...",2024-79081,how taylor swift's age affects her relationships,"Based on the context of Taylor Swift's life, h..."
5,"[{'role': 'system', 'content': 'You are an hel...",2024-42497,how does the informal recycling sector add to ...,The informal recycling sector contributes to a...
6,"[{'role': 'system', 'content': 'You are an hel...",2024-158743,what was happening in germany and netherlands ...,"In the 1840s, several significant events were ..."
7,"[{'role': 'system', 'content': 'You are an hel...",2024-143869,what is scientific evidence for or against the...,There is significant scientific evidence both ...
8,"[{'role': 'system', 'content': 'You are an hel...",2024-43037,how europe solved its russian gas problem,To answer how Europe solved its Russian gas pr...
9,"[{'role': 'system', 'content': 'You are an hel...",2024-43983,how has inclusivity made vogue magazine more p...,Vogue magazine has become more popular due to ...


In [53]:
# Align column names before scoring: "query_id" becomes "qid" in both frames and
# the pipeline's "query_0" column becomes "query". Columns that are absent are
# left alone, so re-running this cell is harmless.
scored_nuggets = scored_nuggets.rename(columns={"query_id": "qid"})
results = results.rename(columns={"query_id": "qid", "query_0": "query"})
# AllScore values, collected in the order the measure yields them.
scores = []

# iter_calc yields elements carrying query_id, measure and value (one per query,
# judging by the printed output); each is printed and its value collected.
for element in nuggetizer.AllScore().iter_calc(scored_nuggets, results):
    print(f"Query ID: {element.query_id}, Measure: {element.measure}, Value: {element.value}")
    scores.append(element.value)

pt.apply.by_query():   0%|          | 0/779 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.45s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.84s/window][A
pt.apply.by_query():   2%|▏         | 16/779 [00:03<02:55,  4.35it/s]

Assignments: [1, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 1, 1, 1, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.32s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.14s/window][A
pt.apply.by_query():   5%|▍         | 36/779 [00:07<02:43,  4.54it/s]

Assignments: [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:02<00:00,  2.09s/window][A
pt.apply.by_query():   6%|▌         | 45/779 [00:10<02:44,  4.47it/s]

Assignments: [2, 0, 2, 0, 0, 2, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.36s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.78s/window][A
pt.apply.by_query():   8%|▊         | 59/779 [00:13<02:49,  4.24it/s]

Assignments: [0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.43s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.80s/window][A
pt.apply.by_query():  10%|▉         | 75/779 [00:17<02:43,  4.31it/s]

Assignments: [2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 1, 0, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.45s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.86s/window][A
pt.apply.by_query():  12%|█▏        | 90/779 [00:20<02:43,  4.22it/s]

Assignments: [0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 2, 1, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.98s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.11s/window][A
pt.apply.by_query():  14%|█▍        | 108/779 [00:25<02:38,  4.23it/s]

Assignments: [0, 0, 0, 1, 0, 2, 0, 0, 2, 2, 2, 1, 2, 0, 0, 0, 1, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.41s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.13s/window][A
pt.apply.by_query():  16%|█▋        | 128/779 [00:29<02:28,  4.39it/s]

Assignments: [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 2, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:02<00:00,  2.30s/window][A
pt.apply.by_query():  18%|█▊        | 138/779 [00:31<02:26,  4.38it/s]

Assignments: [2, 2, 0, 2, 0, 0, 0, 0, 0, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:02<00:00,  2.42s/window][A
pt.apply.by_query():  19%|█▉        | 148/779 [00:34<02:26,  4.32it/s]

Assignments: [0, 0, 0, 0, 0, 1, 0, 0, 0, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.36s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.82s/window][A
pt.apply.by_query():  21%|██        | 163/779 [00:37<02:24,  4.25it/s]

Assignments: [1, 2, 1, 0, 2, 2, 1, 0, 2, 0, 0, 2, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.55s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.98s/window][A
pt.apply.by_query():  23%|██▎       | 179/779 [00:41<02:23,  4.18it/s]

Assignments: [0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 1, 0, 1, 1, 2, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.12s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.73s/window][A
pt.apply.by_query():  25%|██▍       | 192/779 [00:45<02:24,  4.05it/s]

Assignments: [0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 2, 0, 2]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:01<00:00,  1.50s/window][A
pt.apply.by_query():  26%|██▌       | 199/779 [00:46<02:19,  4.15it/s]

Assignments: [2, 2, 2, 2, 0, 0, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:01<00:00,  1.58s/window][A
pt.apply.by_query():  26%|██▋       | 205/779 [00:48<02:20,  4.08it/s]

Assignments: [2, 2, 2, 1, 1, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.13window/s][A
100%|██████████| 2/2 [00:03<00:00,  1.54s/window][A
pt.apply.by_query():  28%|██▊       | 216/779 [00:51<02:24,  3.90it/s]

Assignments: [2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.11s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.72s/window][A
pt.apply.by_query():  29%|██▉       | 229/779 [00:54<02:22,  3.86it/s]

Assignments: [0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 1, 2, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.02s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.70s/window][A
pt.apply.by_query():  31%|███       | 242/779 [00:58<02:19,  3.84it/s]

Assignments: [0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.16s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.80s/window][A
pt.apply.by_query():  33%|███▎      | 255/779 [01:01<02:19,  3.77it/s]

Assignments: [0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.28s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.07s/window][A
pt.apply.by_query():  35%|███▌      | 275/779 [01:05<02:01,  4.15it/s]

Assignments: [2, 2, 0, 0, 1, 0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 1, 2]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:02<00:00,  2.23s/window][A
pt.apply.by_query():  37%|███▋      | 285/779 [01:08<01:57,  4.22it/s]

Assignments: [2, 2, 2, 0, 0, 2, 2, 2, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.17s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.33s/window][A
pt.apply.by_query():  39%|███▉      | 303/779 [01:12<01:56,  4.08it/s]

Assignments: [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.13s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.72s/window][A
pt.apply.by_query():  40%|████      | 315/779 [01:16<01:58,  3.90it/s]

Assignments: [0, 0, 1, 2, 0, 0, 2, 2, 1, 2, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.85s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.13s/window][A
pt.apply.by_query():  43%|████▎     | 332/779 [01:20<01:53,  3.93it/s]

Assignments: [0, 0, 1, 1, 2, 0, 1, 0, 1, 0, 2, 0, 0, 0, 1, 0, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.72s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.05s/window][A
pt.apply.by_query():  45%|████▍     | 349/779 [01:24<01:47,  4.00it/s]

Assignments: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.12window/s][A
100%|██████████| 2/2 [00:03<00:00,  1.64s/window][A
pt.apply.by_query():  46%|████▋     | 361/779 [01:27<01:46,  3.91it/s]

Assignments: [0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.20s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.06s/window][A
pt.apply.by_query():  49%|████▉     | 381/779 [01:32<01:34,  4.21it/s]

Assignments: [0, 1, 0, 0, 1, 2, 1, 1, 0, 2, 2, 0, 0, 2, 0, 1, 1, 1, 1, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.32window/s][A
100%|██████████| 2/2 [00:03<00:00,  1.63s/window][A
pt.apply.by_query():  50%|█████     | 392/779 [01:35<01:37,  3.99it/s]

Assignments: [0, 2, 2, 2, 2, 1, 0, 0, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.17s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.32s/window][A
pt.apply.by_query():  53%|█████▎    | 410/779 [01:39<01:33,  3.95it/s]

Assignments: [0, 0, 2, 1, 0, 0, 2, 2, 2, 2, 0, 1, 1, 2, 1, 0, 2, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.23s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.90s/window][A
pt.apply.by_query():  54%|█████▍    | 423/779 [01:43<01:33,  3.79it/s]

Assignments: [0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.16s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.93s/window][A
pt.apply.by_query():  56%|█████▌    | 436/779 [01:47<01:33,  3.67it/s]

Assignments: [0, 2, 0, 1, 0, 2, 1, 2, 0, 0, 0, 1, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.83s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.13s/window][A
pt.apply.by_query():  58%|█████▊    | 453/779 [01:51<01:26,  3.77it/s]

Assignments: [0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.37s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.27s/window][A
pt.apply.by_query():  61%|██████    | 472/779 [01:56<01:18,  3.91it/s]

Assignments: [0, 2, 1, 0, 2, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.20window/s][A
100%|██████████| 2/2 [00:03<00:00,  1.59s/window][A
pt.apply.by_query():  62%|██████▏   | 484/779 [01:59<01:16,  3.87it/s]

Assignments: [2, 2, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.86s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.05s/window][A
pt.apply.by_query():  64%|██████▍   | 500/779 [02:03<01:11,  3.88it/s]

Assignments: [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.40window/s][A
100%|██████████| 2/2 [00:03<00:00,  1.58s/window][A
pt.apply.by_query():  66%|██████▌   | 511/779 [02:06<01:10,  3.78it/s]

Assignments: [0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.51s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.02s/window][A
pt.apply.by_query():  67%|██████▋   | 525/779 [02:10<01:09,  3.68it/s]

Assignments: [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:02<00:00,  2.37s/window][A
pt.apply.by_query():  69%|██████▊   | 535/779 [02:13<01:04,  3.79it/s]

Assignments: [1, 2, 2, 2, 2, 2, 0, 0, 0, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:02<00:00,  2.18s/window][A
pt.apply.by_query():  70%|██████▉   | 544/779 [02:15<01:00,  3.86it/s]

Assignments: [2, 0, 2, 2, 0, 0, 0, 0, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:02<00:00,  2.24s/window][A
pt.apply.by_query():  71%|███████   | 553/779 [02:17<00:57,  3.90it/s]

Assignments: [2, 0, 0, 1, 0, 0, 0, 0, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.73s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.11s/window][A
pt.apply.by_query():  73%|███████▎  | 569/779 [02:21<00:54,  3.86it/s]

Assignments: [2, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 2, 1, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.42s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.77s/window][A
pt.apply.by_query():  75%|███████▍  | 584/779 [02:25<00:49,  3.98it/s]

Assignments: [0, 1, 0, 0, 2, 0, 0, 1, 2, 2, 1, 2, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.69s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.10s/window][A
pt.apply.by_query():  77%|███████▋  | 600/779 [02:29<00:45,  3.92it/s]

Assignments: [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.04window/s][A
100%|██████████| 2/2 [00:03<00:00,  1.73s/window][A
pt.apply.by_query():  79%|███████▊  | 612/779 [02:33<00:44,  3.79it/s]

Assignments: [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.44window/s][A
100%|██████████| 2/2 [00:03<00:00,  1.54s/window][A
pt.apply.by_query():  80%|███████▉  | 623/779 [02:36<00:41,  3.73it/s]

Assignments: [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.99s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.17s/window][A
pt.apply.by_query():  82%|████████▏ | 641/779 [02:40<00:35,  3.88it/s]

Assignments: [0, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.56s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.98s/window][A
pt.apply.by_query():  84%|████████▍ | 656/779 [02:44<00:31,  3.85it/s]

Assignments: [0, 1, 0, 0, 1, 2, 1, 1, 0, 0, 2, 0, 0, 0, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.62s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.94s/window][A
pt.apply.by_query():  86%|████████▋ | 672/779 [02:48<00:27,  3.93it/s]

Assignments: [0, 0, 0, 2, 1, 2, 0, 2, 0, 2, 2, 2, 0, 2, 1, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.79s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.03s/window][A
pt.apply.by_query():  89%|████████▊ | 690/779 [02:52<00:21,  4.09it/s]

Assignments: [2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 1, 2, 2, 1, 0, 1, 2, 2]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.08s/window][A
100%|██████████| 2/2 [00:03<00:00,  1.72s/window][A
pt.apply.by_query():  90%|█████████ | 703/779 [02:55<00:19,  4.00it/s]

Assignments: [1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 1]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.70s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.02s/window][A
pt.apply.by_query():  92%|█████████▏| 719/779 [03:00<00:15,  3.98it/s]

Assignments: [1, 0, 2, 0, 0, 0, 2, 0, 1, 2, 1, 2, 1, 0, 2, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:01<00:00,  1.99s/window][A
pt.apply.by_query():  93%|█████████▎| 726/779 [03:02<00:13,  3.90it/s]

Assignments: [0, 0, 0, 0, 0, 0, 0]



  0%|          | 0/1 [00:00<?, ?window/s][A
100%|██████████| 1/1 [00:02<00:00,  2.27s/window][A
pt.apply.by_query():  94%|█████████▍| 736/779 [03:04<00:10,  4.01it/s]

Assignments: [2, 1, 0, 2, 0, 2, 0, 2, 0, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.80s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.09s/window][A
pt.apply.by_query():  97%|█████████▋| 752/779 [03:08<00:06,  3.94it/s]

Assignments: [0, 0, 1, 0, 0, 0, 2, 1, 2, 1, 1, 0, 2, 2, 1, 0]



  0%|          | 0/2 [00:00<?, ?window/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.75s/window][A
100%|██████████| 2/2 [00:04<00:00,  2.10s/window][A
pt.apply.by_query(): 100%|██████████| 779/779 [03:12<00:00,  4.04it/s]


Assignments: [1, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 0, 0, 0]


100%|██████████| 2/2 [00:03<00:00,  1.70s/window]

Assignments: [0, 0, 1, 1, 2, 2, 0, 0, 1, 0, 1, 1]
Query ID: 2024-145979, Measure: AllScore, Value: 0.4
Query ID: 2024-216592, Measure: AllScore, Value: 0.15
Query ID: 2024-32912, Measure: AllScore, Value: 0.3333333333333333
Query ID: 2024-153051, Measure: AllScore, Value: 0.35714285714285715
Query ID: 2024-79081, Measure: AllScore, Value: 0.5625
Query ID: 2024-42497, Measure: AllScore, Value: 0.36666666666666664
Query ID: 2024-158743, Measure: AllScore, Value: 0.3611111111111111
Query ID: 2024-143869, Measure: AllScore, Value: 0.2
Query ID: 2024-43037, Measure: AllScore, Value: 0.3
Query ID: 2024-43983, Measure: AllScore, Value: 0.15
Query ID: 2024-224279, Measure: AllScore, Value: 0.43333333333333335
Query ID: 2024-214096, Measure: AllScore, Value: 0.40625
Query ID: 2024-219624, Measure: AllScore, Value: 0.3076923076923077
Query ID: 2024-73340, Measure: AllScore, Value: 0.5714285714285714
Query ID: 2024-44060, Measure: AllScore, Value: 0.75
Query ID: 2024-96359, Measure: AllScore, Val




In [54]:
import numpy as np
# Mean and population standard deviation (ddof=0) of the collected scores.
# The `_vital` suffix is historical: `scores` holds whatever measure the scoring
# cell collected (AllScore in the run shown above).
mean_vital = np.asarray(scores).mean()
std_vital = np.asarray(scores).std()
print(f"\n✅ Score medio: {mean_vital:.4f} (± {std_vital:.4f})")


✅ Score medio: 0.3469 (± 0.1817)


In [55]:
# Display the full list of per-query scores collected by the scoring cell.
scores

[0.4,
 0.15,
 0.3333333333333333,
 0.35714285714285715,
 0.5625,
 0.36666666666666664,
 0.3611111111111111,
 0.2,
 0.3,
 0.15,
 0.43333333333333335,
 0.40625,
 0.3076923076923077,
 0.5714285714285714,
 0.75,
 0.8181818181818182,
 0.4230769230769231,
 0.38461538461538464,
 0.15384615384615385,
 0.5,
 0.6,
 0.08333333333333333,
 0.4166666666666667,
 0.3235294117647059,
 0.0,
 0.20833333333333334,
 0.425,
 0.4090909090909091,
 0.5555555555555556,
 0.3076923076923077,
 0.38461538461538464,
 0.17647058823529413,
 0.6578947368421053,
 0.375,
 0.25,
 0.18181818181818182,
 0.0,
 0.55,
 0.3333333333333333,
 0.2777777777777778,
 0.28125,
 0.36666666666666664,
 0.0625,
 0.0,
 0.09090909090909091,
 0.4722222222222222,
 0.3333333333333333,
 0.5,
 0.5277777777777778,
 0.46153846153846156,
 0.4375,
 0.0,
 0.45,
 0.40625,
 0.21875,
 0.375]

---

In [56]:
# === INPUT ===================================================================
# Replace these two dictionaries with your own per-query score vectors.
# Each value must be a list [score_q1, score_q2, ...], with the queries in the same order in both dictionaries.
base = {
    # "metric_name": [scores per query...],
    # es: "nDCG@10": [0.45, 0.32, ...],
    "allscore": [0.4,
 0.15,
 0.3333333333333333,
 0.35714285714285715,
 0.5625,
 0.36666666666666664,
 0.3611111111111111,
 0.2,
 0.3,
 0.15,
 0.43333333333333335,
 0.40625,
 0.3076923076923077,
 0.5714285714285714,
 0.75,
 0.8181818181818182,
 0.4230769230769231,
 0.38461538461538464,
 0.15384615384615385,
 0.5,
 0.6,
 0.08333333333333333,
 0.4166666666666667,
 0.3235294117647059,
 0.0,
 0.20833333333333334,
 0.425,
 0.4090909090909091,
 0.5555555555555556,
 0.3076923076923077,
 0.38461538461538464,
 0.17647058823529413,
 0.6578947368421053,
 0.375,
 0.25,
 0.18181818181818182,
 0.0,
 0.55,
 0.3333333333333333,
 0.2777777777777778,
 0.28125,
 0.36666666666666664,
 0.0625,
 0.0,
 0.09090909090909091,
 0.4722222222222222,
 0.3333333333333333,
 0.5,
 0.5277777777777778,
 0.46153846153846156,
 0.4375,
 0.0,
 0.45,
 0.40625,
 0.21875,
 0.375],
    "allscorestrict": [0.2,
 0.1,
 0.3333333333333333,
 0.35714285714285715,
 0.5,
 0.3333333333333333,
 0.2222222222222222,
 0.2,
 0.3,
 0.1,
 0.3333333333333333,
 0.3125,
 0.15384615384615385,
 0.5714285714285714,
 0.5,
 0.45454545454545453,
 0.3076923076923077,
 0.38461538461538464,
 0.07692307692307693,
 0.4,
 0.6,
 0.05555555555555555,
 0.3333333333333333,
 0.23529411764705882,
 0.0,
 0.08333333333333333,
 0.2,
 0.2727272727272727,
 0.4444444444444444,
 0.3076923076923077,
 0.23076923076923078,
 0.058823529411764705,
 0.42105263157894735,
 0.25,
 0.25,
 0.09090909090909091,
 0.0,
 0.5,
 0.2222222222222222,
 0.2222222222222222,
 0.1875,
 0.26666666666666666,
 0.0625,
 0.0,
 0.09090909090909091,
 0.4444444444444444,
 0.2,
 0.4375,
 0.4444444444444444,
 0.38461538461538464,
 0.3125,
 0.0,
 0.3,
 0.25,
 0.125,
 0.16666666666666666],
    "vitalscore": [0.26666666666666666,
 0.0,
 0.5,
 0.16666666666666666,
 0.8,
 0.36363636363636365,
 0.21428571428571427,
 0.2,
 0.3333333333333333,
 0.0,
 0.42857142857142855,
 0.3333333333333333,
 0.16666666666666666,
 0.75,
 0.5,
 0.6666666666666666,
 0.38461538461538464,
 0.4166666666666667,
 0.0,
 0.5555555555555556,
 0.7142857142857143,
 0.0625,
 0.3333333333333333,
 0.0,
 0.0,
 0.0,
 0.25,
 0.36363636363636365,
 0.5,
 0.4,
 0.3,
 0.08333333333333333,
 0.2857142857142857,
 0.3,
 0.1111111111111111,
 0.0,
 0.0,
 0.5555555555555556,
 0.6,
 1.0,
 0.3333333333333333,
 0.42857142857142855,
 0.07142857142857142,
 0.0,
 0.0,
 0.5,
 0.16666666666666666,
 0.46153846153846156,
 0.25,
 0.38461538461538464,
 0.16666666666666666,
 0.0,
 0.3333333333333333,
 0.0,
 0.1111111111111111,
 0.2857142857142857],
    "vitalscorestrict": [0.3333333333333333,
 0.0,
 0.5,
 0.25,
 0.8,
 0.36363636363636365,
 0.14285714285714285,
 0.2,
 0.3333333333333333,
 0.0,
 0.42857142857142855,
 0.3333333333333333,
 0.16666666666666666,
 0.75,
 0.5,
 0.6666666666666666,
 0.3076923076923077,
 0.4166666666666667,
 0.0,
 0.5555555555555556,
 0.7142857142857143,
 0.0625,
 0.0,
 0.125,
 0.0,
 0.0,
 0.25,
 0.2727272727272727,
 0.5,
 0.4,
 0.3,
 0.08333333333333333,
 0.2857142857142857,
 0.3,
 0.1111111111111111,
 0.0,
 0.0,
 0.5555555555555556,
 0.4,
 1.0,
 0.3333333333333333,
 0.42857142857142855,
 0.07142857142857142,
 0.0,
 0.0,
 0.5,
 0.16666666666666666,
 0.46153846153846156,
 0.25,
 0.38461538461538464,
 0.16666666666666666,
 0.0,
 0.3333333333333333,
 0.0,
 0.1111111111111111,
 0.0],
    "weightedscore": [0.125,
 0.3333333333333333,
 0.3333333333333333,
 0.18181818181818182,
 0.1111111111111111,
 0.18181818181818182,
 0.16666666666666666,
 0.25,
 0.3333333333333333,
 0.6666666666666666,
 0.15384615384615385,
 0.15384615384615385,
 0.3333333333333333,
 0.25,
 0.2222222222222222,
 0.11764705882352941,
 0.18181818181818182,
 0.2,
 0.5,
 0.11764705882352941,
 0.16666666666666666,
 0.6666666666666666,
 0.16666666666666666,
 0.18181818181818182,
 0.0,
 0.5,
 0.11764705882352941,
 0.2222222222222222,
 0.09090909090909091,
 0.25,
 0.2,
 0.3333333333333333,
 0.07692307692307693,
 0.2,
 0.25,
 0.6666666666666666,
 0.0,
 0.18181818181818182,
 0.3333333333333333,
 0.4,
 0.2222222222222222,
 0.18181818181818182,
 1.0,
 0.0,
 1.0,
 0.11764705882352941,
 0.2,
 0.125,
 0.10526315789473684,
 0.16666666666666666,
 0.14285714285714285,
 0.0,
 0.2857142857142857,
 0.13333333333333333,
 0.2857142857142857,
 0.16666666666666666],
    "weightedscorestrict": [0.18181818181818182,
 0.3333333333333333,
 0.3333333333333333,
 0.18181818181818182,
 0.1111111111111111,
 0.18181818181818182,
 0.16666666666666666,
 0.25,
 0.3333333333333333,
 0.6666666666666666,
 0.15384615384615385,
 0.15384615384615385,
 0.3333333333333333,
 0.25,
 0.2222222222222222,
 0.125,
 0.13333333333333333,
 0.2,
 0.5,
 0.1111111111111111,
 0.16666666666666666,
 0.6666666666666666,
 0.2,
 0.16666666666666666,
 0.0,
 0.4,
 0.10526315789473684,
 0.2222222222222222,
 0.1,
 0.25,
 0.2,
 0.3333333333333333,
 0.09523809523809523,
 0.2,
 0.25,
 0.5,
 0.0,
 0.16666666666666666,
 0.3333333333333333,
 0.4,
 0.2222222222222222,
 0.18181818181818182,
 1.0,
 0.0,
 1.0,
 0.1,
 0.2,
 0.125,
 0.10526315789473684,
 0.16666666666666666,
 0.14285714285714285,
 0.0,
 0.2857142857142857,
 0.15384615384615385,
 0.2857142857142857,
 0.2857142857142857],
}
emb = {
    # "metric_name": [scores per query...],
    "allscore": [0.43333333333333335, 0.1,0.3333333333333333,0.2857142857142857,0.40625,0.36666666666666664,0.3611111111111111,0.2,0.3,0.0,
 0.6666666666666666,
 0.34375,
 0.46153846153846156,
 0.5714285714285714,
 0.75,
 0.7727272727272727,
 0.6153846153846154,
 0.46153846153846156,
 0.19230769230769232,
 0.3,
 0.5,
 0.16666666666666666,
 0.625,
 0.2647058823529412,
 0.0,
 0.2916666666666667,
 0.4,
 0.45454545454545453,
 0.4166666666666667,
 0.34615384615384615,
 0.34615384615384615,
 0.23529411764705882,
 0.47368421052631576,
 0.375,
 0.3125,
 0.3181818181818182,
 0.17857142857142858,
 0.3,
 0.4444444444444444,
 0.16666666666666666,
 0.15625,
 0.36666666666666664,
 0.0625,
 0.20833333333333334,
 0.22727272727272727,
 0.5833333333333334,
 0.36666666666666664,
 0.375,
 0.4444444444444444,
 0.5384615384615384,
 0.25,
 0.0,
 0.35,
 0.34375,
 0.0,
 0.0],
    "allscorestrict": [0.2,
 0.1,
 0.3333333333333333,
 0.14285714285714285,
 0.25,
 0.3333333333333333,
 0.2222222222222222,
 0.2,
 0.3,
 0.0,
 0.3333333333333333,
 0.3125,
 0.6153846153846154,
 0.5714285714285714,
 0.5,
 0.8181818181818182,
 0.5384615384615384,
 0.38461538461538464,
 0.07692307692307693,
 0.2,
 0.4,
 0.1111111111111111,
 0.5,
 0.23529411764705882,
 0.0,
 0.16666666666666666,
 0.35,
 0.36363636363636365,
 0.2777777777777778,
 0.23076923076923078,
 0.15384615384615385,
 0.17647058823529413,
 0.47368421052631576,
 0.25,
 0.3125,
 0.18181818181818182,
 0.07142857142857142,
 0.3,
 0.3333333333333333,
 0.1111111111111111,
 0.0625,
 0.26666666666666666,
 0.0625,
 0.08333333333333333,
 0.18181818181818182,
 0.6666666666666666,
 0.3333333333333333,
 0.3125,
 0.3333333333333333,
 0.46153846153846156,
 0.125,
 0.0,
 0.3,
 0.25,
 0.0,
 0.0],
    "vitalscore": [0.2,
 0.0,
 0.3333333333333333,
 0.16666666666666666,
 0.6,
 0.36363636363636365,
 0.21428571428571427,
 0.2,
 0.3333333333333333,
 0.0,
 0.7142857142857143,
 0.2222222222222222,
 0.6666666666666666,
 0.75,
 0.5,
 0.8333333333333334,
 0.5384615384615384,
 0.4166666666666667,
 0.0,
 0.3333333333333333,
 0.7142857142857143,
 0.125,
 0.3333333333333333,
 0.125,
 0.0,
 0.25,
 0.3333333333333333,
 0.36363636363636365,
 0.16666666666666666,
 0.2,
 0.2,
 0.08333333333333333,
 0.42857142857142855,
 0.3,
 0.2222222222222222,
 0.0,
 0.0,
 0.3333333333333333,
 0.6,
 1.0,
 0.0,
 0.42857142857142855,
 0.07142857142857142,
 0.0,
 0.125,
 0.5,
 0.16666666666666666,
 0.3076923076923077,
 0.25,
 0.46153846153846156,
 0.08333333333333333,
 0.0,
 0.3333333333333333,
 0.0,
 0.0,
 0.0],
    "vitalscorestrict": [0.13333333333333333,
 0.0,
 0.3333333333333333,
 0.16666666666666666,
 0.6,
 0.36363636363636365,
 0.21428571428571427,
 0.2,
 0.3333333333333333,
 0.0,
 0.7142857142857143,
 0.2222222222222222,
 0.5,
 0.75,
 0.5,
 0.8333333333333334,
 0.5384615384615384,
 0.4166666666666667,
 0.0,
 0.3333333333333333,
 0.42857142857142855,
 0.125,
 0.3333333333333333,
 0.125,
 0.0,
 0.25,
 0.3333333333333333,
 0.36363636363636365,
 0.16666666666666666,
 0.3,
 0.2,
 0.08333333333333333,
 0.2857142857142857,
 0.4,
 0.2222222222222222,
 0.0,
 0.0,
 0.3333333333333333,
 0.2,
 1.0,
 0.0,
 0.42857142857142855,
 0.07142857142857142,
 0.0,
 0.125,
 0.3333333333333333,
 0.16666666666666666,
 0.3076923076923077,
 0.25,
 0.46153846153846156,
 0.16666666666666666,
 0.0,
 0.3333333333333333,
 0.0,
 0.0,
 0.0],
    "weightedscore": [0.15384615384615385,
 0.5,
 0.3333333333333333,
 0.25,
 0.15384615384615385,
 0.18181818181818182,
 0.15384615384615385,
 0.25,
 0.3333333333333333,
 0.0,
 0.09523809523809523,
 0.18181818181818182,
 0.18181818181818182,
 0.25,
 0.2222222222222222,
 0.10526315789473684,
 0.11764705882352941,
 0.18181818181818182,
 0.5,
 0.16666666666666666,
 0.2,
 0.3333333333333333,
 0.14285714285714285,
 0.2,
 0.0,
 0.2857142857142857,
 0.125,
 0.2,
 0.2,
 0.2222222222222222,
 0.2222222222222222,
 0.2222222222222222,
 0.1111111111111111,
 0.25,
 0.2,
 0.2857142857142857,
 0.4,
 0.25,
 0.2857142857142857,
 0.5,
 0.4,
 0.18181818181818182,
 1.0,
 0.4,
 0.4,
 0.08,
 0.18181818181818182,
 0.16666666666666666,
 0.125,
 0.14285714285714285,
 0.15384615384615385,
 0.0,
 0.25,
 0.15384615384615385,
 0.0,
 0.0],
    "weightedscorestrict": [0.16666666666666666,
 0.5,
 0.3333333333333333,
 0.25,
 0.14285714285714285,
 0.18181818181818182,
 0.15384615384615385,
 0.25,
 0.3333333333333333,
 0.0,
 0.10526315789473684,
 0.18181818181818182,
 0.15384615384615385,
 0.25,
 0.2222222222222222,
 0.10526315789473684,
 0.11764705882352941,
 0.16666666666666666,
 0.5,
 0.16666666666666666,
 0.2222222222222222,
 0.3333333333333333,
 0.125,
 0.2,
 0.0,
 0.2857142857142857,
 0.13333333333333333,
 0.2,
 0.2,
 0.25,
 0.2222222222222222,
 0.2857142857142857,
 0.1111111111111111,
 0.2222222222222222,
 0.2,
 0.3333333333333333,
 0.4,
 0.3333333333333333,
 0.3333333333333333,
 0.5,
 0.4,
 0.18181818181818182,
 1.0,
 0.5,
 0.4,
 0.08,
 0.18181818181818182,
 0.16666666666666666,
 0.125,
 0.14285714285714285,
 0.15384615384615385,
 0.0,
 0.25,
 0.2,
 0.0,
 0.0],
}

# Parameters
alpha = 0.05        # significance level used for the Holm correction
# Passed to ttest_rel(base, emb): "greater" tests base > emb, "less" tests base < emb.
alternative = "two-sided"  # "two-sided" | "greater" | "less"
# ============================================================================

import numpy as np
import pandas as pd
from scipy.stats import ttest_rel
from statsmodels.stats.multitest import multipletests

# controlli base
if set(base.keys()) != set(emb.keys()):
    missing_in_emb = set(base) - set(emb)
    missing_in_base = set(emb) - set(base)
    raise ValueError(f"Metriche non allineate.\n"
                     f"Mancano in emb: {sorted(missing_in_emb)}\n"
                     f"Mancano in base: {sorted(missing_in_base)}")

rows = []
for metric in sorted(base.keys()):
    a = np.asarray(base[metric], dtype=float)
    b = np.asarray(emb[metric], dtype=float)
    if a.shape != b.shape:
        raise ValueError(f"Lista con lunghezze diverse per '{metric}': {a.shape} vs {b.shape}")

    # rimuovi coppie con NaN (se presenti)
    mask = ~(np.isnan(a) | np.isnan(b))
    a_clean, b_clean = a[mask], b[mask]
    if a_clean.size < 2:
        rows.append({
            "metric": metric, "n": int(a_clean.size),
            "t_stat": np.nan, "p_value": np.nan,
            "mean_base": np.nan, "mean_emb": np.nan, "delta": np.nan
        })
        continue

    # t-test appaiato
    t_res = ttest_rel(a_clean, b_clean, alternative=alternative)
    rows.append({
        "metric": metric,
        "n": int(a_clean.size),
        "t_stat": float(t_res.statistic),
        "p_value": float(t_res.pvalue),
        "mean_base": float(np.mean(a_clean)),
        "mean_emb": float(np.mean(b_clean)),
        "delta": float(np.mean(b_clean) - np.mean(a_clean))  # positivo = emb > base
    })

df = pd.DataFrame(rows)

# Correzione di Holm sui p-value
pvals = df["p_value"].to_numpy()
# multipletests gestisce NaN: li preserviamo e ricalcoliamo solo sui validi
valid = ~np.isnan(pvals)
p_adj = np.full_like(pvals, np.nan, dtype=float)
reject = np.full(pvals.shape, False, dtype=bool)
if valid.any():
    rej_v, p_adj_v, _, _ = multipletests(pvals[valid], alpha=alpha, method="holm")
    p_adj[valid] = p_adj_v
    reject[valid] = rej_v

df["p_holm"] = p_adj
df["reject_holm@{:.2f}".format(alpha)] = reject

# Ordina per p_holm crescente (NaN in fondo)
df = df.sort_values(by=["p_holm", "p_value"], na_position="last").reset_index(drop=True)
df


Unnamed: 0,metric,n,t_stat,p_value,mean_base,mean_emb,delta,p_holm,reject_holm@0.05
0,weightedscore,56,1.12939,0.263635,0.250911,0.224723,-0.026188,1.0,False
1,weightedscorestrict,56,0.74658,0.458496,0.248812,0.231324,-0.017488,1.0,False
2,allscore,56,0.66539,0.508582,0.346946,0.335899,-0.011047,1.0,False
3,vitalscorestrict,56,0.614893,0.541161,0.278854,0.266915,-0.011938,1.0,False
4,vitalscore,56,0.478284,0.634344,0.292841,0.283795,-0.009047,1.0,False
5,allscorestrict,56,-0.274921,0.784407,0.260572,0.265557,0.004985,1.0,False
