### [Evaluating RAG Pipelines](https://haystack.deepset.ai/tutorials/35_evaluating_rag_pipelines)

In [1]:
# Telemetry only: lets Haystack know which tutorial (number 35) is being run.
from haystack.telemetry import tutorial_running
tutorial_running(35)

### 1. Crear el pipeline RAG para evaluar


In [2]:
from datasets import load_dataset
from haystack import Document

# Load the "train" split of the PubMedQA instruction dataset and keep only its first 1000 rows.
dataset = load_dataset("vblagoje/PubMedQA_instruction", split="train").select(range(1000))

# Build three index-aligned lists from the same rows:
#   "context"     -> wrapped in a Document (the corpus that gets indexed)
#   "instruction" -> the question
#   "response"    -> the reference (ground-truth) answer
all_documents = [Document(content=row["context"]) for row in dataset]
all_questions = [row["instruction"] for row in dataset]
all_ground_truth_answers = [row["response"] for row in dataset]

README.md:   0%|          | 0.00/498 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/986k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/272458 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Build a simple indexing pipeline: embed the documents, then write them to the store.

from typing import List
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# In-memory store; the retriever of the RAG pipeline is built on this same object later.
document_store = InMemoryDocumentStore()

# The writer targets that store and is configured with DuplicatePolicy.SKIP.
document_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
# Same embedding model name as the query embedder used in the RAG pipeline cell.
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

indexing = Pipeline()
indexing.add_component("document_embedder", document_embedder)
indexing.add_component("document_writer", document_writer)

# Embedded documents flow from the embedder straight into the writer.
indexing.connect("document_embedder.documents", "document_writer.documents")

# Last expression of the cell, so the run result is displayed as the cell output.
indexing_inputs = {"document_embedder": {"documents": all_documents}}
indexing.run(indexing_inputs)



Batches:   0%|          | 0/32 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 1000}}

In [5]:
# With the data indexed, build a simple RAG pipeline.
import os
from getpass import getpass
from haystack.components.builders import AnswerBuilder, PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

# Prompt for the OpenAI key only when the environment does not already define it.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

# Jinja template: it loops over `documents` and interpolates `question`.
template = """
        You have to answer the following question based on the given context information only.

        Context:
        {% for document in documents %}
            {{ document.content }}
        {% endfor %}

        Question: {{question}}
        Answer:
        """

# (name, instance) pairs, registered in this order.
rag_components = [
    ("query_embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")),
    ("retriever", InMemoryEmbeddingRetriever(document_store, top_k=3)),
    ("prompt_builder", PromptBuilder(template=template)),
    ("generator", OpenAIGenerator(model="gpt-4o-mini")),
    ("answer_builder", AnswerBuilder()),
]

# (sender, receiver) pairs, wired in this order. The retriever output feeds both the
# prompt builder and the answer builder, so each answer carries its retrieved documents.
rag_connections = [
    ("query_embedder", "retriever.query_embedding"),
    ("retriever", "prompt_builder.documents"),
    ("prompt_builder", "generator"),
    ("generator.replies", "answer_builder.replies"),
    ("generator.meta", "answer_builder.meta"),
    ("retriever", "answer_builder.documents"),
]

rag_pipeline = Pipeline()
for component_name, component in rag_components:
    rag_pipeline.add_component(component_name, component)
for sender, receiver in rag_connections:
    rag_pipeline.connect(sender, receiver)

# Bare last expression so the cell displays the assembled pipeline.
rag_pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x7ff7c863c6d0>
🚅 Components
  - query_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - generator: OpenAIGenerator
  - answer_builder: AnswerBuilder
🛤️ Connections
  - query_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - retriever.documents -> answer_builder.documents (List[Document])
  - prompt_builder.prompt -> generator.prompt (str)
  - generator.replies -> answer_builder.replies (List[str])
  - generator.meta -> answer_builder.meta (List[Dict[str, Any]])

In [6]:
# Smoke test: ask the RAG pipeline a single question before evaluating it.
question = (
    "Do high levels of procalcitonin in the early phase after pediatric "
    "liver transplantation indicate poor postoperative outcome?"
)

# The same question text is passed to the three components that take it as input.
single_query_inputs = {
    "query_embedder": {"text": question},
    "prompt_builder": {"question": question},
    "answer_builder": {"query": question},
}
response = rag_pipeline.run(single_query_inputs)

# Print the text of the first answer returned by the answer builder.
first_answer = response["answer_builder"]["answers"][0]
print(first_answer.data)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Yes, high levels of procalcitonin (PCT) in the early phase after pediatric liver transplantation are associated with poor postoperative outcomes. Patients with elevated PCT levels on postoperative day 2 had higher International Normalized Ratio values on postoperative day 5, experienced more primary graft non-function, had longer stays in the pediatric intensive care unit, and required more time on mechanical ventilation.


### 2. Evaluar el pipeline

In [7]:
import random

# Fixed seed so that "Restart Kernel -> Run All" evaluates the same questions every time
# and the scores further down are reproducible. A dedicated Random instance is used so the
# global `random` state is left untouched.
# NOTE: the outputs stored in this notebook came from an unseeded sample, so they will
# differ from a fresh run.
EVAL_SAMPLE_SEED = 42
EVAL_SAMPLE_SIZE = 25

# Sample whole (question, ground-truth answer, document) triples so the three sequences
# stay index-aligned, then unzip them back into three parallel tuples.
qa_triples = list(zip(all_questions, all_ground_truth_answers, all_documents))
questions, ground_truth_answers, ground_truth_docs = zip(
    *random.Random(EVAL_SAMPLE_SEED).sample(qa_triples, min(EVAL_SAMPLE_SIZE, len(qa_triples)))
)


In [None]:
# Run the pipeline on every sampled question and keep track of what it returns:
# the generated answer text and the documents attached to that answer.
rag_answers = []
retrieved_docs = []

for question in questions:
    response = rag_pipeline.run(
        {
            "query_embedder": {"text": question},
            "prompt_builder": {"question": question},
            "answer_builder": {"query": question},
        }
    )
    # Only the first answer produced by the answer builder is used.
    answer = response["answer_builder"]["answers"][0]

    print(f"Question: {question}")
    print("Answer from pipeline:")
    print(answer.data)
    print("\n-----------------------------------\n")

    # Both lists grow in the same order as `questions`, so they stay index-aligned.
    rag_answers.append(answer.data)
    retrieved_docs.append(answer.documents)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does ascorbic Acid ameliorate nicotine exposure induced impaired spatial memory performances in rats?
Answer from pipeline:
Yes, ascorbic acid ameliorates nicotine exposure induced impaired spatial memory performances in rats.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Do [ Recovery of maxillary tooth sensibility after Le Fort I osteotomy ]?
Answer from pipeline:
Yes, the recovery of maxillary tooth sensitivity after Le Fort I osteotomy was analyzed in the study. It was observed that sensitivity recovery was faster in younger patients (under 35 years of age) and for upper middle and superior alveolar nerves. Sensitivity percentages were recorded at different intervals: 91.9% were sensitive pre-surgery (D-1), which dropped to 12.7% at D2, and gradually increased to 61.8% by M6.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Do apitherapy products enhance the recovery of CCL4-induced hepatic damages in rats?
Answer from pipeline:
Yes, apitherapy products, such as honeybee products from Turkey (including chestnut honey, pollen, propolis, and royal jelly), have been shown to enhance recovery from CCl4-induced hepatic damage in rats. The study monitored the development of liver damage and oxidative stress, and despite varying levels of antioxidant capacity among the bee products, they all played a significant role in the prevention of liver damage induced by CCl4.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Are glutathione s-transferase m1 and t1 gene polymorphisms associated with increased risk of gestational diabetes mellitus development?
Answer from pipeline:
No, glutathione S-transferase M1 (GSTM1) and T1 (GSTT1) gene polymorphisms are not associated with an increased risk of gestational diabetes mellitus development, as the study found no statistically significant differences in the frequencies of GSTM1 and GSTT1 null genotypes between women with gestational diabetes mellitus and control individuals without it.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Is bone marrow from blotchy mice dispensable to regulate blood copper and aortic pathologies but required for inflammatory mediator production in LDLR-deficient mice during chronic angiotensin II infusion?
Answer from pipeline:
Yes, bone marrow from blotchy mice is dispensable to regulate blood copper and aortic pathologies, as there was no significant difference in copper concentrations or aortic diameters between the blotchy marrow group and the control marrow group. However, it is required for inflammatory mediator production, as the blotchy marrow group showed significantly reduced levels of various inflammatory mediators compared to the control group.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Do the effects of ifenprodil on the activity of antidepressant drugs in the forced swim test in mice?
Answer from pipeline:
Ifenprodil potentiated the antidepressant-like effect of imipramine (15mg/kg) and fluoxetine (5mg/kg) in the forced swim test in mice. However, it did not reduce the immobility time of animals that simultaneously received reboxetine (2.5mg/kg) or tianeptine (15mg/kg).

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does concern for Lost Income Following Donation deter Some Patients From Talking to Potential Living Donors?
Answer from pipeline:
Yes, concern for lost income following donation does deter some patients from talking to potential living donors. The context indicates that many patients (42%) chose not to discuss living kidney donation with a family member or friend due to concerns about the impact of lost income on the donor. Additionally, one-third of patients reported that potential donors expressed concern about lost income, with a significant portion of those (64%) not initiating donation evaluation.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Is pseudomonas aeruginosa in CF and non-CF homes found predominantly in drains?
Answer from pipeline:
Yes, Pseudomonas aeruginosa is found predominantly in drains in both CF and non-CF homes, as 28% of sampled drains yielded the bacteria at least once, and a general mixed linear model estimated that 6.3% of samples from drains yield P. aeruginosa, which is significantly higher than that from any other type of household environment.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Do obese patients with idiopathic pulmonary fibrosis have a higher 90-day mortality risk with bilateral lung transplantation?
Answer from pipeline:
Yes, obese patients with idiopathic pulmonary fibrosis (IPF) who receive bilateral lung transplantation (BLT) are 1.71 times more likely to die within 90 days compared to BLT recipients with a body mass index (BMI) of 18.5 to 30 kg/m(2). This indicates a higher 90-day mortality risk for obese patients after BLT.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does introduction of an 8-aminooctanoic acid linker enhance uptake of 99mTc-labeled lactam bridge-cyclized α-MSH peptide in melanoma?
Answer from pipeline:
Yes, the introduction of an 8-aminooctanoic acid (Aoc) linker enhances the uptake of the 99mTc-labeled lactam bridge-cyclized α-MSH peptide in melanoma, as indicated by the high melanoma uptake observed with the peptide HYNIC-AocNle-CycMSHhex. This peptide displayed the highest melanoma uptake (22.3 ± 1.72 percentage injected dose/g) at 2 hours after injection compared to the other peptides tested.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does a HIF-1 network reveal characteristics of epithelial-mesenchymal transition in acute promyelocytic leukemia?
Answer from pipeline:
Yes, a HIF-1 network reveals characteristics of epithelial-mesenchymal transition (EMT) in acute promyelocytic leukemia (APL). The research indicates that genes within the HIF-1-dependent subnetwork specifically dysregulated in APL are involved in cell motility and invasion, with decreased expression of genes related to cell adhesion and increased expression of genes implicated in motility and invasion, which are characteristic features of EMT.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Is esophageal Mucosal Impedance Pattern Distinct in Patients With Extraesophageal Reflux Symptoms and Pathologic Acid Reflux?
Answer from pipeline:
Yes, the esophageal Mucosal Impedance (MI) pattern is distinct in patients with extraesophageal reflux (EER) symptoms and pathologic acid reflux. The study found significant differences in MI measurements among patients with erosive esophagitis (E+), those with normal EGD but abnormal pH (E-/pH+), and those with normal EGD and normal pH (E-/pH-). Specifically, MI was lowest for those with erosive esophagitis and greatest for those without reflux symptoms (E-/pH-). There was a trend of lower MI in patients with reflux (E+ and E-/pH+) compared to those without, suggesting that MI patterns can help differentiate between groups based on reflux status and provide insights into the presence of extraesophageal reflux symptoms.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does changes in lung sound during asthma progression in a guinea pig model?
Answer from pipeline:
Yes, changes in lung sound during asthma progression in a guinea pig model have been observed. Specifically, the study noted that the difference in inspiratory sound intensity between asthma models and controls was significant, with a correlation found between the ratio of airway wall thickness to total airway area in the terminal bronchioles and changes in inspiratory sound intensity in the 501-1000-Hz range.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does endothelial Jag1-RBPJ signalling promote inflammatory leucocyte recruitment and atherosclerosis?
Answer from pipeline:
The provided context does not specifically address the role of endothelial Jag1-RBPJ signaling in promoting inflammatory leukocyte recruitment and atherosclerosis. It discusses the role of NOTCH and RBPJ in arterial injury response and chronic arterial-wall inflammation but does not provide conclusive evidence regarding the direct impact of Jag1-RBPJ signaling on inflammatory leukocyte recruitment or the progression of atherosclerosis. Therefore, based on the given context, it cannot be determined whether endothelial Jag1-RBPJ signaling promotes these processes.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Do identification of UQCRB as an oxymatrine recognizing protein using a T7 phage display screen?
Answer from pipeline:
Yes, Ubiquinol-cytochrome c reductase binding protein (UQCRB) was identified as a candidate binding protein of oxymatrine using a T7 phage display cDNA library screened by affinity selection with oxymatrine as bait.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Do sLCO1B1 gene polymorphisms influence plasma rifampicin concentrations in a South Indian population?
Answer from pipeline:
No, SLCO1B1 gene polymorphisms do not influence plasma rifampicin concentrations in the South Indian population, as the study found no significant differences in RMP concentrations between the different genotypes of the three polymorphisms.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does stochastic modeling suggest that noise reduces differentiation efficiency by inducing a heterogeneous drug response in glioma differentiation therapy?
Answer from pipeline:
Yes, stochastic modeling suggests that noise reduces differentiation efficiency by inducing a heterogeneous drug response in glioma differentiation therapy. The study found that noise renders some glioma cells insensitive to cyclin D1 degradation during drug treatment and induces varied differentiation responses among individual glioma cells, modulating the ultrasensitive response of cyclin D1. This ultimately leads to decreased evolution of differentiation potential in the drug-treated glioma cell population.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does identical ATP1A3 mutation cause alternating hemiplegia of childhood and rapid-onset dystonia parkinsonism phenotypes?
Answer from pipeline:
Yes, the identical ATP1A3 mutation can cause both alternating hemiplegia of childhood and rapid-onset dystonia parkinsonism phenotypes, as seen in the case of a child with alternating hemiplegia who was found to have the same ATP1A3 mutation as adults with rapid-onset dystonia parkinsonism.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does surveillance provide insight into epidemiology and spectrum of culture-confirmed mycobacterial disease in children?
Answer from pipeline:
Yes, surveillance provides insight into the epidemiology and spectrum of culture-confirmed mycobacterial disease in children. In the context, a surveillance study conducted at the Tygerberg Children's Hospital in Cape Town, South Africa, gathered data on mycobacterial culture among children aged <13 years from March 2011 to February 2013. This study not only detailed the prevalence of drug resistance among children with culture-confirmed tuberculosis but also documented other mycobacterial isolates identified, thereby contributing to the understanding of the epidemiological landscape of mycobacterial diseases in this population.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does glioma-derived versican promote tumor expansion via glioma-associated microglial/macrophages Toll-like receptor 2 signaling?
Answer from pipeline:
Yes, glioma-derived versican promotes tumor expansion via glioma-associated microglial/macrophages Toll-like receptor 2 (TLR2) signaling. The context states that versican, particularly its splice variants, activates TLR2 in microglia, leading to the upregulation of membrane type 1 matrix metalloprotease (MT1-MMP) which promotes the degradation of extracellular matrix, thereby facilitating tumor invasion and growth. When versican was silenced in glioma cells, it resulted in decreased MT1-MMP expression in microglia, smaller tumors, and longer survival rates, indicating that versican's signaling through TLR2 is crucial for tumor expansion.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Do systematic Reviews Published in Emergency Medicine Journals Routinely Search Clinical Trials Registries : A Cross-Sectional Analysis?
Answer from pipeline:
No, systematic reviews published in emergency medicine journals do not routinely search clinical trials registries. In a study of 41 systematic reviews that assessed specific interventions, only 20% (8 reviews) included a search of a clinical trials registry, and only 4 of those identified relevant unpublished studies.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Is p63 more sensitive and specific than 34βE12 to differentiate adenocarcinoma of prostate from cancer mimickers?
Answer from pipeline:
The context does not provide a direct comparison of the sensitivity and specificity of p63 and 34βE12 in differentiating adenocarcinoma of the prostate from cancer mimickers. It states that both p63 and 34βE12 were negative in all prostate adenocarcinoma specimens, while all benign hyperplasia and high-grade intraepithelial neoplasia cases expressed them. However, it does not specify which marker is more sensitive or specific, nor does it imply a comparison between the two. Therefore, based on the provided context, it cannot be concluded whether p63 is more sensitive and specific than 34βE12 for this purpose.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does first registry result from the newly approved ACURATE TA™ TAVI system†?
Answer from pipeline:
The first registry results from the newly approved ACURATE TA™ TAVI system indicate a procedural success rate of 98% with 245 out of 250 patients experiencing successful implantation. The 30-day mortality rate was 6.8%, and the post-implant echocardiography showed a relevant paravalvular leak (moderate 2+) in 2.3% of patients. Additionally, the mean transvalvular gradient post-implantation was 12.4 ± 5.8 mmHg, and 85% of patients followed up presented in NYHA class I/II.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Does qS-21 enhance the early antibody response to oil adjuvant foot-and-mouth disease vaccine in cattle?
Answer from pipeline:
Yes, the addition of QS-21 to the commercially available foot-and-mouth disease water-in-oil-in-water emulsion vaccine showed a significant early antibody increase in the QS-21 group after vaccination in cattle.

-----------------------------------



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: Are lobar microbleeds associated with a decline in executive functioning in older adults?
Answer from pipeline:
Yes, lobar microbleeds are associated with a decline in executive functioning in older adults. The study found that participants with 2 or more lobar microbleeds had worse executive functioning at the visit closest to the MRI scan and exhibited a faster decline in executive function over time compared to those with fewer than 2 lobar microbleeds.

-----------------------------------



In [9]:
# Every evaluator is a component that can be run on its own in Haystack, but evaluators can
# also be added to a pipeline. Here a single eval_pipeline holds all the evaluators for the
# metrics we want to compute.

from haystack.components.evaluators.document_mrr import DocumentMRREvaluator
from haystack.components.evaluators.faithfulness import FaithfulnessEvaluator
from haystack.components.evaluators.sas_evaluator import SASEvaluator

eval_pipeline = Pipeline()
eval_pipeline.add_component("doc_mrr_evaluator", DocumentMRREvaluator())
eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator())
eval_pipeline.add_component("sas_evaluator", SASEvaluator(model="sentence-transformers/all-MiniLM-L6-v2"))

# Each question has exactly one ground-truth document, wrapped in its own one-element list.
ground_truth_doc_lists = [[doc] for doc in ground_truth_docs]
ground_truth_contexts = [[doc.content] for doc in ground_truth_docs]

# NOTE(review): faithfulness is scored against the ground-truth context, not against the
# documents the retriever actually handed to the generator (`retrieved_docs`) - confirm
# that this is the intended measurement.
evaluator_inputs = {
    "doc_mrr_evaluator": {
        "ground_truth_documents": ground_truth_doc_lists,
        "retrieved_documents": retrieved_docs,
    },
    "faithfulness": {
        "questions": list(questions),
        "contexts": ground_truth_contexts,
        "predicted_answers": rag_answers,
    },
    "sas_evaluator": {
        "predicted_answers": rag_answers,
        "ground_truth_answers": list(ground_truth_answers),
    },
}
results = eval_pipeline.run(evaluator_inputs)


100%|██████████| 25/25 [00:53<00:00,  2.14s/it]


In [10]:
# After running the evaluation pipeline we can build a full evaluation report.
# Haystack provides EvaluationRunResult, which is used here to display a score_report.

from haystack.evaluation.eval_run_result import EvaluationRunResult

# Same per-question ground-truth contexts that were given to the faithfulness evaluator.
report_contexts = [[doc.content] for doc in ground_truth_docs]

# Inputs shown next to the scores in the report; all four lists are index-aligned.
inputs = {
    "question": list(questions),
    "contexts": report_contexts,
    "answer": list(ground_truth_answers),
    "predicted_answer": rag_answers,
}

evaluation_result = EvaluationRunResult(
    run_name="pubmed_rag_pipeline",
    inputs=inputs,
    results=results,
)
# Bare last expression so the score report is displayed as the cell output.
evaluation_result.score_report()

Unnamed: 0,metrics,score
0,doc_mrr_evaluator,0.96
1,faithfulness,0.976667
2,sas_evaluator,0.691143


In [11]:
# Convert the evaluation result into a pandas DataFrame.
results_df = evaluation_result.to_pandas()
# Bare last expression so the DataFrame is displayed as the cell output.
results_df

Unnamed: 0,question,contexts,answer,predicted_answer,doc_mrr_evaluator,faithfulness,sas_evaluator
0,Does ascorbic Acid ameliorate nicotine exposur...,[The long lasting behavioural and cognitive im...,Ascorbic acid supplementation was found to be ...,"Yes, ascorbic acid ameliorates nicotine exposu...",1.0,1.0,0.761964
1,Do [ Recovery of maxillary tooth sensibility a...,"[Upper alveolar nerves, when injured during Le...",Among the teeth that were sensitive before sur...,"Yes, the recovery of maxillary tooth sensitivi...",1.0,1.0,0.775067
2,Do apitherapy products enhance the recovery of...,[Our objective was to identify the antioxidant...,"Our results suggest that honey, propolis, poll...","Yes, apitherapy products, such as honeybee pro...",0.5,1.0,0.829854
3,Are glutathione s-transferase m1 and t1 gene p...,[The aim of this study was to investigate whet...,This study shows no association between GST ge...,"No, glutathione S-transferase M1 (GSTM1) and T...",1.0,1.0,0.599473
4,Is bone marrow from blotchy mice dispensable t...,[The blotchy mouse caused by mutations of ATP7...,"Overall, our study indicated that the effect o...","Yes, bone marrow from blotchy mice is dispensa...",1.0,1.0,0.690724
5,Do the effects of ifenprodil on the activity o...,"[According to reports in the literature, more ...",The concomitant administration of certain comm...,Ifenprodil potentiated the antidepressant-like...,1.0,1.0,0.537249
6,Does concern for Lost Income Following Donatio...,[Some living kidney donors report lost income ...,Findings from the current study underscore how...,"Yes, concern for lost income following donatio...",1.0,1.0,0.701326
7,Is pseudomonas aeruginosa in CF and non-CF hom...,[For patients with cystic fibrosis (CF) Pseudo...,These findings implicate drains as important p...,"Yes, Pseudomonas aeruginosa is found predomina...",1.0,1.0,0.662766
8,Do obese patients with idiopathic pulmonary fi...,[Obese patients with idiopathic pulmonary fibr...,Our results suggest that obese patients who re...,"Yes, obese patients with idiopathic pulmonary ...",1.0,1.0,0.576158
9,Does introduction of an 8-aminooctanoic acid l...,[The purpose of this study was to examine the ...,High melanoma uptake and fast urinary clearanc...,"Yes, the introduction of an 8-aminooctanoic ac...",1.0,1.0,0.573848


In [12]:
import pandas as pd

# Column used to rank the rows.
SCORE_COLUMN = "sas_evaluator"

# Three highest-scoring rows followed by the three lowest-scoring ones.
top_3 = results_df.nlargest(n=3, columns=SCORE_COLUMN)
bottom_3 = results_df.nsmallest(n=3, columns=SCORE_COLUMN)
pd.concat((top_3, bottom_3))


Unnamed: 0,question,contexts,answer,predicted_answer,doc_mrr_evaluator,faithfulness,sas_evaluator
20,Do systematic Reviews Published in Emergency M...,[Publication bias compromises the validity of ...,Systematic reviews published in emergency medi...,"No, systematic reviews published in emergency ...",1.0,1.0,0.935371
19,Does glioma-derived versican promote tumor exp...,[Accumulation and infiltration of microglia/br...,Our results show that versican released from g...,"Yes, glioma-derived versican promotes tumor ex...",1.0,1.0,0.913509
2,Do apitherapy products enhance the recovery of...,[Our objective was to identify the antioxidant...,"Our results suggest that honey, propolis, poll...","Yes, apitherapy products, such as honeybee pro...",0.5,1.0,0.829854
18,Does surveillance provide insight into epidemi...,[Longer-term tuberculosis (TB) drug resistance...,There has been a significant reduction in bact...,"Yes, surveillance provides insight into the ep...",0.5,1.0,0.384376
5,Do the effects of ifenprodil on the activity o...,"[According to reports in the literature, more ...",The concomitant administration of certain comm...,Ifenprodil potentiated the antidepressant-like...,1.0,1.0,0.537249
21,Is p63 more sensitive and specific than 34βE12...,[Prostate cancer is the world's leading cause ...,Basal cell markers can help to distinguish pro...,The context does not provide a direct comparis...,1.0,0.666667,0.544149
