<a href="https://colab.research.google.com/github/LucasVD23/Medical_QA_with_LLM/blob/main/medqa_with_multiagents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import json
import os
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Defining important funcions and classes

## RAG


In [2]:

def get_textbooks(dataset_path):
    textbooks = []
    for book in os.listdir(dataset_path + 'textbooks/en/'):
        file = open(dataset_path + 'textbooks/en/' + book, 'r')
        textbooks.append('\n'.join(file.readlines()))

    return textbooks


In [3]:
class RAG():
    def __init__(self, model_name, vector_store_path) -> None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model_kwargs = {'device': device}
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs)
        self.vector_store_path = vector_store_path


    def create_vectorstore(self,knowledge_base_path, text_splitter = None):

        if text_splitter is None:
          text_splitter = SemanticChunker(self.embedding_model)
        textbooks = get_textbooks(knowledge_base_path)
        chunks = text_splitter.create_documents(textbooks)

        faiss_store  = FAISS.from_documents(chunks, embedding=self.embedding_model)
        faiss_store.save_local(self.vector_store_path)

    def retrieve_documents(self, query, num_docs = 5):
        faiss_store = FAISS.load_local(self.vector_store_path,
                                self.embedding_model,
                                allow_dangerous_deserialization = True)

        similar_docs = faiss_store.similarity_search(query, k=num_docs)
        similar_docs = [doc.page_content for doc in similar_docs]

        return similar_docs

## OpenAI API and RAGAS Metrics


In [65]:
from abc import ABC, abstractmethod
import re
from openai import OpenAI


client = OpenAI()

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Write a haiku about recursion in programming."
        }
    ]
)

class GPT():
  def __init__(self, model = 'gpt-4o-mini'):

    self.client = OpenAI()
    self.model = model

  def get_answer(self, prompt):
    response =  client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": prompt
        }
        ]
    )
    return response.choices[0].message.content.strip()
gpt4o = GPT()

#Classe pai que as métricas irão herdar
class Metric(ABC):

    def __init__(self, model):
        self.model = model

    #Obriga a implementação do evaluate
    @abstractmethod
    def evaluate(self, question, context, answer):
        raise NotImplementedError

class Faithulness(Metric):
    def __init__(self, model):  # Add model as an argument
        super().__init__(model)  # Pass the model to the parent class

    def __process_statements(self, statements):
        """
        Função para extrair afirmações da LLM
        """
        #Expressão regular para capturar statemnts
        pattern = r"Statement \d+:\s*(.+)"

        matches = re.findall(pattern, statements)

        #Caso a resposta não esteja de acordo com o formato especificado
        if not matches:
          print(f"No statements found in the response: {statements}")

        return matches

    def __construct_verdict_prompt(self, statement_list, context):
        prompt = (
            f"Consider the given context and following\n"
            f"statements, then determine whether they\n"
            f"are supported by the information present\n"
            f"in the context. Provide a brief explanation for each statement before arriving\n"
            f"at the verdict (Yes/No). Provide a final\n"
            f"verdict for each statement in order at the\n"
            f"end in the given format. Do not deviate\n"
            f"from the specified format.\n\n"
            f"Context: {context}\n\n"
        )

        #Adiciona as afirmações no prompt
        for i, statement in enumerate(statement_list, 1):
            prompt += f"Statement {i}: {statement}\n"
        #Adiciona o formato de veredito
        prompt += "\nFinal verdict format:\n"
        for i in range(1, len(statement_list) + 1):
            prompt += f"Verdict {i}: Yes/No\n"

        return prompt

    def __process_verdicts(self, verdicts):
        """
        Função para processar as respostas com vereditos
        """
        #Expressão regular para capturar vereditos
        verdict_pattern = re.findall(r"Verdict \d+:\s*(Yes|No)", verdicts)

        #Converte vereditos em 1 e 0
        binary_verdicts = [1 if v == 'Yes' else 0 for v in verdict_pattern]

        if not binary_verdicts:
            print(f"No verdicts found in the response: {verdicts}")
            return 0

        return sum(binary_verdicts) / len(binary_verdicts)

    def evaluate(self, question, answer, context):
        #Primeiro prompt para pegar as afirmações
        get_statements_prompt = f"""
        Given a question and answer, create one
        or more statements from each sentence
        in the given answer.

        Return the response in the following format:

        Statement 1: statement_1
        ...
        Statement n: statement_n

        Do not deviate from the specified format.

        question: {question}
        answer: {answer}
        """
        #Uso do modelo para capturar os statements
        statements = self.model.get_answer(get_statements_prompt)
        #Processametno das saidas
        processed_statements = self.__process_statements(statements)

        #Construir o segundo prompt
        faithfulness_prompt = self.__construct_verdict_prompt(processed_statements, context)
        #Vereditos da LLM
        verdicts = self.model.get_answer(faithfulness_prompt)

        average_score = self.__process_verdicts(verdicts)

        return average_score



class AnswerRelevance(Metric):
    def __init__(self, model):
        super().__init__(model)
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

    def evaluate(self, question, answer, n=5):

        # Gera N questões a partir da resposta
        generated_questions = self.__generate_questions_from_answer(answer, n)

        #Calcula os emebeddings
        original_question_embedding = self.embedding_model.encode(question, convert_to_tensor=True)
        generated_question_embeddings = self.embedding_model.encode(generated_questions, convert_to_tensor=True)

        #Calcula similaridade por cosseno
        similarities = self.__calculate_similarities__(original_question_embedding, generated_question_embeddings)

        #Média das similaridades
        average_relevance_score = torch.mean(torch.tensor(similarities)).item()

        return average_relevance_score

    def __generate_questions_from_answer(self, answer, n):
        generated_questions = []
        for _ in range(n):
            #Prompt de geração das perguntas
            generate_question_prompt = f"Generate a question for the given answer.\nanswer: {answer}"
            question = self.model.get_answer(generate_question_prompt)
            generated_questions.append(question.strip())

        return generated_questions

    def __calculate_similarities__(self, original_embedding, generated_embeddings):

        original_embedding = original_embedding.unsqueeze(0)

        similarities = F.cosine_similarity(generated_embeddings, original_embedding, dim=1).tolist()

        return similarities

class ContextRelevance(Metric):
    def __init__(self, model):
        super().__init__(model)

    def evaluate(self, question, context):
        # Calculate total sentences across all documents in the context
        total_sentences = sum(len(self.__split_into_sentences(doc)) for doc in context)

        # Extract relevant sentences from the context
        extracted_sentences = self.__extract_relevant_sentences(question, context)

        # Calculate relevance score as the ratio of relevant sentences to total sentences
        if "Insufficient Information" in extracted_sentences:
            relevance_score = 0.0
        else:
            relevance_score = len(extracted_sentences) / total_sentences

        return relevance_score, extracted_sentences

    def __extract_relevant_sentences(self, question, context):
        # Join all documents into a single text for extraction
        context_text = " ".join(context)

        # Prompt for sentence extraction
        extract_prompt = f"""
        Please extract relevant sentences from the provided context that can potentially help answer the following question.
        If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information".
        While extracting candidate sentences, you’re not allowed to make any changes to sentences from the given context.

        question: {question}
        context: {context_text}
        """

        extracted_sentences = self.model.get_answer(extract_prompt)

        if "Insufficient Information" in extracted_sentences:
            return ["Insufficient Information"]
        else:
            # Process extracted sentences into a list
            return self.__process_extracted_sentences(extracted_sentences)

    def __process_extracted_sentences(self, extracted_sentences):
        # Split extracted text into sentences, stripping whitespace
        list_of_sentences = [sentence.strip() for sentence in extracted_sentences.split('. ') if sentence]

        return list_of_sentences

    def __split_into_sentences(self, document):
        return [sentence.strip() for sentence in document.split('. ') if sentence]




## Multiagents

In [None]:
!pip install -U langgraph

Collecting langgraph
  Downloading langgraph-0.2.45-py3-none-any.whl.metadata (15 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.0.0 (from langgraph)
  Downloading langgraph_checkpoint-2.0.2-py3-none-any.whl.metadata (4.6 kB)
Collecting langgraph-sdk<0.2.0,>=0.1.32 (from langgraph)
  Downloading langgraph_sdk-0.1.35-py3-none-any.whl.metadata (1.8 kB)
Downloading langgraph-0.2.45-py3-none-any.whl (119 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.3/119.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langgraph_checkpoint-2.0.2-py3-none-any.whl (23 kB)
Downloading langgraph_sdk-0.1.35-py3-none-any.whl (28 kB)
Installing collected packages: langgraph-sdk, langgraph-checkpoint, langgraph
Successfully installed langgraph-0.2.45 langgraph-checkpoint-2.0.2 langgraph-sdk-0.1.35


In [6]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from typing import TypedDict, Optional
from langgraph.graph import StateGraph, END


In [13]:
from typing import TypedDict, Optional

class GraphState(TypedDict):
    original_question: Optional[str] =  None,
    options: Optional[str] = None,
    rephrased_question: Optional[str] =  None,
    rag_information: Optional[str] = None,
    cot_output: Optional[str] = None,
    faith_score: Optional[float] = None

In [8]:
def rephraser_agent(state):
    llm_caller = GPT()

    query_prompt = """Rephrase the question using medical terminology to abstract the patient's specific symptoms and conditions
Question: {}"""

    original_question = state.get('original_question', '').strip()
    rephrased_question = state.get('rephrased_question', '').strip()
    options = state.get('options', '').strip()
    rag_information = state.get('rag_information', '').strip()
    cot_output = state.get('cot_output', '')
    faith_score = state.get('faith_score', '')

    rephrased_question = llm_caller.get_answer(query_prompt.format(original_question))


    return {
            'original_question' : original_question,
            'rephrased_question' : rephrased_question,
            'options': options,
            'rag_information' : rag_information,
            'cot_output' : cot_output,
            'faith_score' : faith_score
    }

In [9]:
def cot_agent(state):
    llm_caller = GPT()

    query_prompt = """Your role is to be a medical assistant for performing Q&A
in the medical domain. Given the following question and options, return the
correct option thinking step-by-step on how to get to the final answer.
Return the answer in the following format:
Answer: (option from A to E)"""


    original_question = state.get('original_question', '').strip()
    rephrased_question = state.get('rephrased_question', '').strip()
    options = state.get('options', '').strip()
    rag_information = state.get('rag_information', '').strip()
    cot_output = state.get('cot_output', '')
    faith_score = state.get('faith_score', '')


    question = rephrased_question if rephrased_question != '' else original_question

    if rag_information != ''.strip():
      query_prompt += """\nUse the following context to answer the question: {}
Question: {}
Options:{}
      """.format(rag_information, question, options)
    else:
      query_prompt += """
Question: {}
Options:{}
      """.format(question, options)

    cot_output = llm_caller.get_answer(query_prompt)

    return {
            'original_question' : original_question,
            'rephrased_question' : rephrased_question,
            'options': options,
            'rag_information' : rag_information,
            'cot_output' : cot_output,
            'faith_score' : faith_score
    }



In [10]:
def rag_agent(state):
    llm_caller = GPT()

    relevant_prompt = """Given the following question and documents, extract only the documents that are relevant to the question
Question: {}
Documents: {}
    """

    useful_prompt = """Given the follow extract only the segments that are useful to the question
Question: {}
Documents: {}
"""


    original_question = state.get('original_question', '').strip()
    rephrased_question = state.get('rephrased_question', '').strip()
    options = state.get('options', '').strip()
    rag_information = state.get('rag_information', '').strip()
    cot_output = state.get('cot_output', '')
    answer_idx = state.get('answer_idx', '')
    faith_score = state.get('faith_score', '')

    rag_docs = rag_module.retrieve_documents(original_question, num_docs = 5)

    rag_information = "\n".join(str(item) for item in rag_docs)

    relevant = llm_caller.get_answer(relevant_prompt.format(original_question,
                                                                   rag_information))

    context = llm_caller.get_answer(useful_prompt.format(original_question,
                                                    relevant))
    return {
            'original_question' : original_question,
            'rephrased_question' : rephrased_question,
            'options': options,
            'rag_information' : context,
            'cot_output' : cot_output,
            'answer_idx' : answer_idx,
            'faith_score' : faith_score

    }

In [11]:
def faithfulness_agent(state):
    faithfulness_llm = Faithulness(GPT())

    original_question = state.get('original_question', '').strip()
    rephrased_question = state.get('rephrased_question', '').strip()
    options = state.get('options', '').strip()
    rag_information = state.get('rag_information', '').strip()
    answer = state.get('answer', '')
    faith_score = state.get('faith_score', '')

    question_plus_options = f'{original_question}\n{options}'
    faith_score = faithfulness_llm.evaluate(question_plus_options, answer, rag_information)

    return {
            'original_question' : original_question,
            'rephrased_question' : rephrased_question,
            'options': options,
            'rag_information' : rag_information,
            'answer' : answer,
            'faith_score' : faith_score

    }

In [None]:
medqa_workflow = StateGraph(GraphState)

medqa_workflow.add_node("rephraser_agent", rephraser_agent)
medqa_workflow.add_node("cot_agent", cot_agent)
medqa_workflow.add_node("rag_agent", rag_agent)
medqa_workflow.add_node("faithfulness_agent", faithfulness_agent)

medqa_workflow.set_entry_point("rephraser_agent")
medqa_workflow.add_edge("rephraser_agent","rag_agent")
medqa_workflow.add_edge("rag_agent","cot_agent")
medqa_workflow.add_edge("cot_agent","faithfulness_agent")
medqa_workflow.add_edge("faithfulness_agent",END)


<langgraph.graph.state.StateGraph at 0x786042cd2770>

In [None]:
state = {
    'original_question' : train_subset[0]['question'],
    'rephrased_question' : '',
    'options':str(train_subset[0]['options']),
    'rag_information' : '',
    'cot_output' : '',
    'faith_score': 0
}

In [None]:
app = medqa_workflow.compile()
conversation = app.invoke(state)

In [None]:
conversation

## Strategy Testing functions

In [66]:
def test_model_without_agents(test_set,llm):
    query_prompt = """Given the following question and options, return the answer in the following format:
    Answer: (option from A to E)

    Question: {}
    Options:{}"""

    test_results = {'questions':[],
                    'options':[],
                    'answer_idx': [],
                    'llm_answer': []}

    for item in test_set:
        question = item['question']
        options = item['options']
        answer_idx = item['answer_idx']



        llm_answer = llm.get_answer(query_prompt.format(question, options))
        print(query_prompt.format(question, options))
        print(llm_answer)
        test_results['questions'].append(question)
        test_results['options'].append(options)
        test_results['answer_idx'].append(answer_idx)
        test_results['llm_answer'].append(llm_answer.split('Answer:')[1].strip())

    return test_results

In [None]:

from tqdm import tqdm
def load_checkpoint(filepath):
    """Loads checkpoint data from CSV and returns it as a dictionary."""
    df = pd.read_csv(filepath)
    return {
        'questions': df['questions'].tolist(),
        'options': df['options'].tolist(),
        'answer_idx': df['answer_idx'].tolist(),
        'llm_answer': df['llm_answer'].tolist(),
        'cot_output': df['cot_output'].tolist(),
        'faithfulness': df['faithfulness'].tolist(),
        'rag_information': df['rag_information'].tolist()
    }, len(df)


def test_workflow(workflow, test_set, test_type='all', checkpoint_path = None):
    # Load last checkpoint if available
    if checkpoint_path is not None:
        print(f"Resuming from checkpoint: {checkpoint_path}")
        test_results, start_index = load_checkpoint(checkpoint_path)

    else:
        print("Starting a new experiment.")
        test_results = {
            'questions': [],
            'options': [],
            'answer_idx': [],
            'llm_answer': [],
            'cot_output': [],
            'faithfulness': [],
            'rag_information': []
        }
        start_index = 0

    app = workflow.compile()
    checkpoint_interval = max(1, len(test_set) // 10)  # Calculate 10% interval

    # Continue from the last processed sample
    for i, item in enumerate(tqdm(test_set[start_index:], desc="Testing workflow"), start=start_index + 1):
        question = item['question']
        options = item['options']
        answer_idx = item['answer_idx']
        state = {
            'original_question': question,
            'rephrased_question': '',
            'options': str(options),
            'rag_information': '',
            'cot_output': '',
            'faith_score': 0
        }

        conversation = app.invoke(state)
        llm_answer = conversation['cot_output'].split('Answer:')[-1].strip()

        test_results['questions'].append(question)
        test_results['options'].append(options)
        test_results['answer_idx'].append(answer_idx)
        test_results['llm_answer'].append(llm_answer)
        test_results['cot_output'].append(conversation['cot_output'])
        test_results['rag_information'].append(conversation.get('rag_information', ''))
        test_results['faithfulness'].append(conversation['faith_score'])

        # Save checkpoint every 10% of the process
        if i % checkpoint_interval == 0:
            df = pd.DataFrame(test_results)
            df.to_csv(f'test_results_checkpoint_{test_type}_{i // checkpoint_interval}.csv', index=False)

    df = pd.DataFrame(test_results)
    df.to_csv(f'test_results_final_{test_type}.csv', index=False)

    return test_results

# Tests

## Testing Chunking Strategies

In [None]:
pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [4]:
model_name = "sentence-transformers/all-mpnet-base-v2"
path = 'medical_textbooks/'
rag_module = RAG(model_name, path)

  self.embedding_model = HuggingFaceEmbeddings(


In [None]:
dataset_path = 'data_clean/'

rag_module.create_vectorstore(dataset_path)

In [None]:
query = "What substances are nonresearch use illegal under federal law?"

rag_module.retrieve_documents(query, 2)

['(All nonresearch use illegal under federal law.)\n\n\n\nFlunitrazepam (Rohypnol) Narcotics:\n\n\n\nHallucinogens:\n\n\n\nLSD MDA, STP, DMT, DET, mescaline, peyote, bufotenine, ibogaine, psilocybin, phencyclidine (PCP; veterinary drug only) (No telephone prescriptions, no refills.)2\n\n\n\nOpioids: Opium: Opium alkaloids and derived phenanthrene alkaloids: codeine, morphine (Avinza, Kadian, MSContin, Roxanol), hydrocodone and hydrocodone combinations (Zohydro ER, Hycodan, Vicodin, Lortab), hydromorphone (Dilaudid), oxymorphone (Exalgo), oxycodone (dihydroxycodeinone, a component of Oxycontin, Percodan, Percocet, Roxicodone, Tylox)\n\n\n\nDesignated synthetic drugs: meperidine (Demerol), methadone, levorphanol (Levo-Dromoran), fentanyl (Duragesic, Actiq, Fentora), alfentanil (Alfenta), sufentanil (Sufenta), remifentanil (Ultiva), tapentadol (Nycynta)\n\n\n\nStimulants: Coca leaves and cocaine Amphetamines: Amphetamine complex (Biphetamine), Amphetamine salts (Adderall), Dextroamphetami

In [None]:

model_name = "sentence-transformers/all-mpnet-base-v2"
path_recursive = 'medical_textbooks_recursive/'
rag_module_recursive = RAG(model_name, path_recursive)





In [None]:
dataset_path = 'data_clean/'
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
rag_module_recursive.create_vectorstore(dataset_path, text_splitter = text_splitter)

In [None]:
query = "What substances are nonresearch use illegal under federal law?"

rag_module_recursive.retrieve_documents(query, 2)

['(All nonresearch use illegal under federal law.)\n\n\n\nFlunitrazepam (Rohypnol) Narcotics:\n\n\n\nHallucinogens:\n\n\n\nLSD MDA, STP, DMT, DET, mescaline, peyote, bufotenine, ibogaine, psilocybin, phencyclidine (PCP; veterinary drug only) (No telephone prescriptions, no refills.)2\n\n\n\nOpioids: Opium: Opium alkaloids and derived phenanthrene alkaloids: codeine, morphine (Avinza, Kadian, MSContin, Roxanol), hydrocodone and hydrocodone combinations (Zohydro ER, Hycodan, Vicodin, Lortab), hydromorphone (Dilaudid), oxymorphone (Exalgo), oxycodone (dihydroxycodeinone, a component of Oxycontin, Percodan, Percocet, Roxicodone, Tylox)\n\n\n\nDesignated synthetic drugs: meperidine (Demerol), methadone, levorphanol (Levo-Dromoran), fentanyl (Duragesic, Actiq, Fentora), alfentanil (Alfenta), sufentanil (Sufenta), remifentanil (Ultiva), tapentadol (Nycynta)',
 'Stimulants:\n\n\n\nDepressants:\n\n\n\nSchedule II barbiturates in mixtures with noncontrolled drugs or in suppository dosage form Ba

In [None]:
context_relevance_metric = ContextRelevance(gpt4o)

In [None]:
#Criando subset de treino
path_questions ='data_clean/questions/US/train.jsonl'
with open(path_questions, 'r') as file:
    train_set = [json.loads(line) for line in file]
train_set[0]

import random
random.seed(15)
random.shuffle(train_set)


train_subset = train_set[:200]

In [None]:
train_subset

[{'question': 'A 75-year-old man is brought to the emergency department for a 5-day-history of worsening dyspnea, orthopnea, and lower leg swelling. He has a history of hypertension, hyperlipidemia, non-alcoholic fatty liver disease, and myocardial infarction 10 years ago. Current medications include metoprolol, lisinopril, ethacrynic acid, eplerenone, and aspirin. He drinks 1 beer daily. He has a 30-pack-year smoking history. He is allergic to sulfonamides. His temperature is 37.0°C (98.6°F), his pulse is 120/min, and his blood pressure is 120/80 mm Hg. Physical examination reveals jugular venous distention and 3+ pitting edema in his lower legs. Crackles are heard at both lung bases. The point of maximal impulse is 2 cm to the left of the midclavicular line in the 6th intercostal space. Which of the following additional findings would be most strongly associated with increased mortality?',
  'answer': 'Decreased serum Na+',
  'options': {'A': 'Decreased BNP levels',
   'B': 'Decrease

In [None]:
def convert_options_to_string(d):
    result = ""
    for key, value in d.items():
        if value:
            result += f"{key}:{value}\n"
        else:
            result += f"{key}:\n"
    return result


In [None]:
train_subset[0]

{'question': 'A 75-year-old man is brought to the emergency department for a 5-day-history of worsening dyspnea, orthopnea, and lower leg swelling. He has a history of hypertension, hyperlipidemia, non-alcoholic fatty liver disease, and myocardial infarction 10 years ago. Current medications include metoprolol, lisinopril, ethacrynic acid, eplerenone, and aspirin. He drinks 1 beer daily. He has a 30-pack-year smoking history. He is allergic to sulfonamides. His temperature is 37.0°C (98.6°F), his pulse is 120/min, and his blood pressure is 120/80 mm Hg. Physical examination reveals jugular venous distention and 3+ pitting edema in his lower legs. Crackles are heard at both lung bases. The point of maximal impulse is 2 cm to the left of the midclavicular line in the 6th intercostal space. Which of the following additional findings would be most strongly associated with increased mortality?',
 'answer': 'Decreased serum Na+',
 'options': {'A': 'Decreased BNP levels',
  'B': 'Decreased se

In [None]:
train_subset[0]['question'] + '\n'+ convert_options_to_string(train_subset[0]['options'])

'A 75-year-old man is brought to the emergency department for a 5-day-history of worsening dyspnea, orthopnea, and lower leg swelling. He has a history of hypertension, hyperlipidemia, non-alcoholic fatty liver disease, and myocardial infarction 10 years ago. Current medications include metoprolol, lisinopril, ethacrynic acid, eplerenone, and aspirin. He drinks 1 beer daily. He has a 30-pack-year smoking history. He is allergic to sulfonamides. His temperature is 37.0°C (98.6°F), his pulse is 120/min, and his blood pressure is 120/80 mm Hg. Physical examination reveals jugular venous distention and 3+ pitting edema in his lower legs. Crackles are heard at both lung bases. The point of maximal impulse is 2 cm to the left of the midclavicular line in the 6th intercostal space. Which of the following additional findings would be most strongly associated with increased mortality?\nA:Decreased BNP levels\nB:Decreased serum Na+\nC:Decreased QRS complex duration\nD:Increased VO2\nE:Increased 

In [None]:
train_set_results = {
    'questions': [],
    'contexts_semantic': [],
    'extracted_sentences_semantic':[],
    'context_relevance_semantic':[],
    'contexts_recursive':[],
    'extracted_sentences_recursive':[],
    'context_relevance_recursive':[],
}

for item in train_subset:
    question = f'{item["question"]}\n{convert_options_to_string(item["options"])}'

    #Avaliando retriever semantico
    context_semantic = rag_module.retrieve_documents(question, num_docs = 2)
    context_relevance_semantic, extracted_sentences_semantic = context_relevance_metric.evaluate(question, context_semantic)

    #Avaliando Retriever recursivo
    context_recursive = rag_module_recursive.retrieve_documents(question, num_docs = 2)
    context_relevance_recursive,  extracted_sentences_recursive = context_relevance_metric.evaluate(question, context_recursive)

    train_set_results['questions'].append(question)

    train_set_results['contexts_semantic'].append(context_semantic)
    train_set_results['extracted_sentences_semantic'].append(extracted_sentences_semantic)
    train_set_results['context_relevance_semantic'].append(context_relevance_semantic)

    train_set_results['contexts_recursive'].append(context_recursive)
    train_set_results['extracted_sentences_recursive'].append(extracted_sentences_recursive)
    train_set_results['context_relevance_recursive'].append(context_relevance_recursive)


In [None]:
results_df = pd.DataFrame.from_dict(train_set_results)
results_df

Unnamed: 0,questions,contexts_semantic,extracted_sentences_semantic,context_relevance_semantic,contexts_recursive,extracted_sentences_recursive,context_relevance_recursive
0,A 75-year-old man is brought to the emergency ...,[The pain is accompanied by dyspnea (shortness...,[Insufficient Information],0.000000,[T AB LE 2.1 -2. Supraventricular Tachyarrhyt...,[Insufficient Information],0.00
1,A 72-year-old man with a 4-year history of Par...,[Central Nervous System Disorders 1. Parkinson...,"[1, ""With continued treatment, however, the du...",0.093750,"[When symptomatic treatment becomes necessary,...",[Insufficient Information],0.00
2,A 5-week-old male infant is brought to the Eme...,[The chest should be auscultated for evidence ...,"[1, ""Pyloric stenosis occurs in the first mont...",0.178571,"[In neonates with true vomiting, congenital ob...","[""Pyloric stenosis occurs in the first months ...",0.10
3,A 65-year-old man presents to his primary care...,[479e. Diseases with fever and rash may be cla...,[Insufficient Information],0.000000,"[The age of the patient, onset, duration, prog...",[Insufficient Information],0.00
4,A 60-year-old woman is brought to the emergenc...,[One of our patients had intermittent seizures...,[Insufficient Information],0.000000,[Cerebral edema is the accumulation of excess ...,[Insufficient Information],0.00
...,...,...,...,...,...,...,...
195,A 33-year-old female presents with recent onse...,[D. May slowly progress to renal failure\n\n\n...,"[1, ""Infection of the kidney""\n2, ""Presents wi...",0.095238,"[Culture-greater than 100,000 colony forming u...","[1, ""A, Infection of the kidney 1, Usually due...",0.20
196,A 25-year-old man presents with a nodule on hi...,[Subcutaneous nodular lesions have also been i...,[Insufficient Information],0.000000,[Hepatosplenomegaly and lymphadenopathy are ea...,[Insufficient Information],0.00
197,A molecular biologist is studying the roles of...,[The\n\n\n\nNav1.5 SCN5A\n\n\n\nCav1.2 CACNA1C...,[Insufficient Information],0.000000,[0 1.5 3 4.5 10 msec pA Channel #1 current Cha...,"[During the action potential plateau, Ca++ ent...",0.25
198,A 37-year-old obese woman presents to the neur...,[A 35-year-old woman comes to her physician co...,[Insufficient Information],0.000000,[syndrome is commonly associated with pregnanc...,[Insufficient Information],0.00


In [None]:
results_df['questions'].iloc[0]

'A 75-year-old man is brought to the emergency department for a 5-day-history of worsening dyspnea, orthopnea, and lower leg swelling. He has a history of hypertension, hyperlipidemia, non-alcoholic fatty liver disease, and myocardial infarction 10 years ago. Current medications include metoprolol, lisinopril, ethacrynic acid, eplerenone, and aspirin. He drinks 1 beer daily. He has a 30-pack-year smoking history. He is allergic to sulfonamides. His temperature is 37.0°C (98.6°F), his pulse is 120/min, and his blood pressure is 120/80 mm Hg. Physical examination reveals jugular venous distention and 3+ pitting edema in his lower legs. Crackles are heard at both lung bases. The point of maximal impulse is 2 cm to the left of the midclavicular line in the 6th intercostal space. Which of the following additional findings would be most strongly associated with increased mortality?\nA:Decreased BNP levels\nB:Decreased serum Na+\nC:Decreased QRS complex duration\nD:Increased VO2\nE:Increased 

In [None]:
results_df['contexts_semantic'].iloc[0]

['The pain is accompanied by dyspnea (shortness of breath), diaphoresis (sweating), and nausea. Focused History: BJ reports episodes of exertional chest pain in the last few months, but they were less severe and of short duration. He smokes (2–3 packs per day), drinks alcohol only rarely, eats a “typical” diet, and walks with his wife most weekends. His blood pressure has been normal. Family history reveals that his father and paternal aunt died of heart disease at age 45 and 39 years, respectively. His mother and younger (age 31 years) brother are said to be in good health. Physical Examination (Pertinent Findings): BJ is pale and clammy and is in distress due to chest pain. Blood pressure and respiratory rate are elevated. Lipid deposits are noted on the periphery of his corneas (corneal arcus; see left image) and under the skin on and around his eyelids (xanthelasmas; see right image). No deposits on his tendons (xanthomas) are detected. Pertinent Test Results: BJ’s electrocardiogra

In [None]:
results_df['contexts_recursive'].iloc[0]

['T AB LE 2.1 -2.  Supraventricular Tachyarrhythmias\n\n\n\nAtrial Sinus tachycardia Normal physiologic response to fear, pain, and exercise. Can also be 2° to hyperthyroidism, volume contraction, infection, or pulmonary embolism. Palpitations, shortness of breath. Ventricular rate > 100 bpm; normal P waves before every QRS complex. Treat the underlying cause. Atrial f brillation (AF) Acute AF—   PIRATES: Pulmonary disease Ischemia Rheumatic heart disease Anemia/Atrial myxoma Thyrotoxicosis Ethanol Sepsis Chronic AF— hypertension, CHF. Often asymptomatic, but may present with shortness of breath, chest pain, or palpitations. Physical exam reveals irregularly irregular pulse. No discernible P waves, with variable and irregular QRS response.\n\n\n\nEstimate risk of stroke using CHAD2 score.\n\n\n\nAnticoagulation if > 48 hours (to prevent CVA); rate control (CCBs, β-blockers, digoxin, amiodarone).',
 'PART 2 Cardinal Manifestations and Presentation of Diseases\n\n\n\nVital Signs Signific

In [None]:
results_df.to_csv('retriever_context_relevance.csv', index = None)

In [None]:
results_df.head(15)

Unnamed: 0,questions,contexts_semantic,extracted_sentences_semantic,context_relevance_semantic,contexts_recursive,extracted_sentences_recursive,context_relevance_recursive
0,A 75-year-old man is brought to the emergency ...,[The pain is accompanied by dyspnea (shortness...,[Insufficient Information],0.0,[T AB LE 2.1 -2. Supraventricular Tachyarrhyt...,[Insufficient Information],0.0
1,A 72-year-old man with a 4-year history of Par...,[Central Nervous System Disorders 1. Parkinson...,"[1, ""With continued treatment, however, the du...",0.09375,"[When symptomatic treatment becomes necessary,...",[Insufficient Information],0.0
2,A 5-week-old male infant is brought to the Eme...,[The chest should be auscultated for evidence ...,"[1, ""Pyloric stenosis occurs in the first mont...",0.178571,"[In neonates with true vomiting, congenital ob...","[""Pyloric stenosis occurs in the first months ...",0.1
3,A 65-year-old man presents to his primary care...,[479e. Diseases with fever and rash may be cla...,[Insufficient Information],0.0,"[The age of the patient, onset, duration, prog...",[Insufficient Information],0.0
4,A 60-year-old woman is brought to the emergenc...,[One of our patients had intermittent seizures...,[Insufficient Information],0.0,[Cerebral edema is the accumulation of excess ...,[Insufficient Information],0.0
5,A 26-year-old male is brought into the emergen...,[Was MW in positive or negative nitrogen balan...,[Insufficient Information],0.0,[The plasma Na+ concentration on admission was...,[Insufficient Information],0.0
6,"A 31-year-old male with cirrhosis, dementia, a...",[■↓ ceruloplasmin and excessive deposition of ...,[■↓ ceruloplasmin and excessive deposition of ...,0.041096,[Correct answer = H. The patient has Wilson di...,"[""The patient has Wilson disease, an autosomal...",0.090909
7,"An experimental drug, ES 62, is being studied....","[−K\n\n\n\nThe higher the Km, the lower the af...",[Insufficient Information],0.0,[C. Volume of Distribution\n\n\n\nThe apparent...,[The volume of distribution is the proportiona...,0.125
8,Patient 1 – A 26-year-old woman presents to he...,"[Dunnwald LK, Rossing MA, Li CI. Hormone recep...",[Insufficient Information],0.0,[If the presence of breast cancer is strongly ...,[Insufficient Information],0.0
9,"A 45-year-old woman, gravida 3, para 2, at 18 ...",[1 107\n\n\n\nSCREENING AND DIAGNOSIS ...........,[Insufficient Information],0.0,"[Rood K, Markham KB: Torsion of a term gravid ...",[Insufficient Information],0.0


In [None]:
results_df['context_relevance_recursive'].mean()

0.0367118691851664

In [None]:
results_df['context_relevance_semantic'].mean()

0.015717990033766043

## Creating test set and subset

In [17]:


path_questions ='data_clean/questions/US/test.jsonl'
with open(path_questions, 'r') as file:
    test_set = [json.loads(line) for line in file]
test_set[0]

import random
random.seed(15)
random.shuffle(test_set)


test_subset = test_set[:50]

In [None]:
test_set[0]

{'question': 'A 32-year-old man with HIV comes to the physician because of a 2-month history of weight loss, night sweats, and productive cough. Auscultation of the lungs shows coarse crackles at the right upper posterior field. An x-ray of the chest shows an opacity in the right upper lobe. Sputum analysis shows acid-fast bacilli. A small amount of tuberculin fluid is injected into the subcutaneous tissue on the left forearm. Examination of the injected area 48 hours later shows no induration or erythema. Impairment of which of the following processes is the most likely cause of the negative tuberculin skin test seen in this patient?',
 'answer': 'Interaction of B7 and CD28 ligands',
 'options': {'A': 'Secretion of interferon-α',
  'B': 'Interaction of B7 and CD28 ligands',
  'C': 'Opsonization by complement proteins',
  'D': 'Secretion of interleukin-4',
  'E': 'Generation of reactive oxygen species'},
 'meta_info': 'step1',
 'answer_idx': 'B'}

## Testing GPT4o-mini without agents

In [67]:
llm = GPT()
results_without_agents = test_model_without_agents(test_set, llm)

Given the following question and options, return the answer in the following format:
    Answer: (option from A to E)

    Question: An investigator is studying the effects of different gastrointestinal regulatory substances. A healthy subject is asked to eat a meal at hour 0, and the pH of stomach contents and rate of stomach acid secretions are measured over the next 4 hours. Results of the study are shown. Which of the following mechanisms most likely contributes to the changes seen at point D in the graph?
    Options:{'A': 'Increased vagal stimulation', 'B': 'Increased activity of D cells', 'C': 'Increased activation of H2 receptors', 'D': 'Increased activity of enterochromaffin-like cells', 'E': 'Increased activity of I cells'}
To determine the mechanism contributing to the changes seen at point D in the study of gastrointestinal regulatory substances, we need to consider the physiological responses to a meal and the roles of the various cells and mechanisms listed in the options

In [None]:
sum([results_without_agents['answer_idx'][i] == results_without_agents['llm_answer'][i] for i in range(len(results_without_agents['questions']))])/len(results_without_agents['questions'])

0.6897093479968578

In [None]:
results_without_agents_df = pd.DataFrame.from_dict(results_without_agents)
results_without_agents_df.to_csv('results_without_agents.csv', index = None)

## Testing CoT Agent

In [None]:
re_write_workflow = StateGraph(GraphState)

re_write_workflow.add_node("cot_agent", cot_agent)


re_write_workflow.set_entry_point("cot_agent")
re_write_workflow.add_edge("cot_agent",END)



<langgraph.graph.state.StateGraph at 0x786060318340>

In [None]:
results_with_cot = test_workflow(re_write_workflow, test_set)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m

Let's analyze the options given:

- **A: Conjugation** - This is a process where bacteria transfer genetic material through direct cell-to-cell contact. It typically requires a conjugative plasmid and is not indicated as directly relevant in this scenario as there is no mention of direct cell interaction.

- **B: Reassortment** - This is a process seen typically in viruses, particularly segmented viruses. It does not apply to bacterial exchanges.

- **C: Transformation** - This process involves a bacterium taking up free DNA from its environment (which is what would happen with the lysed bacteria). This aligns with the scenario since the noninfectious bacteria could take up virulence genes from the infectious bacteria's lysed contents.

- **D: Generalized transduction** - This process involves transferring of DNA from one bacterium to another via a bacteriophage. There's no mention of phages in the scenario, thus

In [None]:
sum([results_with_cot['answer_idx'][i] == results_with_cot['llm_answer'][i] for i in range(len(results_with_cot['questions']))])/len(results_with_cot['questions'])

0.7195600942655145

In [None]:
results_with_cot_df = pd.DataFrame.from_dict(results_with_cot)
results_with_cot_df.to_csv('results_with_cot_all.csv', index = None)

## Testing with Rewrite + CoT agents

In [None]:
re_write_workflow = StateGraph(GraphState)

re_write_workflow.add_node("rephraser_agent", rephraser_agent)
re_write_workflow.add_node("cot_agent", cot_agent)


re_write_workflow.set_entry_point("rephraser_agent")
re_write_workflow.add_edge("rephraser_agent","cot_agent")
re_write_workflow.add_edge("cot_agent",END)



<langgraph.graph.state.StateGraph at 0x7f3ed39eb820>

In [None]:
results_with_rephrasal = test_workflow(re_write_workflow, test_set)

-----CoT Prompt-----
Your role is to be a medical assistant for performing Q&A
in the medical domain. Given the following question and options, return the
correct option thinking step-by-step on how to get to the final answer.
Return the answer in the following format:
Answer: (option from A to E)
Question: A 64-year-old male presents to his primary care provider with a chief complaint of bilateral knee arthralgia, characterized by exacerbation over several years, particularly noted during activities such as stair ascent and ambulation exceeding 100 yards. The patient describes the nociception intensifying with physical exertion and experiencing symptomatic relief via rest periods. He reports minimal matutinal stiffness lasting approximately 5-10 minutes upon awakening. The physical examination reveals tenderness on palpation of the medial tibiofemoral compartment of both knees, alongside crepitus and a limited range of motion at the extremes of flexion and extension. Both knee joints 

Testing workflow:  98%|█████████████████████████████████████████████████████████▋ | 1244/1273 [3:59:28<08:12, 16.98s/it]

-----CoT Output-------
To determine the correct option for further evaluation of the condition presented in the scenario, let’s analyze the symptoms and findings.

1. **Patient's Age and Symptoms**: The patient is a 64-year-old male with bilateral knee arthralgia that has worsened over several years. Symptoms are aggravated by physical activity and relieved by rest, which indicates a potential degenerative process rather than inflammatory arthritis.

2. **Morning Stiffness**: The patient reports only minimal morning stiffness lasting around 5-10 minutes, which is not typical for inflammatory arthritis conditions that usually present with longer periods of stiffness.

3. **Physical Examination Findings**: 
   - Tenderness primarily in the medial tibiofemoral compartment suggests an inner knee issue.
   - Presence of crepitus and limited range of motion at extremes of knee movement suggest degenerative changes.
   - Coolness of joints and bony enlargement at the medial joint line points 

Testing workflow:  98%|█████████████████████████████████████████████████████████▋ | 1245/1273 [3:59:39<07:06, 15.25s/it]

-----CoT Output-------
To determine the correct physiological consequence of the observed impaired renal perfusion in this patient, we first need to analyze the situation step-by-step:

1. **Patient Background**: The patient has a history of cardiovascular issues and high blood pressure, and his blood pressure is currently poorly controlled despite pharmacotherapy.

2. **Findings**: The renal duplex ultrasonography identifies a 90% stenosis of the right renal artery. This indicates a significant reduction in blood flow to this kidney, leading to impaired renal perfusion.

3. **Renal Physiology**: The kidneys regulate blood pressure through several mechanisms, primarily involving the renin-angiotensin-aldosterone system (RAAS). Impaired renal perfusion typically leads to activation of this system.

4. **Juxtaglomerular Apparatus**: The juxtaglomerular cells, found in the kidney, act in response to low perfusion pressure by secreting renin. Renin secretion increases in response to low re

Testing workflow:  98%|█████████████████████████████████████████████████████████▋ | 1246/1273 [3:59:49<06:05, 13.53s/it]

-----CoT Output-------
To determine the most likely etiological agent responsible for the clinical presentation of the 5-year-old male, we need to consider the symptoms and the specific characteristics of the lesions described:

1. **Symptoms**: The child presents with oral mucosal pain, vesicular lesions in the oral cavity (labial and buccal regions), and significant difficulty with oral intake. There is also fever (39.1°C) and irritability, which indicate a possible viral infection.

2. **Physical Examination Findings**: The presence of vesicular lesions on the tongue (lingual surface), gingival tissue, and labial area is particularly indicative of a herpes simplex virus (HSV) infection, specifically affecting the oral region.

3. **Associated Symptoms**: The noted cervical and submandibular lymphadenopathy further supports a viral infection, as this type of lymphadenopathy is commonly seen with infections such as those caused by herpes viruses.

4. **Differential Diagnosis of Option

Testing workflow:  98%|█████████████████████████████████████████████████████████▊ | 1247/1273 [4:00:00<05:31, 12.75s/it]

-----CoT Output-------
To determine the specific complication that this 3-year-old male patient is at the highest risk of developing, we need to analyze the provided clinical information:

1. **Symptom Overview**: The patient presents with lethargy, jaundice (icterus), and splenomegaly. These symptoms, particularly the jaundice and splenomegaly, suggest an underlying hematological issue.

2. **History of Infection**: The patient had an upper respiratory tract infection a week prior to his symptoms. In some cases, infections can trigger or exacerbate underlying conditions.

3. **Laboratory Findings**: 
   - The hemoglobin level of 9.4 g/dL indicates anemia.
   - The mean corpuscular hemoglobin concentration (MCHC) of 39% Hb/cell suggests a hyperchromic anemia.
   - A negative Coombs test indicates the absence of autoimmune hemolysis, which may point towards another cause of anemia.

4. **Peripheral Blood Smear**: The specific findings on the smear are not given, but the symptoms and lab

Testing workflow:  98%|█████████████████████████████████████████████████████████▊ | 1248/1273 [4:00:20<06:14, 14.97s/it]

-----CoT Output-------
To analyze the clinical presentation of the 57-year-old female and correlate it with the potential findings listed in the options, let's break down the aspects of her condition:

1. **Clinical Symptoms and History**: 
   - She has asthenia (weakness), pruritus (itching), and jaundice (scleral icterus).
   - The presence of xerostomia (dry mouth), soft abdomen, and hepatomegaly (enlarged liver) are important indicators.
   - The generalized excoriations may be related to itching.

2. **Laboratory Findings**:
   - Elevated total bilirubin with a high direct component suggests a conjugated hyperbilirubinemia, which aligns with issues related to liver function or bile flow.
   - Elevated alkaline phosphatase levels are significant for cholestasis or biliary obstruction.
   - AST and ALT levels are elevated, indicating liver inflammation.
   - Negative tests for Hepatitis B and C, coupled with positive Hepatitis B surface antibody, suggests prior vaccination or resolv

Testing workflow:  98%|█████████████████████████████████████████████████████████▉ | 1249/1273 [4:00:49<07:40, 19.18s/it]

-----CoT Output-------
To determine the correct answer regarding anticipated laboratory findings for the 23-year-old female patient after her acute hemorrhagic episode and anaphylactic response post-transfusion, we need to analyze the situation step-by-step.

1. **Patient Background**: The patient has no known comorbidities or hypersensitivities, which makes it less likely that she has a history of immune deficiencies impacting her immunoglobulins prior to this event.

2. **Symptoms After Transfusion**: The patient showed an anaphylactic response (increased irritability and respiratory distress) post-transfusion. This points to a potential allergic reaction, which can sometimes be associated with immunological abnormalities.

3. **Consideration of Immunodeficiency**: The attending physician suspects an undiagnosed immunodeficiency disorder, commonly involving specific immunoglobulin deficiencies.

4. **Immunoglobulin Classifications**: In patient immunology, deficiencies can often be c

Testing workflow:  98%|█████████████████████████████████████████████████████████▉ | 1250/1273 [4:01:05<07:02, 18.35s/it]

-----CoT Output-------
To determine the correct option regarding the characteristic finding associated with this patient’s condition, we need to analyze the information provided in the clinical scenario.

1. **Patient History and Condition**:
   - The patient is a 59-year-old male with a history of hepatic cirrhosis secondary to chronic alcoholism.
   - He presents with melena, jaundice, ascites, and neurological signs such as asterixis.
   - The testing shows altered liver enzymes and signs of portal hypertension (e.g., ascites).

2. **Understanding the Clinical Picture**:
   - The combination of hepatic cirrhosis, ascites, and asterixis suggests hepatic encephalopathy, which is a decline in brain function due to liver failure and the inability to detoxify ammonia.
   - Asterixis (flapping tremors) is specifically associated with hepatic encephalopathy.

3. **Reviewing the Options**:
   - **A: It carries a good prognosis**: This is generally not true for hepatic encephalopathy associa

Testing workflow:  98%|█████████████████████████████████████████████████████████▉ | 1251/1273 [4:01:19<06:14, 17.02s/it]

-----CoT Output-------
To determine the correct answer, we need to assess the potential complications related to the use of low-dose corticosteroid therapy in this patient with systemic lupus erythematosus (SLE). 

1. **Understanding the effects of corticosteroids**: Corticosteroids can lead to a variety of side effects, including osteoporosis, which increases the risk of fractures. Long-term corticosteroid use is particularly associated with an increased risk of fractures of the proximal femur (femoral neck fractures) and vertebral bodies, due to decreased bone mineral density.

2. **Analyzing the provided options**:
   - **A: Femoral neck fracture**: Given that corticosteroid therapy increases the risk of osteoporosis and subsequent femoral neck fractures, this is a plausible complication.
   - **B: Femoral shaft fracture**: Although possible, it is less commonly associated with corticosteroid therapy compared to femoral neck fractures.
   - **C: Meniscal tear**: This is generally mo

Testing workflow:  98%|██████████████████████████████████████████████████████████ | 1252/1273 [4:01:29<05:07, 14.66s/it]

-----CoT Output-------
To answer the question, we need to analyze the clinical scenario and each of the provided options carefully.

1. **Clinical Synopsis**: 
   - The patient is a 26-year-old male presenting with:
     - Intermittent fever (pyrexia)
     - Night sweats (nocturnal hyperhidrosis)
     - Significant weight loss (6 kg in 2 months)
     - Nontender lymphadenopathy (in cervical and supraclavicular regions)
     - Bilateral mediastinal masses seen on chest X-ray.

2. **Key Symptoms**: 
   - The combination of intermittent fever, night sweats, weight loss, and lymphadenopathy is suggestive of a possible hematologic malignancy, particularly a lymphoma (such as Hodgkin’s or Non-Hodgkin’s Lymphoma).

3. **Histopathological Evaluation**: 
   - The options suggest various pathological findings; we need to correlate these with the likely diagnosis of lymphoma given the symptoms and findings. 

4. **Option Analysis**:
   - **A: Cells staining positive for tartrate-resistant acid ph

Testing workflow:  98%|██████████████████████████████████████████████████████████ | 1253/1273 [4:01:43<04:53, 14.68s/it]

-----CoT Output-------
To determine the underlying condition contributing to the development of respiratory failure in the described patient, we should analyze the clinical details and laboratory findings step-by-step:

1. **Patient Presentation**: The patient is a 49-year-old female with acute onset of dyspnea and cough, low oxygen saturation, and a history of smoking, type 2 diabetes, and recent surgery. She shows signs of hypoxemia and has elevated vital signs indicating possible respiratory distress.

2. **Vital Signs**: The tachycardia (heart rate of 101 bpm), elevated blood pressure (155/80 mm Hg), and increased respiratory rate (31 breaths/min) along with her low-grade fever suggest a stress response likely due to respiratory distress or infection.

3. **Blood Gas Analysis**: The arterial blood gas analysis shows:
   - pH: 7.49 indicating alkalosis (likely respiratory since PaCO2 is low).
   - PaO2: 58 mm Hg indicates significant hypoxemia.
   - PaCO2: 30 mm Hg reflects respirat

Testing workflow:  99%|██████████████████████████████████████████████████████████ | 1254/1273 [4:01:54<04:17, 13.55s/it]

-----CoT Output-------
To determine the correct pharmacological mechanism of action for the likely medication administered to the patient described, we first need to analyze the clinical situation.

The patient is presenting with altered consciousness, hypovigilance, dysarthria, bradypnea, bradycardia, and miosis after taking oxycodone, which indicates an opioid overdose. Oxycodone is an opioid that primarily acts as a μ (mu) receptor agonist. Opioids can cause respiratory depression, sedation, and miosis (constricted pupils), which are well-documented effects of opioid toxicity.

In cases of opioid overdose, an opioid antagonist is commonly administered to reverse these symptoms. The most widely used opioid antagonist is naloxone (Narcan), which primarily acts as a μ receptor antagonist. This means it binds to the same μ receptors that opioids like oxycodone bind to, but instead of activating them, it blocks their effects, thus reversing sedation and respiratory depression.

By review

Testing workflow:  99%|██████████████████████████████████████████████████████████▏| 1255/1273 [4:02:05<03:49, 12.74s/it]

-----CoT Output-------
To determine the most plausible etiology for this patient's urinary incontinence, let's analyze the case step-by-step:

1. **Patient Profile**: The patient is a 35-year-old female with a history of urinary incontinence that started during physical activity and has progressed to occurring with laughter and coughing. This suggests a potential issue with pelvic support or control mechanisms rather than a bladder problem.

2. **Symptoms**: The patient reports:
   - Urinary incontinence during physical activity, laughter, and coughing
   - Increased urinary frequency
   - Nocturia (waking up at night to urinate)

3. **Examination Results**: Pelvic examination and urinalysis reveal no abnormalities. The urinary leakage was observed during a Valsalva maneuver, which indicates a likely stress component of incontinence.

4. **History and Risk Factors**: The obstetric history includes vaginal and cesarean deliveries, which may lead to pelvic floor weakness. The absence of 

Testing workflow:  99%|██████████████████████████████████████████████████████████▏| 1256/1273 [4:02:14<03:17, 11.64s/it]

-----CoT Output-------
To determine the correct answer, we need to analyze the patient's symptoms, history, and the findings from the CT scan.

1. **Patient Information**: The patient has a history of hypertension and atrial fibrillation, and has a long smoking history of 45 pack-years. He is currently on warfarin, enalapril, and amiodarone.

2. **Symptoms**: The patient presents with exertional dyspnea (shortness of breath with activity), progressive asthenia (weakness), and a persistent nonproductive cough that has lasted for 6 months.

3. **Physical Exam Findings**: The examination shows digital clubbing and dacryocystic nail deformities. Digital clubbing is often associated with certain chronic respiratory and cardiovascular conditions.

4. **CT Findings**: The CT scan shows subpleural cystic changes and reticular interstitial opacities located predominantly in the lung bases, suggesting interstitial lung disease. The cystic changes can indicate possible underlying pulmonary pathol

Testing workflow:  99%|██████████████████████████████████████████████████████████▎| 1257/1273 [4:02:25<03:00, 11.31s/it]

-----CoT Output-------
To determine the most appropriate next step in the management of this 75-year-old female patient, we need to analyze her presentation and medical history step-by-step.

1. **Presentation of Symptoms**: The patient presented with a transient episode of dysarthria and unilateral upper extremity weakness lasting about 30 minutes. This raises concern for a transient ischemic attack (TIA), which is characterized by temporary neurological deficits due to ischemia that resolve spontaneously.

2. **Medical History**: The patient has several risk factors for cerebrovascular disease including essential hypertension, atrial fibrillation, diabetes mellitus, and peripheral vascular disease. Atrial fibrillation is particularly important as it significantly increases the risk for thromboembolic strokes.

3. **Vital Signs and Neurological Examination**: The elevated blood pressure (184/111 mmHg) may indicate that the patient is experiencing significant stress or compensatory mec

Testing workflow:  99%|██████████████████████████████████████████████████████████▎| 1258/1273 [4:02:32<02:33, 10.23s/it]

-----CoT Output-------
To answer this question, we need to analyze the clinical scenario and the options provided.

The patient has:
- Bilateral pitting edema
- Nephrotic-range proteinuria (exceeding 3.5 grams in 24 hours)
- Suspicion of glomerular injury due to loss of polyanionic charge from the glomerular basement membrane

This scenario suggests the presence of nephrotic syndrome, which is often characterized by:
- Significant proteinuria
- Edema
- Hypoalbuminemia
- Hyperlipidemia

The key feature mentioned is the loss of polyanionic charge from the glomerular basement membrane, which is often associated with conditions like Minimal Change Disease. 

Now, let’s analyze the options:

A: WBC casts in the urine - This is typically associated with glomerulonephritis and inflammatory processes, not nephrotic syndrome.

B: RBC casts in the urine - This suggests glomerular bleeding or glomerulonephritis, not associated with nephrotic syndrome.

C: Selective albuminuria - This is a feature

Testing workflow:  99%|██████████████████████████████████████████████████████████▎| 1259/1273 [4:02:40<02:10,  9.32s/it]

-----CoT Output-------
To determine the most precise indicator of gestational age among the options provided, let's analyze each option step-by-step:

1. **Ratio of head to abdominal circumference (A):** This measurement can be used for assessing growth and development but is less precise in determining fetal gestational age in early pregnancy.

2. **Serum β-hCG levels (B):** While β-hCG levels can provide information about pregnancy (such as confirming that a person is pregnant), they do not correlate closely with gestational age since they can vary widely among individuals and over time.

3. **Femoral length (C):** This measurement is typically used in later weeks of pregnancy, but it is not as precise for determining gestational age early in pregnancy.

4. **Crown-rump length (D):** This is the measurement of the length from the top of the fetus's head (the crown) to the bottom of the buttocks (the rump). It is widely recognized as one of the most accurate measurements for assessing

Testing workflow:  99%|██████████████████████████████████████████████████████████▍| 1260/1273 [4:02:48<01:59,  9.17s/it]

-----CoT Output-------
To determine the New York Heart Association (NYHA) Functional Classification for heart failure for this patient, we need to analyze her symptoms and functional limitations in the context of the NYHA classification system.

The NYHA classification is based on the patient's level of physical activity and the related symptoms:

- **Class I**: No limitations. Regular physical activity does not cause undue fatigue, palpitations, or dyspnea.
- **Class II**: Slight limitation of physical activity. Comfortable at rest, but ordinary physical activity results in fatigue, palpitations, or dyspnea.
- **Class III**: Marked limitation of physical activity. Comfortable at rest, but less than ordinary activity causes fatigue, palpitations, or dyspnea.
- **Class IV**: Unable to carry out any physical activity without discomfort. Symptoms of heart failure are present at rest.

Now, let's analyze the information presented in the question:

1. The patient is experiencing a **signifi

Testing workflow:  99%|██████████████████████████████████████████████████████████▍| 1261/1273 [4:03:00<01:57,  9.82s/it]

-----CoT Output-------
To determine the correct clinical finding that is most likely present in this patient, we need to analyze the symptoms and laboratory findings provided in the scenario.

The patient is a 47-year-old male with:

1. Acute onset of bilateral ocular erythema and discomfort.
2. Visual disturbances and concern for visual impairment.
3. History of escalating lumbar discomfort for two months, characterized by morning stiffness and improvement with activity - suggesting a possible inflammatory spinal condition.
4. Bilateral retrocalcaneal discomfort, which could indicate an inflammatory process affecting the tendons or joints.
5. Elevated erythrocyte sedimentation rate (ESR) and C-reactive protein (CRP) levels, which are indicators of inflammation.

Considering these findings, there are a couple of conditions that might be indicated. The combination of ocular symptoms, back pain, and elevated inflammatory markers might suggest a spondyloarthritis, such as ankylosing spond

Testing workflow:  99%|██████████████████████████████████████████████████████████▍| 1262/1273 [4:03:13<02:00, 10.97s/it]

-----CoT Output-------
To determine the most appropriate diagnostic test that would corroborate the suspected diagnosis in this clinical scenario, let's analyze the key features of the case:

1. **Patient Demographics & Symptoms**: The patient is a 48-year-old Caucasian female who is experiencing progressive myopathy (muscle weakness) and dyspnea (shortness of breath). She has difficulty using her arms for daily activities, suggesting proximal muscle weakness.

2. **Clinical Findings**: The vital signs appear stable. The neurological examination shows reduced muscle strength in the deltoids, indicating weakness in upper extremity proximal muscles. The pulmonary examination shows dry crackles, which could indicate interstitial lung disease or pulmonary involvement.

3. **Skin Examination**: A notable finding is the erythematous rash localized to the malar regions, periorbital areas, and proximal limbs, which is characteristic of dermatomyositis, an inflammatory myopathy.

4. **Potential

Testing workflow:  99%|██████████████████████████████████████████████████████████▌| 1263/1273 [4:03:28<01:59, 11.96s/it]

-----CoT Output-------
To deduce the correct option for the underlying etiology contributing to the patient's symptoms and clinical findings, we will analyze the provided information step-by-step.

1. **Anosmia and Hyposmia**: The patient presents with anosmia (inability to smell) and hyposmia (reduced ability to smell). This suggests a problem with the olfactory system.

2. **Growth Trajectory**: The patient has a declining growth trajectory, moving from the 40th percentile in height to the 15th percentile. This indicates potential underlying issues with growth hormone or general health.

3. **Hypogonadism**: The patient shows signs of hypogonadism including sparse axillary and pubic hair and genital development consistent with Tanner stage 1. This points to a lack of or insufficient testosterone production, commonly linked with conditions that affect puberty onset and sexual maturation.

4. **Prior History**: The patient has a history of cryptorchidism for which he underwent bilatera

Testing workflow:  99%|██████████████████████████████████████████████████████████▌| 1264/1273 [4:03:48<02:10, 14.48s/it]

-----CoT Output-------
To determine which pharmacotherapeutic agent is most likely implicated in the hematological findings presented in the question, let's analyze the situation step-by-step.

1. **Initial Presentation**: The patient shows significant asthenia (fatigue) and has blood work indicating:
   - Anemia (hemoglobin at 10.4 g/dL)
   - Leukopenia (leukocyte count at 800 cells/mm³)
   - Thrombocytopenia (platelet count at 50,000 cells/mm³)

2. **New Pharmacological Agent**: After initiating a new agent, follow-up lab results show:
   - Hemoglobin increases slightly to 10.6 g/dL, indicating some improvement but still anemic.
   - Leukocytosis (leukocyte count increases to 2,000 cells/mm³), indicating an increase in white blood cells but not normal levels.
   - Platelet count slightly increases to 56,000 cells/mm³, still thrombocytopenic.

3. **Analyzing Hematological Changes**:
   - The increase in leukocyte count could suggest that the new agent might be stimulating white blood 

Testing workflow:  99%|██████████████████████████████████████████████████████████▋| 1265/1273 [4:03:58<01:44, 13.00s/it]

-----CoT Output-------
To determine the embryological origin of the tissue described in the case of the 2-year-old patient, we need to analyze the information given.

The key points to note are:
- The presence of a palpable left-sided abdominal mass in a young child.
- Histopathological analysis indicates the presence of primitive blastemal cells and immature nephron structures (which includes tubules and glomeruli).

Now let's analyze the options based on their embryological origins:

- **A: Carotid body** - This structure arises from the neural crest cells and is involved in chemoreception, but it is not related to renal tissues or nephron structures.
  
- **B: Adrenal medulla** - This is derived from neural crest cells as well and contributes to the development of the adrenal glands, specifically the inner part, but this is also not related to nephrons or renal tissues.
  
- **C: Thyroid gland** - It develops from endodermal tissue, specifically the foregut endoderm, and is not asso

Testing workflow:  99%|██████████████████████████████████████████████████████████▋| 1266/1273 [4:04:06<01:22, 11.72s/it]

-----CoT Output-------
To determine the correct option for the underlying pathophysiological mechanism associated with the patient’s primary symptom of retrosternal pyrosis (which indicates heartburn or acid reflux) exacerbated by ingestion, we need to analyze the condition and symptoms presented.

1. **Understanding Retrostenal Pyrosis**: This symptom typically arises due to gastroesophageal reflux disease (GERD), where contents of the stomach move back up into the esophagus, causing irritation and discomfort.

2. **Decreased Lower Esophageal Tone**: One of the most common causes of GERD is decreased tone or relaxation of the lower esophageal sphincter (LES). When the LES does not close properly, acid and other stomach contents can flow back into the esophagus, leading to pyrosis.

3. **Esophageal Fibrosis**: This would indicate a chronic change within the esophagus potentially leading to narrowing or obstruction rather than increased reflux, not typically the primary mechanism leadin

Testing workflow: 100%|██████████████████████████████████████████████████████████▋| 1267/1273 [4:04:20<01:13, 12.17s/it]

-----CoT Output-------
To determine the most appropriate subsequent intervention for the patient described, we need to assess his condition and the potential needs based on the recent laboratory results and provided options.

1. **Patient Assessment**:
   - The patient has a history of intravenous substance use and is currently on methadone, which can cause metabolic issues.
   - He was admitted with unresponsiveness and presents with hypotension, tachycardia, and tachypnea.
   - Laboratory results indicated significant concerns: hyperkalemia, metabolic acidosis, and severe hyperglycemia.

2. **Initial Interventions**:
   - The patient's vital signs and lab values have improved post-treatment, indicating interventions were successful in addressing some acute issues: normalization of potassium, slight improvement of bicarbonate, and resolution of hyperglycemia.

3. **Current Status**:
   - The patient's potassium has normalized (K+ 4.3 mEq/L), indicating that he no longer has a critical

Testing workflow: 100%|██████████████████████████████████████████████████████████▊| 1268/1273 [4:04:28<00:55, 11.15s/it]

-----CoT Output-------
To determine the correct pharmacological agent prescribed to the patient, we need to analyze the clinical context and the options provided.

1. **Patient Profile**: The patient is a 62-year-old male with a history of chronic bronchitis and a significant smoking history. He is experiencing an exacerbation of dyspnea and a productive cough with viscous sputum. Physical examination reveals increased anteroposterior chest diameter and bilateral coarse crackles in the inferior lung zones, suggesting obstructive lung disease.

2. **Mechanism of Action**: The question specifies that the therapeutic regimen involves a medication that directly inhibits the effects of parasympathetic stimulation on bronchial smooth muscle. This is indicative of an anticholinergic medication because parasympathetic stimulation promotes bronchoconstriction.

3. **Options Analysis**:
   - **A: Fluticasone** - This is a corticosteroid that reduces inflammation in the airways but does not prima

Testing workflow: 100%|██████████████████████████████████████████████████████████▊| 1269/1273 [4:04:38<00:42, 10.63s/it]

-----CoT Output-------
To determine the correct answer, we should analyze the clinical presentation and the options provided.

1. **Patient Profile**: The child is 16 months old, non-ambulatory, exclusively breastfed, with growth metrics indicating underweight and underlength for age. The dental findings are significant, showing enamel erosion and caries, and skeletal issues such as frontal bossing, widening of the wrists, genu varum, and rib notching.

2. **Clinical Signs**:
   - **Frontal Bossing** often indicates issues related to bone development, such as rickets or a collagen disorder.
   - **Widening of the wrists and genu varum** points towards conditions affecting the bone, possibly due to nutritional deficiencies or bone mineralization problems.
   - **Rib Notching** could suggest a chronic bone condition, often associated with osteoclast activity or metabolic bone diseases.

3. **Nutritional Considerations**: The child is exclusively breastfed, raising the possibility of defi

Testing workflow: 100%|██████████████████████████████████████████████████████████▊| 1270/1273 [4:04:50<00:33, 11.14s/it]

-----CoT Output-------
To solve this question, we need to analyze the provided clinical details and symptoms to determine the most suitable diagnostic test for the suspected dermatological condition.

1. **Patient History**: The patient is a 21-year-old male with a five-day history of pruritus and a hypopigmented dermatosis on the upper torso, which started as a solitary macule and spread to the shoulders. He has had recent travel to the Bahamas, raising considerations for tropical or environmental exposures.

2. **Physical Presentation**: The condition is described as hypopigmented. This can indicate several dermatological issues, including infections (like tinea) or other dermatoses. The fact that it began as a macule that has spread is significant.

3. **Relevant Medical History**: The patient has well-managed type 1 diabetes, which could influence skin conditions due to immunosuppression, but he denies exposure to known melanocytotoxic agents, minimizing the likelihood of condition

Testing workflow: 100%|██████████████████████████████████████████████████████████▉| 1271/1273 [4:05:00<00:21, 10.90s/it]

-----CoT Output-------
To determine the correct answer, let's analyze the scenario and the implications of a third-degree perineal laceration:

1. **Understanding the Laceration**: A third-degree perineal laceration extends through the vaginal mucosa, perineal body, and into the anal sphincter complex. This type of injury can significantly disrupt the function of the anal sphincter, which is crucial for maintaining fecal continence.

2. **Potential Complications**: When considering complications following delivery with a third-degree laceration, we need to focus on how this injury can affect the woman's body:
   - **Urge incontinence (A)**: This generally relates to bladder issues, not directly caused by perineal lacerations.
   - **Vesicovaginal fistula (B)**: This is a potential complication following childbirth that involves a connection between the bladder and vagina, but it is more associated with fourth-degree lacerations and specific obstetric trauma.
   - **Fecal incontinence (

Testing workflow: 100%|██████████████████████████████████████████████████████████▉| 1272/1273 [4:05:17<00:12, 12.52s/it]

-----CoT Output-------
To determine the most probable etiological microorganism responsible for the patient's pulmonary condition, let’s analyze the provided information step-by-step.

1. **Patient Background:**
   - The patient is a 62-year-old male who recently experienced a cerebrovascular accident (stroke). He has been hospitalized for a week.

2. **Presenting Symptoms:**
   - He presents with pyrexia (fever) and a productive cough, producing purulent sputum.
   - Vital signs indicate he is stable, though he has an elevated temperature (38.4°C).
   - The presence of basal crackles localized to the right hemithorax suggests possible consolidation or fluid in the lung.

3. **Radiology Findings:**
   - Chest X-ray demonstrates new pulmonary consolidation in the right lung, which is indicative of pneumonia or at least an infectious process affecting the lung.

4. **Laboratory Analysis:**
   - Complete blood count shows leukocytosis with a predominance of neutrophils (72%), which sugges

Testing workflow: 100%|███████████████████████████████████████████████████████████| 1273/1273 [4:05:26<00:00, 11.57s/it]

-----CoT Output-------
To determine the correct option, we need to understand the mechanism of action of miglitol, which is an alpha-glucosidase inhibitor. This type of medication works by inhibiting enzymes that break down carbohydrates in the intestines, particularly those that hydrolyze glycosidic bonds in carbohydrates.

Now, let's analyze the options:
- **A: Phosphodiester bonds** - These bonds are found in nucleic acids (DNA/RNA) and are not relevant to carbohydrate digestion.
- **B: Glycosidic bonds** - These bonds are found between monosaccharides in carbohydrates and are the primary target of miglitol. Inhibiting enzymes that break down glycosidic bonds would slow down carbohydrate absorption.
- **C: Peptide bonds** - These bonds link amino acids in proteins, which are not relevant to the effects of miglitol.
- **D: Cystine bonds** - Cystine is formed by the oxidation of cysteine residues; these are not relevant in this context.
- **E: Hydrogen bonds** - While these bonds are 




In [None]:
sum([results_with_rephrasal['answer_idx'][i] == results_with_rephrasal['llm_answer'][i] for i in range(len(results_with_rephrasal['questions']))])/len(results_with_rephrasal['questions'])

0.6716417910447762

In [None]:
results_with_rephrasal = pd.DataFrame.from_dict(results_with_rephrasal)
results_with_rephrasal.to_csv('results_with_rephrasal_all.csv', index = None)

## Testing RAG + CoT Agents

In [None]:
rag_workflow = StateGraph(GraphState)

rag_workflow.add_node("cot_agent", cot_agent)
rag_workflow.add_node("rag_agent", rag_agent)
rag_workflow.add_node("faithfulness_agent", faithfulness_agent)

rag_workflow.set_entry_point("rag_agent")
rag_workflow.add_edge("rag_agent","cot_agent")
rag_workflow.add_edge("cot_agent","faithfulness_agent")
rag_workflow.add_edge("faithfulness_agent",END)



<langgraph.graph.state.StateGraph at 0x7f53896d9360>

In [None]:
results_with_rag = test_workflow(rag_workflow, test_set, test_type = 'rag_all',checkpoint_path= 'test_results_checkpoint_rag_all_7.csv')

Resuming from checkpoint: test_results_checkpoint_rag_all_7.csv


Testing workflow: 100%|█████████████████████████████████████████████████████████████| 384/384 [3:33:37<00:00, 33.38s/it]


In [None]:
sum([results_with_rag['answer_idx'][i] == results_with_rag['llm_answer'][i] for i in range(len(results_with_rag['questions']))])/len(results_with_rag['questions'])

0.6865671641791045

In [None]:
results_with_rag_df = pd.DataFrame.from_dict(results_with_rag)
results_with_rag_df.to_csv('results_with_rag_all.csv', index = None)

## Testing Re-write + RAG + CoT Agents

In [15]:
reph_rag_workflow = StateGraph(GraphState)


reph_rag_workflow.add_node("rephraser_agent", rephraser_agent)
reph_rag_workflow.add_node("cot_agent", cot_agent)
reph_rag_workflow.add_node("rag_agent", rag_agent)
reph_rag_workflow.add_node("faithfulness_agent", faithfulness_agent)

reph_rag_workflow.set_entry_point("rephraser_agent")
reph_rag_workflow.add_edge("rephraser_agent","rag_agent")
reph_rag_workflow.add_edge("rag_agent","cot_agent")
reph_rag_workflow.add_edge("cot_agent","faithfulness_agent")
reph_rag_workflow.add_edge("faithfulness_agent",END)



<langgraph.graph.state.StateGraph at 0x7f60cd8ebf70>

In [20]:
results_reph_rag = test_workflow(reph_rag_workflow, test_set,test_type =  'reph_rag_all', checkpoint_path='test_results_checkpoint_reph_rag_all_5.csv' )


Resuming from checkpoint: test_results_checkpoint_reph_rag_all_5.csv


Testing workflow: 100%|█████████████████████████████████████████████████████████████| 638/638 [7:25:56<00:00, 41.94s/it]


In [21]:
sum([results_reph_rag['answer_idx'][i] == results_reph_rag['llm_answer'][i] for i in range(len(results_reph_rag['questions']))])/len(results_reph_rag['questions'])

0.6834249803613511

In [22]:
results_reph_rag_df = pd.DataFrame.from_dict(results_reph_rag)
results_reph_rag_df.to_csv('results_reph_rag_all.csv', index = None)

# Analysing Results

## Preliminary Results

In [95]:
without_agents_partial = pd.read_csv('results_without_agents_partial.csv')
len(without_agents_partial[without_agents_partial['answer_idx'] == without_agents_partial['llm_answer']])/len(without_agents_partial)

0.58

In [98]:
cot_partial = pd.read_csv('results_with_cot_partial.csv')
len(cot_partial[cot_partial['answer_idx'] == cot_partial['llm_answer']])/len(cot_partial)

0.62

In [100]:
rewrite_partial = pd.read_csv('results_with_rephrasal_partial.csv')
len(rewrite_partial[rewrite_partial['answer_idx'] == rewrite_partial['llm_answer']])/len(rewrite_partial)

0.64

In [101]:
rag_partial = pd.read_csv('results_with_rag_semantic_partial.csv')
len(rag_partial[rag_partial['answer_idx'] == rag_partial['llm_answer']])/len(rag_partial)

0.64

In [104]:
whole_workflow_partial = pd.read_csv('results_reph_rag_partial.csv')
len(whole_workflow_partial[whole_workflow_partial['answer_idx'] == whole_workflow_partial['llm_answer']])/len(whole_workflow_partial)

0.7

## Final Results

In [96]:
without_agents = pd.read_csv('results_without_agents.csv')
without_agents

Unnamed: 0,questions,options,answer_idx,llm_answer
0,A 32-year-old man with HIV comes to the physic...,"{'A': 'Secretion of interferon-α', 'B': 'Inter...",B,A
1,A one-day-old male is evaluated in the hospita...,"{'A': 'Duodenal atresia', 'B': 'Intestinal mal...",A,A
2,A 38-year-old woman presents to the office for...,"{'A': 'Body dysmorphic disorder', 'B': 'Schizo...",C,C
3,A 47-year-old woman is brought to the emergenc...,"{'A': 'COPD exacerbation', 'B': 'Diabetic keto...",E,E
4,A 10-year-old boy is brought to the emergency ...,{'A': 'Hypoxanthine-guanine phosphoribosyl tra...,D,C
...,...,...,...,...
1268,A 16-month-old boy is brought to the physician...,{'A': 'Deficiency of cofactor for prolyl and l...,C,C
1269,A 21-year-old man comes to the physician becau...,"{'A': 'Wood lamp examination', 'B': 'Skin cult...",C,C
1270,"A 27-year-old woman, gravida 1, para 0, at 38 ...","{'A': 'Urge incontinence', 'B': 'Vesicovaginal...",C,C
1271,A 62-year-old patient has been hospitalized fo...,"{'A': 'Pseudomona aeruginosa', 'B': 'Streptoco...",E,A


In [97]:
len(without_agents[without_agents['answer_idx'] == without_agents['llm_answer']])/len(without_agents)

0.6897093479968578

In [71]:
cot = pd.read_csv('results_with_cot_all.csv')
cot

Unnamed: 0,questions,options,answer_idx,llm_answer
0,A 32-year-old man with HIV comes to the physic...,"{'A': 'Secretion of interferon-α', 'B': 'Inter...",B,B
1,A one-day-old male is evaluated in the hospita...,"{'A': 'Duodenal atresia', 'B': 'Intestinal mal...",A,A
2,A 38-year-old woman presents to the office for...,"{'A': 'Body dysmorphic disorder', 'B': 'Schizo...",C,C
3,A 47-year-old woman is brought to the emergenc...,"{'A': 'COPD exacerbation', 'B': 'Diabetic keto...",E,E
4,A 10-year-old boy is brought to the emergency ...,{'A': 'Hypoxanthine-guanine phosphoribosyl tra...,D,D
...,...,...,...,...
1268,A 16-month-old boy is brought to the physician...,{'A': 'Deficiency of cofactor for prolyl and l...,C,C
1269,A 21-year-old man comes to the physician becau...,"{'A': 'Wood lamp examination', 'B': 'Skin cult...",C,C
1270,"A 27-year-old woman, gravida 1, para 0, at 38 ...","{'A': 'Urge incontinence', 'B': 'Vesicovaginal...",C,C
1271,A 62-year-old patient has been hospitalized fo...,"{'A': 'Pseudomona aeruginosa', 'B': 'Streptoco...",E,A


In [72]:
len(cot[cot['answer_idx'] == cot['llm_answer']])/len(cot)

0.7195600942655145

In [73]:
len(cot[cot['llm_answer'].str.len() > 1])

30

In [74]:
rewrite = pd.read_csv('results_with_rephrasal_all.csv')
rewrite

Unnamed: 0,questions,options,answer_idx,llm_answer
0,A 32-year-old man with HIV comes to the physic...,"{'A': 'Secretion of interferon-α', 'B': 'Inter...",B,B
1,A one-day-old male is evaluated in the hospita...,"{'A': 'Duodenal atresia', 'B': 'Intestinal mal...",A,A
2,A 38-year-old woman presents to the office for...,"{'A': 'Body dysmorphic disorder', 'B': 'Schizo...",C,C
3,A 47-year-old woman is brought to the emergenc...,"{'A': 'COPD exacerbation', 'B': 'Diabetic keto...",E,E
4,A 10-year-old boy is brought to the emergency ...,{'A': 'Hypoxanthine-guanine phosphoribosyl tra...,D,C \nQuestion: Which specific biochemical path...
...,...,...,...,...
1268,A 16-month-old boy is brought to the physician...,{'A': 'Deficiency of cofactor for prolyl and l...,C,C
1269,A 21-year-old man comes to the physician becau...,"{'A': 'Wood lamp examination', 'B': 'Skin cult...",C,C
1270,"A 27-year-old woman, gravida 1, para 0, at 38 ...","{'A': 'Urge incontinence', 'B': 'Vesicovaginal...",C,C
1271,A 62-year-old patient has been hospitalized fo...,"{'A': 'Pseudomona aeruginosa', 'B': 'Streptoco...",E,B


In [75]:
len(rewrite[rewrite['llm_answer'].str.len() > 1])

47

In [76]:
len(rewrite[rewrite['answer_idx'] == rewrite['llm_answer']])/len(rewrite)

0.6716417910447762

In [77]:
rag = pd.read_csv('results_with_rag_all.csv')
rag

Unnamed: 0,questions,options,answer_idx,llm_answer,cot_output,faithfulness,rag_information
0,A 32-year-old man with HIV comes to the physic...,"{'A': 'Secretion of interferon-α', 'B': 'Inter...",B,B,"To answer the question, we need to identify th...",0.222222,**Relevant Segments**:\n\n1. **Document on Tub...
1,A one-day-old male is evaluated in the hospita...,"{'A': 'Duodenal atresia', 'B': 'Intestinal mal...",A,A,To determine the most likely etiology of the o...,0.133333,Documents relevant to the question about the o...
2,A 38-year-old woman presents to the office for...,"{'A': 'Body dysmorphic disorder', 'B': 'Schizo...",C,C,To determine the most consistent diagnosis ali...,0.333333,Here are the relevant segments extracted from ...
3,A 47-year-old woman is brought to the emergenc...,"{'A': 'COPD exacerbation', 'B': 'Diabetic keto...",E,E,To determine which option could explain the pa...,0.533333,1. **Hypoventilation**: This results in elevat...
4,A 10-year-old boy is brought to the emergency ...,{'A': 'Hypoxanthine-guanine phosphoribosyl tra...,D,D,To determine the correct option regarding the ...,0.400000,Here are the useful segments extracted in resp...
...,...,...,...,...,...,...,...
1268,A 16-month-old boy is brought to the physician...,{'A': 'Deficiency of cofactor for prolyl and l...,C,C,To determine the most likely underlying cause ...,0.916667,The relevant segments addressing the case of t...
1269,A 21-year-old man comes to the physician becau...,"{'A': 'Wood lamp examination', 'B': 'Skin cult...",C,D,To determine the correct option for confirming...,0.111111,The relevant segments that are useful to the q...
1270,"A 27-year-old woman, gravida 1, para 0, at 38 ...","{'A': 'Urge incontinence', 'B': 'Vesicovaginal...",C,C,To determine the correct answer to the questio...,0.100000,**Relevant Extracts:**\n\n1. **Document 111**:...
1271,A 62-year-old patient has been hospitalized fo...,"{'A': 'Pseudomona aeruginosa', 'B': 'Streptoco...",E,B,To determine the most likely causal microorgan...,0.000000,Here are the segments from the document that a...


In [78]:
len(rag[rag['answer_idx'] == rag['llm_answer']])/len(rag)

0.6865671641791045

In [87]:
rag['faithfulness'].mean()

0.356583141131353

In [89]:
rag[rag['answer_idx'] == rag['llm_answer']]['faithfulness'].mean()

0.3607419615230313

In [90]:
rag[rag['answer_idx'] != rag['llm_answer']]['faithfulness'].mean()

0.3474733440829148

In [82]:
len(rag[rag['llm_answer'].str.len() > 1])

35

In [83]:
whole_workflow = pd.read_csv('results_reph_rag_all.csv')
whole_workflow

Unnamed: 0,questions,options,answer_idx,llm_answer,cot_output,faithfulness,rag_information
0,A 32-year-old man with HIV comes to the physic...,"{'A': 'Secretion of interferon-α', 'B': 'Inter...",B,B,"To answer the question, we need to analyze the...",0.250000,The relevant segment of the document is as fol...
1,A one-day-old male is evaluated in the hospita...,"{'A': 'Duodenal atresia', 'B': 'Intestinal mal...",A,A** (Duodenal atresia).,To determine the most plausible underlying eti...,0.272727,The useful segments related to the question re...
2,A 38-year-old woman presents to the office for...,"{'A': 'Body dysmorphic disorder', 'B': 'Schizo...",C,C,"To find the correct answer, let's analyze the ...",1.000000,"**Relevant segments:**\n\n- ""The first impress..."
3,A 47-year-old woman is brought to the emergenc...,"{'A': 'COPD exacerbation', 'B': 'Diabetic keto...",E,E,To determine the correct option based on the c...,0.466667,**Key segments related to the patient's presen...
4,A 10-year-old boy is brought to the emergency ...,{'A': 'Hypoxanthine-guanine phosphoribosyl tra...,D,D,"To determine the correct option, we’ll analyze...",0.727273,The useful segments from the provided extract ...
...,...,...,...,...,...,...,...
1268,A 16-month-old boy is brought to the physician...,{'A': 'Deficiency of cofactor for prolyl and l...,C,C,To determine the underlying etiology contribut...,0.923077,The segments that are useful to the question i...
1269,A 21-year-old man comes to the physician becau...,"{'A': 'Wood lamp examination', 'B': 'Skin cult...",C,D,To determine the most appropriate diagnostic t...,0.090909,**Relevant Extract:** \n\n- **Pityriasis rosea...
1270,"A 27-year-old woman, gravida 1, para 0, at 38 ...","{'A': 'Urge incontinence', 'B': 'Vesicovaginal...",C,C,To determine the correct answer regarding the ...,0.333333,The relevant segments from the documents regar...
1271,A 62-year-old patient has been hospitalized fo...,"{'A': 'Pseudomona aeruginosa', 'B': 'Streptoco...",E,B,To determine the most probable etiological pat...,0.000000,The relevant extracted portion discussing pneu...


In [84]:
len(whole_workflow[whole_workflow['answer_idx'] == whole_workflow['llm_answer']])/len(whole_workflow)

0.6834249803613511

In [91]:
whole_workflow['faithfulness'].mean()

0.3367134265204726

In [92]:
whole_workflow[whole_workflow['answer_idx'] == whole_workflow['llm_answer']]['faithfulness'].mean()

0.34823209017365586

In [93]:
whole_workflow[whole_workflow['answer_idx'] != whole_workflow['llm_answer']]['faithfulness'].mean()

0.3118468325297295

In [53]:
len(whole_workflow[whole_workflow['llm_answer'].str.len() > 1])

35