In this notebook we use OpenAI GPT-4 to run an evaluation on the MedQA dataset.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the project source are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Take the key from the environment instead of typing it into the notebook,
# so the secret is never saved (or committed) with the file. When the
# variable is unset this falls back to "", the previous placeholder value.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [3]:
from benchmarks.medqa.medqa_dataset_builder import MedQADatasetBuilder

# Build the MedQA dataset, requesting the "validation" split.
dataset = MedQADatasetBuilder().build_dataset(split_type="validation")

In [4]:
# Grab the first data point to see what a single example looks like.
# Uses the built-in iter() rather than calling the __iter__ dunder directly.
example_data_point = next(iter(dataset))

In [5]:
example_data_point

MedQADataPoint(input=MedQAInput(question='A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?', options=['Chloramphenicol', 'Gentamicin', 'Ciprofloxacin', 'Ceftriaxone', 'Trimethoprim']), expected_output='(D)', metadata=None)

In [6]:
from benchmarks.medqa.medqa_prompt_template import MedQAPromptTemplate, \
    AnswerWithExplanation
from benchmarks.medqa.medqa_dataset_builder import MedQAInput

def input_adapter(medqa_input: MedQAInput):
    """Format a MedQA question and its answer options into a prompt.

    Passed as the ``input_adapter`` of the evaluation chain.

    Args:
        medqa_input: The data point input; its ``question`` and ``options``
            attributes are forwarded to the template.

    Returns:
        Whatever ``MedQAPromptTemplate.format`` produces for that question
        and option list.
    """
    template = MedQAPromptTemplate()
    question, options = medqa_input.question, medqa_input.options
    return template.format(question=question, options=options)

In [7]:
def output_adapter(output_json: str) -> AnswerWithExplanation:
    """Parse the model's JSON reply into an ``AnswerWithExplanation``.

    Passed as the ``output_adapter`` of the evaluation chain.

    Args:
        output_json: The raw JSON string to parse.

    Returns:
        The object produced by ``AnswerWithExplanation.model_validate_json``.
        NOTE(review): presumably raises a validation error when the string
        is not valid JSON for this schema; confirm against the model class.
    """
    return AnswerWithExplanation.model_validate_json(output_json)

In [8]:
from llms.openai_caller import OpenAI
from medplexity.chains.evaluation_adapter_chain import EvaluationAdapterChain

# Wire the OpenAI caller to the two adapters: input_adapter prepares the
# prompt from a MedQA input, output_adapter parses the JSON reply.
chain = EvaluationAdapterChain(
    llm=OpenAI(api_token=OPENAI_API_KEY),
    input_adapter=input_adapter,
    output_adapter=output_adapter,
)

In [9]:
def comparator(expected_output: str, predicted_output: AnswerWithExplanation):
    """Report whether the predicted answer matches the expected one.

    The check is a plain ``==`` between the expected label (for example
    ``'(D)'``) and ``predicted_output.answer``; no normalisation is applied.

    Args:
        expected_output: The reference answer label from the dataset.
        predicted_output: The parsed model reply; only ``answer`` is read.

    Returns:
        The result of the equality comparison.
    """
    predicted_answer = predicted_output.answer
    return expected_output == predicted_answer

In [10]:
from medplexity.evaluators.sequential_evaluator import SequentialEvaluator

# The evaluator is given the chain and the comparator defined above.
evaluator = SequentialEvaluator(chain=chain, comparator=comparator)

In [11]:
# Show the input of the first data point, the same item evaluated in the
# next cell.
dataset[0].input

MedQAInput(question='A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?', options=['Chloramphenicol', 'Gentamicin', 'Ciprofloxacin', 'Ceftriaxone', 'Trimethoprim'])

In [12]:
# Evaluate only the first data point (slice 0:1) as a quick smoke test;
# widen the slice to evaluate more of the dataset.
evaluation = evaluator.evaluate(dataset[0:1])

100%|██████████| 1/1 [00:04<00:00,  4.48s/it]


In [13]:
# Accuracy for this run. Only one example was evaluated above, so treat the
# number as a sanity check rather than a benchmark score.
evaluation.accuracy()

1.0

In [14]:
# Split the evaluation results into the correctly answered ones and the rest,
# so each group can be inspected in the cells below.
correct, incorrect = evaluation.partition_by_correctness()

In [15]:
correct

[EvaluationResult(input=MedQAInput(question='A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?', options=['Chloramphenicol', 'Gentamicin', 'Ciprofloxacin', 'Ceftriaxone', 'Trimethoprim']), input_metadata=None, expected_output='(D)', output=AnswerWithExplanation(answer='(D)', explanation='Let’s solve this step-by-step, referring to authoritative sources as needed. The bacteria that does not ferment maltose and has no polysaccharide capsule is Neisseria gonorrhoeae. The medication given that blocks cell wall synthesis is Ceftriaxone.'), correct=True)]

In [16]:
incorrect

[]