This notebook evaluates OpenAI GPT-4 on the MedQA dataset.

In [1]:
# Auto-reload project modules before each cell runs, so edits to imported
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Read the key from the environment rather than hardcoding it in the notebook,
# so the secret is never saved or committed with the file. Falls back to the
# original empty string when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [3]:
from benchmarks.medqa.medqa_dataset_builder import MedQADatasetBuilder

# Build the MedQA dataset using the "validation" split.
dataset = MedQADatasetBuilder().build_dataset(split_type="validation")

In [4]:
# Take the first data point yielded by the dataset. The built-in iter() is
# used instead of calling the __iter__ dunder method directly.
example_data_point = next(iter(dataset))

In [5]:
# Display the data point to inspect its input, options and expected output.
example_data_point

MedQADataPoint(input=MedQAInput(question='A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?', options=['Chloramphenicol', 'Gentamicin', 'Ciprofloxacin', 'Ceftriaxone', 'Trimethoprim']), expected_output='(D)', metadata=None)

In [6]:
from benchmarks.medqa.medqa_prompt_template import MedQAPromptTemplate
from benchmarks.medqa.medqa_dataset_builder import MedQAInput

def input_adapter(medqa_input: MedQAInput):
    """Render a MedQA question and its answer options into the LLM prompt.

    Args:
        medqa_input: Object whose ``question`` and ``options`` fields are
            handed to the prompt template.

    Returns:
        The value produced by ``MedQAPromptTemplate().format`` for those
        fields (the prompt text sent to the model).
    """
    template = MedQAPromptTemplate()
    template_fields = {
        "question": medqa_input.question,
        "options": medqa_input.options,
    }
    return template.format(**template_fields)

In [7]:
# Preview the prompt produced for the example data point.
input_adapter(example_data_point.input)

'The following are multiple choice questions about medical knowledge. Solve them in a step-by-step fashion, starting by summarizing the available information. Output a JSON with the answer (give back only the letter) and an explanation for it.\nQuestion: A 22-year-old male marathon runner presents to the office with the complaint of right-sided rib pain when he runs long distances. Physical examination reveals normal heart and lung findings and an exhalation dysfunction at ribs 4-5 on the right. Which of the following muscles or muscle groups will be most useful in correcting this dysfunction utilizing a direct method?\n (A) anterior scalene (B) latissimus dorsi (C) pectoralis minor (D) quadratus lumborum\nOutput: {"answer":"(C)","explanation":"Let’s solve this step-by-step, referring to authoritative sources as needed. Among the options, only pectoralis minor muscle origins from the outer surfaces of the 3rd to 5th ribs."}\n\n\nQuestion: A 36-year-old male presents to the office with 

In [8]:
from benchmarks.multiple_choice_utils import AnswerWithExplanation


def output_adapter(output_json: str) -> AnswerWithExplanation:
    """Parse the model's raw JSON reply into an ``AnswerWithExplanation``.

    Args:
        output_json: JSON text to validate.

    Returns:
        The object returned by ``AnswerWithExplanation.model_validate_json``.

    Any error raised by ``model_validate_json`` propagates unchanged
    (presumably a pydantic validation error on malformed JSON -- confirm).
    """
    return AnswerWithExplanation.model_validate_json(output_json)

In [9]:
from llms.openai_caller import OpenAI
from medplexity.chains.evaluation_adapter_chain import EvaluationAdapterChain

# Chain that combines the OpenAI caller with the two adapters defined above.
# NOTE(review): the call order inside EvaluationAdapterChain is not visible
# here -- presumably input_adapter -> llm -> output_adapter; confirm.
chain = EvaluationAdapterChain(
    llm=OpenAI(
        api_token=OPENAI_API_KEY
    ),
    input_adapter=input_adapter,
    output_adapter=output_adapter,
)

In [10]:
def comparator(expected_output: str, predicted_output: AnswerWithExplanation):
    """Tell whether the predicted answer matches the expected one exactly.

    Args:
        expected_output: Reference answer label, e.g. ``'(D)'``.
        predicted_output: Parsed model reply; only its ``answer`` field is used.

    Returns:
        The result of comparing ``expected_output`` with
        ``predicted_output.answer`` using ``==`` (no normalisation applied).
    """
    predicted_answer = predicted_output.answer
    return expected_output == predicted_answer

In [11]:
from medplexity.evaluators.sequential_evaluator import SequentialEvaluator

# Evaluator that runs `chain` on data points and scores them with `comparator`.
# NOTE(review): presumably processes data points one at a time, as the class
# name suggests -- confirm.
evaluator = SequentialEvaluator(
    chain=chain,
    comparator=comparator
)

In [12]:
# Inspect the input of the first data point, accessed by index.
dataset[0].input

MedQAInput(question='A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?', options=['Chloramphenicol', 'Gentamicin', 'Ciprofloxacin', 'Ceftriaxone', 'Trimethoprim'])

In [13]:
# Evaluate only the slice dataset[0:1], i.e. a single data point.
evaluation = evaluator.evaluate(dataset[0:1])

100%|██████████| 1/1 [00:04<00:00,  4.01s/it]


In [14]:
# Accuracy over the evaluated data points.
evaluation.accuracy()

1.0

In [15]:
# Split the evaluation results into correctly and incorrectly answered ones.
correct, incorrect = evaluation.partition_by_correctness()

In [16]:
# Results that were marked as correct.
correct

[EvaluationResult(input=MedQAInput(question='A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?', options=['Chloramphenicol', 'Gentamicin', 'Ciprofloxacin', 'Ceftriaxone', 'Trimethoprim']), input_metadata=None, expected_output='(D)', output=AnswerWithExplanation(answer='(D)', explanation='Let’s solve this step-by-step, referring to authoritative sources as needed. The bacteria that does not ferment maltose and has no polysaccharide capsule is Neisseria gonorrhoeae. The medication given to block cell wall synthesis is Ceftriaxone.'), correct=True)]

In [17]:
# Results that were marked as incorrect.
incorrect

[]