In this notebook we are using Llama-2 hosted on deepinfra (https://deepinfra.com/) for evaluation on MedMCQA dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from medplexity.benchmarks.medmcqa.medmcqa_loader import MedMCQALoader
from medplexity.benchmarks.medmcqa.medmcqa_dataset_builder import MedMCQADatasetBuilder
from medplexity.benchmarks.medmcqa.medmcqa_prompt_template import MedMCQAPromptTemplate


In [3]:
DEEPINFRA_API_KEY = ""

In [4]:
loader = MedMCQALoader()

In [5]:
dataset = MedMCQADatasetBuilder().build_dataset("validation")

In [6]:
example_data_point = next(dataset.__iter__())

In [7]:
example_data_point

MedMCQADataPoint(input=MedMCQAInput(question='Which of the following is not true for myelinated nerve fibers:', options=['Impulse through myelinated fibers is slower than non-myelinated fibers', 'Membrane currents are generated at nodes of Ranvier', 'Saltatory conduction of impulses is seen', 'Local anesthesia is effective only when the nerve is not covered by myelin sheath']), expected_output=0, metadata=MedMCQAOutputMetadata(explanation=None, subject_name='Physiology'))

In [8]:
from medplexity.benchmarks.medmcqa.medmcqa_dataset_builder import MedMCQAInput

def input_adapter(medmcqa_input: MedMCQAInput):
    prompt_template = MedMCQAPromptTemplate()

    sys_prompt = '<<SYS>>Always output a JSON of the format {"answer": "(A) | (B) | (C) | (D)", "explanation": "text explaining the choice"}<</SYS>> \n'

    instructions = "[INST]" + prompt_template.format(
        question=medmcqa_input.question,
        options=medmcqa_input.options
    ) + "[/INST]"

    return sys_prompt + instructions

In [9]:
import re
from medplexity.benchmarks.medmcqa.medmcqa_prompt_template import AnswerWithExplanation


def extract_option(s):
    options = re.findall(r'\((A|B|C|D)\)', s)
    if len(options) > 1:
        raise ValueError("More than one option found!")
    elif options:
        return '(' + options[0] + ')'
    else:
        raise ValueError("No option provided in the answer")



def output_adapter(output_json: str) -> AnswerWithExplanation:
    parsed_output = AnswerWithExplanation.model_validate_json(output_json)

    # sometimes in addition the letter it returns also an explanation, so here we just extract the relevant letter
    parsed_output.answer = extract_option(parsed_output.answer)

    return parsed_output

In [10]:
from medplexity.llms.deepinfra import Deepinfra
from medplexity.chains.evaluation_adapter_chain import EvaluationAdapterChain

chain = EvaluationAdapterChain(
    llm=Deepinfra(
        api_token=DEEPINFRA_API_KEY
    ),
    input_adapter=input_adapter,
    output_adapter=output_adapter,
)

In [11]:
def comparator(expected_output: int, predicted_output: AnswerWithExplanation):
    letter_to_idx = { "(A)" : 0, "(B)": 1, "(C)": 2, "(D)": 3 }
    predicted_idx =  letter_to_idx[predicted_output.answer]

    return expected_output == predicted_idx

In [12]:
from medplexity.evaluators.sequential_evaluator import SequentialEvaluator

evaluator = SequentialEvaluator(
    chain=chain,
    comparator=comparator
)

In [13]:
dataset[0].input

MedMCQAInput(question='Which of the following is not true for myelinated nerve fibers:', options=['Impulse through myelinated fibers is slower than non-myelinated fibers', 'Membrane currents are generated at nodes of Ranvier', 'Saltatory conduction of impulses is seen', 'Local anesthesia is effective only when the nerve is not covered by myelin sheath'])

In [14]:
evaluation = evaluator.evaluate(dataset[10:13])

100%|██████████| 3/3 [00:23<00:00,  7.85s/it]


In [15]:
evaluation.accuracy()

0.6666666666666666

In [16]:
correct, incorrect = evaluation.partition_by_correctness()

In [17]:
incorrect

[EvaluationResult(input=MedMCQAInput(question='A second-year PG resident tells you to perform an ABG of a patient. All of the following are true about performing an ABG except:', options=['Before performing the ABG, syringe should be loaded with 0.3 cc of heparin', 'Normal pH, HCO. and PCO, levels may not indicate absence of an acid-base imbalance', "A different site should be tried i f modified Allen's test is negative", 'Radial aery is the preferred site']), input_metadata=MedMCQAOutputMetadata(explanation='Ans: A. Before performing the ABG, syringe should be loaded with 0.3 cc of heparin(Ref: Harrison 18/e p364; http:// emedicine.medscape.comlaiclell 902703-overview).Care should be taken when measuring blood gases to obtain the aerial blood sample without using excessive heparin.Heparin should be expelled from the syringe after loading as it may lead to false pCO2 readings.Precautions:Most syringes come pre-packaged & contain a small amount of heparin, to prevent coagulation.Other s