In this notebook we show how to run evaluations on different multiple-choice question datasets.

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Parameters for evaluation
MODEL = "gpt-4"  # model name handed to the LLM wrapper
DATASET = "medmcqa"  # dataset key handed to the dataset factory
SPLIT_TYPE = "validation"  # dataset split to evaluate on
# Name under which the results are saved. Derived from the settings above so
# it cannot drift out of sync when the model, dataset, or split is changed.
EVALUATION_FILE_PATH = f"{DATASET}-{MODEL}-{SPLIT_TYPE}"

Create the harness with the specified parameters. Make sure to pass the API token for your LLM.

In [3]:
import os

from medplexity.llms.openai_caller import OpenAI
from medplexity.benchmarks.multiple_choice_utils import load_example_questions_from_json
from medplexity.benchmarks.medmcqa import MedMCQADatasetBuilder
from medplexity.chains.multiple_choice_question_chain import MultipleChoiceEvaluationChain
from medplexity.prompts.medical_assistant_prompt_template import MedicalAssistantPromptTemplate
from medplexity.benchmarks.dataset_factory import DatasetFactory
from medplexity.medharness import Medharness

# Assemble the evaluation harness: the dataset selected by DATASET/SPLIT_TYPE
# plus a multiple-choice evaluation chain backed by the configured model.
harness = Medharness(
    dataset=DatasetFactory().build(DATASET, SPLIT_TYPE),
    chain=MultipleChoiceEvaluationChain(
        llm=OpenAI(
            model=MODEL,
            # Read the key from the environment instead of pasting it into the
            # notebook, so a secret never ends up in a saved or committed file.
            # Falls back to "" (the previous placeholder) when the variable is unset.
            api_token=os.environ.get("OPENAI_API_KEY", ""),
            temperature=0,
        ),
        # Example questions loaded from the JSON file the MedMCQA builder points to.
        examples=load_example_questions_from_json(MedMCQADatasetBuilder().EXAMPLE_QUESTIONS_PATH),
        # NOTE(review): presumably keeps the rendered prompt alongside each result -- confirm.
        save_prompt=True,
    ),
)

In [4]:
# Run the evaluation. The progress bar below shows 2/2 items, so k appears to
# cap the number of evaluated questions -- TODO confirm against Medharness.run.
# NOTE(review): ignore_errors=True presumably lets the run continue past
# per-question failures -- confirm.
harness.run(k=2, ignore_errors=True)

100%|██████████| 2/2 [00:48<00:00, 24.11s/it]


EvaluationSummary(evaluation_results=[{
    "input": {
        "question": "Which of the following is not true for myelinated nerve fibers:",
        "options": [
            "Impulse through myelinated fibers is slower than non-myelinated fibers",
            "Membrane currents are generated at nodes of Ranvier",
            "Saltatory conduction of impulses is seen",
            "Local anesthesia is effective only when the nerve is not covered by myelin sheath"
        ],
        "context": null,
        "examples": null
    },
    "input_metadata": {
        "explanation": null,
        "subject_name": "Physiology"
    },
    "expected_output": "(A)",
    "output": "(A)",
    "output_metadata": {
        "explanation": "Let’s solve this step-by-step, referring to authoritative sources as needed. Myelinated nerve fibers are covered by a myelin sheath, which allows for faster transmission of nerve impulses compared to non-myelinated fibers. This is due to the fact that the impulse \"j

In [5]:
# Display the evaluation summary held on the harness after the run.
harness.result

EvaluationSummary(evaluation_results=[{
    "input": {
        "question": "Which of the following is not true for myelinated nerve fibers:",
        "options": [
            "Impulse through myelinated fibers is slower than non-myelinated fibers",
            "Membrane currents are generated at nodes of Ranvier",
            "Saltatory conduction of impulses is seen",
            "Local anesthesia is effective only when the nerve is not covered by myelin sheath"
        ],
        "context": null,
        "examples": null
    },
    "input_metadata": {
        "explanation": null,
        "subject_name": "Physiology"
    },
    "expected_output": "(A)",
    "output": "(A)",
    "output_metadata": {
        "explanation": "Let’s solve this step-by-step, referring to authoritative sources as needed. Myelinated nerve fibers are covered by a myelin sheath, which allows for faster transmission of nerve impulses compared to non-myelinated fibers. This is due to the fact that the impulse \"j

We can export the evaluation results for the visualiser.

In [6]:
from datetime import datetime

# Extra metadata stored next to the results: the prompt template text and
# today's date formatted as DD-MM-YYYY.
export_metadata = {
    "prompt_template": MedicalAssistantPromptTemplate.PROMPT,
    "date": datetime.now().strftime("%d-%m-%Y"),
}

# Write the results under the configured evaluation file name.
harness.save_results(EVALUATION_FILE_PATH, additional_data=export_metadata)