In this notebook we show how to run evaluations on different multiple-choice question dataset.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%cd ..

/Users/maksympetyak/PycharmProjects/medplexity


In [3]:
import uuid

# Parameters for evaluation
MODEL = "gpt-4"
DATASET = "medmcqa"
SPLIT_TYPE="validation"
EXPERIMENT_ID = uuid.uuid4()
EVALUATION_FILE_PATH = f"{MODEL}-{DATASET}-{SPLIT_TYPE}-{EXPERIMENT_ID}"

Create harness with the specified parameters. Make sure to pass the API token for your LLM.

In [4]:
from medplexity.llms.openai_caller import OpenAI
from medplexity.benchmarks.multiple_choice_utils import load_example_questions_from_json
from medplexity.benchmarks.medmcqa import MedMCQADatasetBuilder
from medplexity.chains.multiple_choice_question_chain import MultipleChoiceEvaluationChain
from medplexity.prompts.medical_assistant_prompt_template import MedicalAssistantPromptTemplate
from medplexity.benchmarks.dataset_factory import DatasetFactory
from medplexity.medharness import Medharness

harness = Medharness(
    dataset=DatasetFactory().build(DATASET, SPLIT_TYPE),
    chain=MultipleChoiceEvaluationChain(
        llm=OpenAI(
            model=MODEL,
            temperature=0,
        ),
        examples=load_example_questions_from_json(MedMCQADatasetBuilder().EXAMPLE_QUESTIONS_PATH),
        save_prompt=True
    ),
)

In [5]:
harness.run(k=50, ignore_errors=True)

100%|██████████| 50/50 [08:18<00:00,  9.98s/it]


EvaluationSummary(evaluation_results=[{
    "input": {
        "question": "Which of the following is not true for myelinated nerve fibers:",
        "options": [
            "Impulse through myelinated fibers is slower than non-myelinated fibers",
            "Membrane currents are generated at nodes of Ranvier",
            "Saltatory conduction of impulses is seen",
            "Local anesthesia is effective only when the nerve is not covered by myelin sheath"
        ],
        "context": null,
        "examples": null
    },
    "input_metadata": {
        "explanation": null,
        "subject_name": "Physiology"
    },
    "expected_output": "(A)",
    "output": "(A)",
    "output_metadata": {
        "explanation": "Let’s solve this step-by-step, referring to authoritative sources as needed. Myelinated nerve fibers are characterized by faster impulse conduction compared to non-myelinated fibers due to the presence of myelin sheath which allows for saltatory conduction. Membrane 

In [6]:
harness.result

EvaluationSummary(evaluation_results=[{
    "input": {
        "question": "Which of the following is not true for myelinated nerve fibers:",
        "options": [
            "Impulse through myelinated fibers is slower than non-myelinated fibers",
            "Membrane currents are generated at nodes of Ranvier",
            "Saltatory conduction of impulses is seen",
            "Local anesthesia is effective only when the nerve is not covered by myelin sheath"
        ],
        "context": null,
        "examples": null
    },
    "input_metadata": {
        "explanation": null,
        "subject_name": "Physiology"
    },
    "expected_output": "(A)",
    "output": "(A)",
    "output_metadata": {
        "explanation": "Let’s solve this step-by-step, referring to authoritative sources as needed. Myelinated nerve fibers are characterized by faster impulse conduction compared to non-myelinated fibers due to the presence of myelin sheath which allows for saltatory conduction. Membrane 

We can export the results of evaluation for the visualiser.

In [7]:
from datetime import datetime

harness.save_results(
    EVALUATION_FILE_PATH,
    additional_data={
        "prompt_template": MedicalAssistantPromptTemplate.PROMPT,
        "date": datetime.now().date().strftime("%d-%m-%Y"),
    }
)

Permanent storage for visualisation

In [8]:
from medplexity.storage.supabase_client import SupabaseEvaluationSaver

evaluation_saver = SupabaseEvaluationSaver()

evaluation_saver.save_evaluation(
    file_name=EVALUATION_FILE_PATH,
    model=MODEL,
    benchmark_id=DATASET,
    split_type=SPLIT_TYPE,
)

2023-11-15 15:52:50,407:INFO - HTTP Request: GET https://fqmgogeamrlfnziacygu.supabase.co/rest/v1/benchmarks?select=id&id=eq.medmcqa "HTTP/1.1 200 OK"
2023-11-15 15:52:50,612:INFO - HTTP Request: GET https://fqmgogeamrlfnziacygu.supabase.co/rest/v1/dataset_configs?select=%2A&benchmark=eq.medmcqa&split_type=eq.validation&subtype=is.null "HTTP/1.1 200 OK"
2023-11-15 15:52:50,718:INFO - HTTP Request: GET https://fqmgogeamrlfnziacygu.supabase.co/rest/v1/dataset_configs?select=%2A&benchmark=eq.medmcqa&split_type=eq.validation&subtype=is.null "HTTP/1.1 200 OK"
2023-11-15 15:52:51,144:INFO - HTTP Request: POST https://fqmgogeamrlfnziacygu.supabase.co/storage/v1/object/EvalRuns/gpt-4-medmcqa-validation-95b55d5b-b480-499a-8932-a77b562f8313 "HTTP/1.1 200 OK"
2023-11-15 15:52:51,214:INFO - HTTP Request: GET https://fqmgogeamrlfnziacygu.supabase.co/rest/v1/evaluations?select=%2A&dataset_config=eq.f64c5a62-a12c-4da3-b5d4-df1e31acaf52&model=eq.gpt-4 "HTTP/1.1 200 OK"
2023-11-15 15:52:51,285:INFO - H