In [12]:
# Automatically reload edited project modules before each cell runs.
# %reload_ext is used instead of %load_ext so that re-running this cell does not
# print the "extension is already loaded" notice; on a fresh kernel it simply
# loads the extension.
%reload_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from medplexity.benchmarks.medqa.medqa_dataset_builder import MedQADatasetBuilder

In [14]:
import os

# Keep a key that is already exported in the environment; fall back to the empty
# placeholder only when none is set. Supply the real key from your shell or a
# secrets manager, never from the notebook itself.
os.environ.setdefault("OPENAI_API_KEY", "")

In [15]:
# Build the MedQA benchmark dataset; no arguments are passed to the builder or
# to build_dataset(), so whatever defaults they define are used.
dataset = MedQADatasetBuilder().build_dataset()

In [16]:
# Display a single data point (index 8) to see what a MedQA example looks like.
dataset.data_points[8]

MedQADataPoint(input=MedQAInput(question='A 62-year-old woman presents for a regular check-up. She complains of lightheadedness and palpitations which occur episodically. Past medical history is significant for a myocardial infarction 6 months ago and NYHA class II chronic heart failure. She also was diagnosed with grade I arterial hypertension 4 years ago. Current medications are aspirin 81 mg, atorvastatin 10 mg, enalapril 10 mg, and metoprolol 200 mg daily. Her vital signs are a blood pressure of 135/90 mm Hg, a heart rate of 125/min, a respiratory rate of 14/min, and a temperature of 36.5°C (97.7°F). Cardiopulmonary examination is significant for irregular heart rhythm and decreased S1 intensity. ECG is obtained and is shown in the picture (see image). Echocardiography shows a left ventricular ejection fraction of 39%. Which of the following drugs is the best choice for rate control in this patient?', options=['Atenolol', 'Verapamil', 'Diltiazem', 'Propafenone', 'Digoxin']), expect

In [17]:
from medplexity.benchmarks.medqa.medqa_prompt_template import MedQAPromptTemplate
from medplexity.benchmarks.medqa.medqa_dataset_builder import MedQAInput


def input_adapter(medqa_input: MedQAInput):
    """Turn a MedQA input into the prompt produced by ``MedQAPromptTemplate``.

    Args:
        medqa_input: Item whose ``question`` and ``options`` attributes are
            handed to the template.

    Returns:
        The result of ``MedQAPromptTemplate().format(...)`` for that question
        and option list.
    """
    # A new template instance is created on every call.
    return MedQAPromptTemplate().format(
        question=medqa_input.question,
        options=medqa_input.options,
    )

In [18]:
from medplexity.benchmarks.multiple_choice_utils import AnswerWithExplanation


def output_adapter(output_json: str) -> AnswerWithExplanation:
    """Parse a JSON string into an ``AnswerWithExplanation``.

    Args:
        output_json: JSON text to validate.

    Returns:
        The ``AnswerWithExplanation`` built by ``model_validate_json``.

    Any exception raised by ``model_validate_json`` propagates to the caller.
    """
    return AnswerWithExplanation.model_validate_json(output_json)

In [19]:
from medplexity.llms.openai_caller import OpenAI
from medplexity.chains.evaluation_adapter_chain import EvaluationAdapterChain

# Evaluation chain backed by the OpenAI caller; the two adapters defined above
# are passed in as the chain's input and output adapters.
# NOTE(review): the LangChain section further down constructs
# EvaluationAdapterChain with `chain=` rather than `llm=` — confirm which keyword
# the current medplexity API expects.
chain = EvaluationAdapterChain(
    llm=OpenAI(),
    input_adapter=input_adapter,
    output_adapter=output_adapter,
)


In [20]:
def comparator(expected_output: str, predicted_output: AnswerWithExplanation):
    """Tell whether the predicted answer equals the expected one.

    Args:
        expected_output: Reference answer string.
        predicted_output: Parsed prediction; only its ``answer`` attribute is
            read.

    Returns:
        The result of ``expected_output == predicted_output.answer``.
    """
    predicted_answer = predicted_output.answer
    return expected_output == predicted_answer

In [21]:
from medplexity.evaluators.sequential_evaluator import SequentialEvaluator

# Combine the OpenAI-backed chain with the answer comparator in a
# SequentialEvaluator.
evaluator = SequentialEvaluator(
    chain=chain,
    comparator=comparator
)


In [22]:
# Quick check: evaluate only the first data point (slice 0:1).
evaluation = evaluator.evaluate(dataset[0:1])

100%|██████████| 1/1 [00:03<00:00,  3.38s/it]

chain_output: answer='(E)' explanation='Let’s solve this step-by-step, referring to authoritative sources as needed. The patient is pregnant and has symptoms of a urinary tract infection (burning upon urination). Nitrofurantoin is considered safe for use in pregnancy and is the best treatment option for this patient.'
type of chain_output: <class 'benchmarks.multiple_choice_utils.AnswerWithExplanation'>
chain_output after update answer='(E)' explanation='Let’s solve this step-by-step, referring to authoritative sources as needed. The patient is pregnant and has symptoms of a urinary tract infection (burning upon urination). Nitrofurantoin is considered safe for use in pregnancy and is the best treatment option for this patient.'





In [23]:
# Print the first evaluation result as JSON with 4-space indentation.
print(evaluation.evaluation_results[0].model_dump_json(indent=4))

{
    "input": {
        "question": "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?",
        "options": [
            "Ampicillin",
            "Ceftriaxone",
            "Ciprofloxacin",
            "Doxycycline",
            "Nitrofurantoin"
        ]
    },
    "input_metadata": null,
    "expected_output": "(E)",
    "output": {
        "answer": "(E)",
        "explanation": "Let’s solve this step-by-step, referring to authoritative sources as needed. 

## Prediction with LangChain

In [24]:
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.chat_models import ChatOpenAI
from langchain.utilities import SerpAPIWrapper

In [25]:
# Keep a SerpAPI key that is already exported in the environment; fall back to
# the empty placeholder only when none is set. Supply the real key from your
# shell or a secrets manager, never from the notebook itself.
os.environ.setdefault("SERPAPI_API_KEY", "")

In [26]:
# Chat model handed to the agent below: GPT-4 with temperature 0.
llm = ChatOpenAI(temperature=0, model="gpt-4")
# SerpAPI search wrapper; its `run` method backs the agent's "search" tool.
# Presumably it reads SERPAPI_API_KEY from the environment — TODO confirm.
search = SerpAPIWrapper()

In [27]:
# Tools offered to the agent: a single "search" tool that calls `search.run`.
tools = [
    Tool(
        name="search",
        func=search.run,
        description="Get additional medical information. You should ask targeted questions."
    ),
]

In [28]:
# Build an OpenAI-functions agent over `tools` and `llm`.
# return_intermediate_steps=True requests the tool-call steps alongside the final
# answer; verbose=True produces the agent trace visible in the evaluation output.
agent_executor = initialize_agent(
    tools,
    llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    return_intermediate_steps=True,
    verbose=True,
)

In [29]:
from medplexity.chains.chain import ChainOutput

def chain_output_adapter(chain_output: ChainOutput) -> ChainOutput:
    """Return a new ``ChainOutput`` whose ``output`` is the parsed answer.

    Args:
        chain_output: Chain result whose ``output`` holds JSON text.

    Returns:
        A ``ChainOutput`` carrying the ``AnswerWithExplanation`` parsed from
        ``chain_output.output``, with ``output_metadata`` passed through
        unchanged.

    Any exception raised by ``model_validate_json`` propagates to the caller.
    """
    return ChainOutput(
        output=AnswerWithExplanation.model_validate_json(chain_output.output),
        output_metadata=chain_output.output_metadata,
    )

In [34]:
def comparator(expected_output: str, predicted_output: AnswerWithExplanation):
    """Tell whether the predicted answer equals the expected one.

    This redefines ``comparator`` with the same logic as the definition in the
    OpenAI section of this notebook.

    Args:
        expected_output: Reference answer string.
        predicted_output: Parsed prediction; only its ``answer`` attribute is
            read.

    Returns:
        The result of ``expected_output == predicted_output.answer``.
    """
    predicted_answer = predicted_output.answer
    return expected_output == predicted_answer

In [35]:
from medplexity.chains.langchain_wrapper import LangchainWrapper

# Evaluation chain backed by the LangChain agent: the agent executor is wrapped
# in LangchainWrapper (with store_intermediate_steps=True) and combined with the
# prompt input adapter and the ChainOutput-level output adapter.
# NOTE: this rebinds `chain`, replacing the OpenAI-backed chain created earlier.
chain = EvaluationAdapterChain(
    chain=LangchainWrapper(agent_executor, store_intermediate_steps=True),
    input_adapter=input_adapter,
    output_adapter=chain_output_adapter,
)

In [36]:
# Rebuild the evaluator around the agent-backed chain (rebinds `evaluator`).
evaluator = SequentialEvaluator(
    chain=chain,
    comparator=comparator
)

In [37]:
# Quick check: evaluate only the first dataset item with the agent-backed chain
# (rebinds `evaluation` from the OpenAI section).
evaluation = evaluator.evaluate(dataset[:1])

  0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search` with `best treatment for urinary tract infection in pregnant women`


[0m[36;1m[1;3m[4] Antibiotics commonly used include amoxicillin, ampicillin, cephalosporins, nitrofurantoin, and trimethoprim-sulfamethoxazole. Fluoroquinolones are not recommended as a first-line treatment in pregnancy due to conflicting studies regarding teratogenicity.[0m

100%|██████████| 1/1 [00:10<00:00, 10.46s/it]

[32;1m[1;3m{"answer":"(E)","explanation":"Let’s solve this step-by-step, referring to authoritative sources as needed. The patient's symptoms suggest a urinary tract infection. Among the options, Nitrofurantoin is commonly used to treat urinary tract infections in pregnant women. Fluoroquinolones like Ciprofloxacin are not recommended as a first-line treatment in pregnancy due to conflicting studies regarding teratogenicity."}[0m

[1m> Finished chain.[0m
chain_output: output=AnswerWithExplanation(answer='(E)', explanation="Let’s solve this step-by-step, referring to authoritative sources as needed. The patient's symptoms suggest a urinary tract infection. Among the options, Nitrofurantoin is commonly used to treat urinary tract infections in pregnant women. Fluoroquinolones like Ciprofloxacin are not recommended as a first-line treatment in pregnancy due to conflicting studies regarding teratogenicity.") output_metadata={<LangChainOutputDictKeys.intermediate_steps: 'intermediate_s




In [38]:
# Keep a reference to the evaluation results for later inspection.
example_output = evaluation.evaluation_results

In [43]:
# Print the parsed answer of the first result, then the metadata stored with it
# (in the captured output this metadata holds the agent's intermediate steps).
print(evaluation.evaluation_results[0].output)
print(evaluation.evaluation_results[0].output_metadata)

answer='(E)' explanation="Let’s solve this step-by-step, referring to authoritative sources as needed. The patient's symptoms suggest a urinary tract infection. Among the options, Nitrofurantoin is commonly used to treat urinary tract infections in pregnant women. Fluoroquinolones like Ciprofloxacin are not recommended as a first-line treatment in pregnancy due to conflicting studies regarding teratogenicity."
{<LangChainOutputDictKeys.intermediate_steps: 'intermediate_steps'>: [(AgentActionMessageLog(tool='search', tool_input='best treatment for urinary tract infection in pregnant women', log='\nInvoking: `search` with `best treatment for urinary tract infection in pregnant women`\n\n\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'search', 'arguments': '{\n  "__arg1": "best treatment for urinary tract infection in pregnant women"\n}'}})]), '[4] Antibiotics commonly used include amoxicillin, ampicillin, cephalosporins, nitrofurantoin, and trimethop