# Sistemas inteligentes para respostas a perguntas médicas

Gyovana M. Moriyama (216190)

Rafael A. Matumoto (273085)

In [1]:
!pip install -qU langchain_openai langchain-community  openai datasets pydantic

In [2]:
import json
import time
import re
import unicodedata
import datetime
import os
import random
import numpy as np
import pandas as pd
from pydantic import BaseModel
from typing import List, Optional, Literal
from pydantic import BaseModel, Field
from google.colab import userdata, drive
from matplotlib import pyplot as plt
from tqdm import tqdm
from datasets import load_dataset
from openai import OpenAI
from langchain_openai import ChatOpenAI

from langchain_core.prompts import ChatPromptTemplate

# Seed both Python's and NumPy's global generators so every stochastic cell
# (including code that draws from np.random directly) is reproducible.
random.seed(42)
np.random.seed(42)

In [3]:
# Mount Google Drive; result files are written under /content/drive further below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the API key from Colab secrets and expose it through the environment
# instead of hardcoding it in the notebook.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# Dataset MedQA-USMLE-4-options

In [5]:
# Load the test split of the MedQA-USMLE 4-option dataset from the Hugging Face Hub
data = load_dataset('GBaker/MedQA-USMLE-4-options', split='test')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Convert the loaded dataset to a pandas DataFrame for the baseline analysis
test_data_df = data.to_pandas()

In [7]:
# Preview the first rows of the test set
test_data_df.head()

Unnamed: 0,question,answer,options,meta_info,answer_idx,metamap_phrases
0,A junior orthopaedic surgery resident is compl...,Tell the attending that he cannot fail to disc...,{'A': 'Disclose the error to the patient and p...,step1,B,"[junior orthopaedic surgery resident, completi..."
1,A 67-year-old man with transitional cell carci...,Cross-linking of DNA,"{'A': 'Inhibition of proteasome', 'B': 'Hypers...",step1,D,"[67 year old man, transitional cell carcinoma ..."
2,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,"{'A': 'Renal papillary necrosis', 'B': 'Choles...",step2&3,B,"[Two weeks, emergency cardiac, stenting, unsta..."
3,A 39-year-old woman is brought to the emergenc...,"Lactose-fermenting, gram-negative rods forming...","{'A': 'Coagulase-positive, gram-positive cocci...",step1,D,"[year old woman, brought, emergency department..."
4,A 35-year-old man comes to the physician becau...,Ketotifen eye drops,"{'A': 'Erythromycin ointment', 'B': 'Ketotifen...",step2&3,B,"[35 year old man, physician, of itchy, watery,..."


# Baseline

- Measures the accuracy obtained by random guessing
- Measures the accuracy obtained with a zero-shot prompt

Notebook output: accuracy on the test dataset

## Random guessing

In [8]:
def guessing_accuracy(df, step=None):
  '''
  Estimates the accuracy of answering every question uniformly at random.

  The random guess is repeated 100 times with a locally seeded generator, so
  repeated calls with the same input return the same result. The input
  dataframe is not modified.

  df (Dataframe): Dataframe with the questions. Must have an 'answer_idx'
      column and, when `step` is given, a 'meta_info' column.
  step (str): Step of the question (step1, step2&3). If None, all questions
      are used.

  Returns:
  (float, float): Mean and standard deviation of the accuracy over the trials
  '''

  n_trials = 100
  options = ['A', 'B', 'C', 'D']
  # Local generator: results do not depend on (or disturb) the global NumPy RNG state.
  rng = np.random.default_rng(42)

  if step is None: #calculates for the whole dataset
      correct = df['answer_idx']
  else: #calculates for the given step
      correct = df.loc[df['meta_info'].eq(step), 'answer_idx']

  res = list()
  for _ in range(n_trials):
      # draws one random option per question and records the fraction that match
      guesses = rng.choice(options, size=len(correct))
      res.append(correct.eq(guesses).mean())

  res = np.array(res)
  return res.mean(), res.std()

In [9]:
# Random-guessing baseline over the whole test set: (mean, std) of the accuracy
guessing_accuracy(test_data_df)

(0.2514846818538885, 0.011526408702273326)

In [10]:
# Random-guessing baseline restricted to step1 questions: (mean, std) of the accuracy
guessing_accuracy(test_data_df, step='step1')

(0.24933726067746684, 0.015252073031817364)

In [11]:
# Random-guessing baseline restricted to step2&3 questions: (mean, std) of the accuracy
guessing_accuracy(test_data_df, step='step2&3')

(0.24996632996632998, 0.01816073100642074)

In [12]:
# Summary statistics of the question length, measured in characters
test_data_df['question'].str.len().describe()

Unnamed: 0,question
count,1273.0
mean,745.579733
std,304.233918
min,67.0
25%,536.0
50%,715.0
75%,909.0
max,3543.0


## Model

In [13]:
# OpenAI client built with no explicit arguments; OPENAI_API_KEY was placed in the environment earlier in the notebook
client = OpenAI()

Testing gpt-4o-mini model

In [14]:
# Smoke test: send a single chat request to gpt-4o-mini and display the text of the reply.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "What's your current version"
        }
    ]
)
response.choices[0].message.content

"I’m based on OpenAI's GPT-3 model, and my training includes data up to October 2021. I don’t have a version number like traditional software; instead, I use the capabilities of the GPT-3 model. If you have specific questions or need assistance, feel free to ask!"

In [15]:
# Model name and sampling temperature used for the experiments below;
# both values are also written to the header of the answers log file.
model = 'gpt-4o-mini'
model_temp = 0.5

In [16]:
# LangChain chat model configured with the model name and temperature defined above
llm = ChatOpenAI(model=model, temperature=model_temp)

In [17]:
# Schema for the model's structured reply: a single field holding the chosen option letter.
# (Documented with comments rather than a docstring so the schema itself stays unchanged.)
class Answer(BaseModel):
    # Restricted to the four option letters; any other value fails validation.
    answer: Literal['A', 'B', 'C', 'D'] = Field(description='Final answer to the question')

## Vanilla Zero-shot

In [18]:
# Zero-shot prompt: only the question and its options, with no examples.
# {question} and {options} are placeholders filled in when the chain is invoked;
# .strip() removes the leading/trailing newlines of the triple-quoted string.
prompt = '''
 question: {question}
 options: {options}
 Among A through D, the answer is:
'''.strip()

In [19]:
# Creates a ChatPromptTemplate object from the raw template string.
prompt_template = ChatPromptTemplate.from_template(prompt)

# Builds a chain that fills the prompt template, sends it to the language model and
# structures the output as an Answer. With include_raw=True the result is a dict
# with the keys 'raw', 'parsed' and 'parsing_error'.
chain_prompt = prompt_template | llm.with_structured_output(Answer, include_raw=True)

In [20]:
# Model answers, one entry per question in dataset order (None when no valid answer was obtained)
answers = list()
# Saves the outputs in a timestamped txt file: a header with the run configuration,
# followed by one '<question index>;<answer>' line per question
with open(f'/content/drive/MyDrive/IA024A_Processamento_de_Linguagem_Natural/Projeto Final/projeto/entrega2/answers_vanilla_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.txt', 'w') as f:

    f.write(''.center(10, '-'))
    f.write('\n')
    f.write(f'Model: {model}\n')
    f.write(f'Temperature: {model_temp}\n')
    f.write(f'Prompt: {prompt}\n')
    f.write(''.center(10, '-'))
    f.write('\n')

    f.write(';Answer\n')
    for n, question in enumerate(tqdm(data)):

        #Generate the answers
        res = chain_prompt.invoke({'question': question['question'], 'options': question['options']})
        parsed = res['parsed']

        #Stores the answer only if parsing succeeded AND produced an object:
        #'parsed' can be None without a parsing error (e.g. the model returned no tool call)
        if res['parsing_error'] is None and parsed is not None:
            answers.append(parsed.answer)
            f.write(f'{n};{parsed.answer}\n')
        else:
            #keeps the list aligned with the dataset so it can be added as a column later
            answers.append(None)
            f.write(f'{n};\n')

100%|██████████| 1273/1273 [13:44<00:00,  1.54it/s]


In [21]:
# Number of entries collected: one per question, including None for questions without a parsed answer
len(answers)

1273

In [22]:
# Inspect the chain output for the last question processed by the loop (debug aid)
res

{'raw': AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_1COrD3jmPXAM2DZgXdUGI58s', 'function': {'arguments': '{"answer":"C"}', 'name': 'Answer'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 275, 'total_tokens': 280, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'stop', 'logprobs': None}, id='run-a40fb3cf-406d-4ee4-8b1b-97003df18b64-0', tool_calls=[{'name': 'Answer', 'args': {'answer': 'C'}, 'id': 'call_1COrD3jmPXAM2DZgXdUGI58s', 'type': 'tool_call'}], usage_metadata={'input_tokens': 275, 'output_tokens': 5, 'total_tokens': 280, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}}),
 'parsed': Answer(answer='C'),
 'parsing_error': None}

In [23]:
# Attach the model's answers to the dataset as a new column.
# NOTE(review): the column is named 'answer_gpt4' although the configured model is gpt-4o-mini.
data2 = data.add_column('answer_gpt4', answers)

In [24]:
# Convert to pandas and persist questions plus model answers to Drive as CSV
data2_df = data2.to_pandas()
data2_df.to_csv('/content/drive/MyDrive/IA024A_Processamento_de_Linguagem_Natural/Projeto Final/projeto/entrega2/vanilla/answers_vanilla.csv')

In [25]:
# Dataset with the model answers ('answer_gpt4') next to the correct answers ('answer_idx')
data2_df

Unnamed: 0,question,answer,options,meta_info,answer_idx,metamap_phrases,answer_gpt4
0,A junior orthopaedic surgery resident is compl...,Tell the attending that he cannot fail to disc...,{'A': 'Disclose the error to the patient and p...,step1,B,"[junior orthopaedic surgery resident, completi...",A
1,A 67-year-old man with transitional cell carci...,Cross-linking of DNA,"{'A': 'Inhibition of proteasome', 'B': 'Hypers...",step1,D,"[67 year old man, transitional cell carcinoma ...",C
2,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,"{'A': 'Renal papillary necrosis', 'B': 'Choles...",step2&3,B,"[Two weeks, emergency cardiac, stenting, unsta...",B
3,A 39-year-old woman is brought to the emergenc...,"Lactose-fermenting, gram-negative rods forming...","{'A': 'Coagulase-positive, gram-positive cocci...",step1,D,"[year old woman, brought, emergency department...",B
4,A 35-year-old man comes to the physician becau...,Ketotifen eye drops,"{'A': 'Erythromycin ointment', 'B': 'Ketotifen...",step2&3,B,"[35 year old man, physician, of itchy, watery,...",B
...,...,...,...,...,...,...,...
1268,A 39-year-old woman presents to the clinic for...,Thyroid-stimulating hormone (TSH),"{'A': 'Glucose', 'B': 'Triiodothyronine (T3)',...",step2&3,D,"[year old woman presents, clinic, annual physi...",D
1269,A 38-year-old woman comes to the physician bec...,Medication abuse,"{'A': 'Celiac disease', 'B': 'Carcinoid tumor'...",step2&3,D,"[year old woman, physician, persistent diarrhe...",A
1270,A 21-year-old college student comes to the phy...,Stop evening exercise,"{'A': 'Trial of diphenhydramine', 'B': 'Stop e...",step2&3,B,"[21-year-old college student, physician, of da...",D
1271,A 19-year-old man is brought to the physician ...,Schizotypal personality disorder,"{'A': 'Social anxiety disorder', 'B': 'Avoidan...",step2&3,C,"[year old man, brought, physician, mother, wor...",C


In [26]:
# Accuracy for all questions: fraction of rows where the model's letter equals the correct letter.
# Rows without a parsed answer (None) compare unequal and therefore count as wrong.
accuracy = data2_df['answer_idx'].eq(data2_df['answer_gpt4']).mean()
print(f"Accuracy of LLM answers: {accuracy}")

Accuracy of LLM answers: 0.7384131971720346


In [27]:
# Vanilla zero-shot accuracy restricted to step1 questions
step1 = data2_df[data2_df['meta_info'].eq('step1')].copy()
step1_accuracy = step1['answer_idx'].eq(step1['answer_gpt4']).mean()
print(f"Accuracy of LLM answers in step1 questions: {step1_accuracy}")

Accuracy of LLM answers in step1 questions: 0.7407952871870398


In [28]:
# Vanilla zero-shot accuracy restricted to step2&3 questions
step23 = data2_df[data2_df['meta_info'].eq('step2&3')].copy()
step23_accuracy = step23['answer_idx'].eq(step23['answer_gpt4']).mean()
print(f"Accuracy of LLM answers in step2&3 questions: {step23_accuracy}")

Accuracy of LLM answers in step2&3 questions: 0.7356902356902357


In [29]:
# Distribution of wrong answers: how often each letter was chosen when the model was wrong.
# NOTE(review): groupby presumably drops missing keys by default, so rows without a
# parsed answer (None) would not appear in these counts — confirm.
data2_df[~data2_df['answer_idx'].eq(data2_df['answer_gpt4'])].groupby('answer_gpt4').size()

Unnamed: 0_level_0,0
answer_gpt4,Unnamed: 1_level_1
A,99
B,87
C,76
D,71
