# Sistemas inteligentes para respostas a perguntas médicas

Gyovana M. Moriyama (216190)

Rafael A. Matumoto (273085)

In [None]:
# Install the Hugging Face `datasets` library, which is imported in the next cell.
!pip -q install datasets

472.7/472.7 kB 5.1 MB/s eta 0:00:00
116.3/116.3 kB 5.2 MB/s eta 0:00:00
134.8/134.8 kB 3.5 MB/s eta 0:00:00
194.1/194.1 kB 6.4 MB/s eta 0:00:00

In [None]:
import json
import datasets
import pandas as pd
import numpy as np
import os

from google.colab import userdata, drive

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
drive.mount('/content/drive')

Mounted at /content/drive


## Data processing



```
data_clean/
|-- questions/
|   |-- Mainland/
|   |-- Taiwan/
|   |-- US/
|       |-- 4_options/
|       |   |-- phrases_no_exclude_dev.jsonl
|       |   |-- phrases_no_exclude_test.jsonl
|       |   |-- phrases_no_exclude_train.jsonl
|       |-- dev.jsonl
|       |-- test.jsonl
|       |-- train.jsonl
|       |-- ...
|-- textbooks/
|   |-- en/
|   |   |-- Anatomy_Gray.txt
|   |   |-- ...
|   |-- zh_paragraph/
|   |-- zh_sentence/
```



### Load dataset

In [None]:
# Download the MedQA archive from Google Drive
# (https://drive.google.com/file/d/1ImYUSLk9JbgHXOemfvyiDiirluZHPeQw/view?usp=sharing)
# and extract it into /content/medQA; the loader functions in this notebook read from
# /content/medQA/data_clean/.
!gdown -q 1ImYUSLk9JbgHXOemfvyiDiirluZHPeQw
!unzip -q data_clean.zip -d /content/medQA

In [None]:
def load_medqa_questions(split, meta_phrases=False):
    '''
    Loads one split of the downloaded MedQA US questions as a dataset object.

    The questions are read from the JSON Lines files extracted under
    /content/medQA/data_clean/questions/US/.

    Note: the original questions have 5 alternatives, whereas the dataset
    published on HuggingFace was processed to contain 4 choices.

    Args:
        split: Split to load; one of 'train', 'test' or 'dev'.
        meta_phrases: If False (default), reads '<split>.jsonl'. If True, reads
            '4_options/phrases_no_exclude_<split>.jsonl' instead, which also
            provides a list of metamap phrases related to the questions.

    Returns:
        A `datasets.Dataset` with one row per non-blank line of the file.

    Raises:
        AssertionError: If `split` is not one of the accepted names.
        FileNotFoundError: If the expected file does not exist (e.g. the
            archive has not been downloaded and extracted yet).
    '''

    assert split in ['train', 'test', 'dev'], "Invalid split. Must be one of 'train', 'test', or 'dev'."

    if meta_phrases:
        prefix = '/content/medQA/data_clean/questions/US/4_options/phrases_no_exclude_'
    else:
        prefix = '/content/medQA/data_clean/questions/US/'

    # The split name has already been validated, so it can be used directly as the file stem.
    medqa_path = prefix + split + '.jsonl'

    # Decode as UTF-8 explicitly (the encoding JSON text is exchanged in) so the
    # result does not depend on the platform's default encoding.
    with open(medqa_path, 'r', encoding='utf-8') as f:
        # Blank lines (such as a trailing empty line) are skipped because
        # json.loads would raise on them.
        records = [json.loads(line) for line in f if line.strip()]

    return datasets.Dataset.from_list(records)

In [None]:
def load_medqa_textbooks(medqa_path='/content/medQA/data_clean/textbooks/en/'):
    '''
    Loads reference textbooks for MedQA as a dictionary.

    Args:
        medqa_path: Directory holding one text file per textbook. Defaults to
            the English textbooks folder of the extracted MedQA archive.

    Returns:
        A dict mapping each file name up to its first '.' (e.g. 'Anatomy_Gray'
        for 'Anatomy_Gray.txt') to the full text content of that file.

    Raises:
        FileNotFoundError: If `medqa_path` does not exist.
    '''
    data = dict()
    for fname in os.listdir(medqa_path):
        book_path = os.path.join(medqa_path, fname)

        # Only regular files can be read as books; skip sub-directories and the like.
        if not os.path.isfile(book_path):
            continue

        # Decode as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding (assumes the textbook files are UTF-8).
        with open(book_path, 'r', encoding='utf-8') as f:
            data[fname.split('.')[0]] = f.read()

    return data

In [None]:
# train_data = load_medqa_questions(split='train', meta_phrases=False)
# dev_data = load_medqa_questions(split='dev', meta_phrases=False)
# test_data = load_medqa_questions(split='test', meta_phrases=False)

In [None]:
# Load the test split of the 4-option MedQA-USMLE dataset (GBaker/MedQA-USMLE-4-options).
data = datasets.load_dataset('GBaker/MedQA-USMLE-4-options', split='test')

In [None]:
# Read every English reference textbook into memory as a {title: full text} dictionary.
textbooks = load_medqa_textbooks()

### Exploratory analysis

#### Medical questions

In [None]:
# Convert the loaded test split to a pandas DataFrame for the exploratory analysis below.
test_data_df = data.to_pandas()

In [None]:
# Build the text that must be included in the prompt: the question followed by
# the string form of its answer alternatives.
def _join_question_and_options(row):
    return row['question'] + str(row['options'])


test_data_df['quest_ans'] = test_data_df.apply(_join_question_and_options, axis=1)

In [None]:
test_data_df.head()

Unnamed: 0,question,answer,options,meta_info,answer_idx,metamap_phrases,quest_ans
0,A junior orthopaedic surgery resident is compl...,Tell the attending that he cannot fail to disc...,{'A': 'Disclose the error to the patient and p...,step1,B,"[junior orthopaedic surgery resident, completi...",A junior orthopaedic surgery resident is compl...
1,A 67-year-old man with transitional cell carci...,Cross-linking of DNA,"{'A': 'Inhibition of proteasome', 'B': 'Hypers...",step1,D,"[67 year old man, transitional cell carcinoma ...",A 67-year-old man with transitional cell carci...
2,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,"{'A': 'Renal papillary necrosis', 'B': 'Choles...",step2&3,B,"[Two weeks, emergency cardiac, stenting, unsta...",Two weeks after undergoing an emergency cardia...
3,A 39-year-old woman is brought to the emergenc...,"Lactose-fermenting, gram-negative rods forming...","{'A': 'Coagulase-positive, gram-positive cocci...",step1,D,"[year old woman, brought, emergency department...",A 39-year-old woman is brought to the emergenc...
4,A 35-year-old man comes to the physician becau...,Ketotifen eye drops,"{'A': 'Erythromycin ointment', 'B': 'Ketotifen...",step2&3,B,"[35 year old man, physician, of itchy, watery,...",A 35-year-old man comes to the physician becau...


In [None]:
# number of questions by exam step
test_data_df.groupby('meta_info').size()

Unnamed: 0_level_0,0
meta_info,Unnamed: 1_level_1
step1,679
step2&3,594


In [None]:
# distribution of answer alternatives
test_data_df.groupby(['meta_info', 'answer_idx']).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
meta_info,answer_idx,Unnamed: 2_level_1
step1,A,187
step1,B,156
step1,C,188
step1,D,148
step2&3,A,166
step2&3,B,153
step2&3,C,158
step2&3,D,117


In [None]:
# question + answer alternatives length in number of characters (must fit in context size)
test_data_df['quest_ans'].str.len().describe()

Unnamed: 0,quest_ans
count,1273.0
mean,888.602514
std,309.890528
min,230.0
25%,679.0
50%,852.0
75%,1053.0
max,3673.0


#### Reference books

In [None]:
def book_stats(books):
    '''
    Computes size statistics for each book.

    A paragraph is a piece obtained by splitting the book text on a blank line
    (two consecutive newline characters); a word is a whitespace-separated token.

    Args:
        books: Dict mapping a book title to the full text of the book.

    Returns:
        A pandas DataFrame with one row per book, in the iteration order of
        `books`, and the columns 'title', 'num_chars', 'num_paragraphs',
        'avg_num_chars_per_paragraph' and 'avg_num_words_per_paragraph'.
        Both averages are integers (the fractional part is dropped).
    '''
    columns = [
        'title',
        'num_chars',
        'num_paragraphs',
        'avg_num_chars_per_paragraph',
        'avg_num_words_per_paragraph',
    ]

    rows = []
    for title, book_content in books.items():
        # Split once and reuse: books are several megabytes long, so repeating
        # the split for every statistic is wasted work.
        paragraphs = book_content.split('\n\n')

        rows.append({
            'title': title,
            'num_chars': len(book_content),
            'num_paragraphs': len(paragraphs),
            # dtype=int makes np.mean return an integer instead of a float.
            'avg_num_chars_per_paragraph': np.mean([len(p) for p in paragraphs], dtype=int),
            'avg_num_words_per_paragraph': np.mean([len(p.split()) for p in paragraphs], dtype=int),
        })

    # Passing `columns` keeps the expected layout even when `books` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Compute one row of size statistics per reference textbook.
books = book_stats(textbooks)

In [None]:
books

Unnamed: 0,title,num_chars,num_paragraphs,avg_num_chars_per_paragraph,avg_num_words_per_paragraph
0,First_Aid_Step2,1030582,6659,152,21
1,Pathology_Robbins,3784898,11398,330,39
2,Histology_Ross,3047020,6690,453,69
3,Cell_Biology_Alberts,4868257,10627,456,71
4,Pathoma_Husain,399834,5306,73,10
5,Pharmacology_Katzung,5122267,12869,396,56
6,Physiology_Levy,3049236,8859,342,46
7,First_Aid_Step1,665018,4245,154,21
8,Immunology_Janeway,3315092,8243,400,60
9,Anatomy_Gray,2281157,9972,226,35


In [None]:
# Summary statistics across all textbooks for total characters and paragraph counts.
books[['num_chars', 'num_paragraphs']].describe()

Unnamed: 0,num_chars,num_paragraphs
count,18.0,18.0
mean,4951795.0,11851.666667
std,5157709.0,9252.19455
min,399834.0,126.0
25%,2434207.0,6666.75
50%,3182164.0,9415.5
75%,5495379.0,12867.0
max,22312860.0,39521.0
