In [1]:
from tqdm import tqdm
import random
import requests
import pandas as pd
from string import ascii_letters

In [2]:
df = pd.read_csv('../drug_datasets/dataset_files/drugs_dataset.csv')
medicines = list(df['medicine'].values)

In [3]:
def question_generator(drug):
    question = f"""\
When should you not take {drug}?
Is there any time when you shouldn't take {drug}?
Do you need to avoid taking {drug}?
When should you avoid taking {drug}?
What to avoid while taking {drug}?
When taking {drug}, what should I avoid?
What should one avoid when using {drug}?
Which things should you avoid while taking {drug}?
What are {drug} side-effects?
What are side-effects of {drug}
What condition is a serious side effect of {drug} use?
What are the dangers of {drug}?
{drug}
It is {drug}
I'm looking for {drug} detail
What is {drug}?
Can you give me information about {drug}?
Are you able to provide me with information about {drug}?
How should I take {drug}?
Give me {drug} dosage information
I need to know the dosage of {drug}
Tell me the dosage of {drug}
Tell me about {drug} dosage
What other drugs will affect {drug}?
What drugs should not be taken with {drug}?
Which drugs have a drug interaction with {drug}?
What other drugs will affect {drug}?"""
    return question

In [4]:
# list of all changed medicines 
changed_medicines = []

In [5]:
# randomly change letters
for medicine in medicines:
    for i in range(5):
        inds = [i for i, letter in enumerate(medicine) if not letter.isspace()]
        ind = random.sample(inds, 1)

        lst = list(medicine)
        lst[ind[0]] = random.choice(ascii_letters)
        changed_medicines.append("".join(lst))

In [6]:
# randomly remove letters
for medicine in medicines:
    for i in range(2):
        inds = [i for i, letter in enumerate(medicine) if not letter.isspace()]
        ind = random.sample(inds, 1)

        lst = list(medicine)
        lst[ind[0]] = ''
        changed_medicines.append("".join(lst))

In [7]:
# Also append correct words 
changed_medicines += medicines

In [8]:
# list of all successful entity extractors
entity_extractors = []

In [9]:
for medicine in tqdm(changed_medicines):
    questions = question_generator(medicine)
    for question in questions.split('\n'): 
        post_fields = {"text" : question}
        r = requests.post("http://localhost:5005/model/parse", json=post_fields)
        dic = r.json()
        entities = dic['entities']

        for entity in entities:
            entity_extractors.append(entity['extractor'])

100%|██████████| 7968/7968 [1:42:26<00:00,  1.30it/s]


In [10]:
len(changed_medicines)

7968

In [11]:
len(medicines)

996

In [12]:
extractors = set(entity_extractors)
extractor_statistic = dict(zip(extractors, [0] * len(extractors)))

In [13]:
# statistic of successful entity extractors
for extractor in extractors:
    extractor_statistic[extractor] = entity_extractors.count(extractor)

In [14]:
# after RegexFeaturizer
extractor_statistic

{'DIETClassifier': 1516,
 'RegexEntityExtractor': 35181,
 'SpacyEntityExtractor': 22746}

In [None]:
# before RegexFeaturizer
extractor_statistic

{'RegexEntityExtractor': 35505, 'CRFEntityExtractor': 90, 'DIETClassifier': 596, 'SpacyEntityExtractor': 23164}

In [2]:
df = pd.read_csv('../medlineplus_lab_dataset/dataset_files/medplus_labs.csv')
labs = list(df['Lab test'].values)

In [3]:
def question_generator(lab):
    question = f"""\
Is there anything else I need to know about the {lab}?
What else should I know about an {lab}?
Can you tell me anything else about the {lab}?
Any other information I need to know about the {lab}?
What do the {lab} test results mean?
What do the results of the {lab} test mean?
How are the results from the {lab} test interpreted?
The {lab} result means what?
The results of the {lab} test mean what?
Are there any risks to the {lab}?
Is there any risk associated with an {lab}?
Do the {lab} carry any risks?
{lab} poses any risks?
Is it possible that the {lab} may have risks?
Will I need to do anything to prepare for the {lab}?
How should I prepare for the {lab}?
Does anything need to be done to prepare for the {lab}?
Will I need to do anything in advance for the {lab}?
Does there are any preparations I need to make for the {lab}?
What do I need to do to prepare for the {lab}?
What happens during the {lab}?
During an {lab} what happens?
Is there anything that happens during {lab}?
What happens in {lab}?
What will happen in the {lab}?
Why do I need the {lab}?
Is {lab} necessary for me?
Can you tell me why I need the {lab}?
Is it necessary that I go to the {lab}?
What is the {lab}?
The {lab} is what?
I'm looking for {lab} detail
Can you give me information about {lab}?
What is the {lab} used for?
What is the purpose of the {lab}?
{lab} is used for what?
For what do you use the {lab}?
{lab}
It is {lab}"""
    return question

In [4]:
changed_labs = []

In [5]:
for lab in labs:
    for i in range(5):
        inds = [i for i, letter in enumerate(lab) if not letter.isspace()]
        ind = random.sample(inds, 1)

        lst = list(lab)
        lst[ind[0]] = random.choice(ascii_letters)
        changed_labs.append("".join(lst))

In [6]:
for lab in labs:
    for i in range(2):
        inds = [i for i, letter in enumerate(lab) if not letter.isspace()]
        ind = random.sample(inds, 1)

        lst = list(lab)
        lst[ind[0]] = ''
        changed_labs.append("".join(lst))

In [7]:
changed_labs += labs

In [8]:
entity_extractors = []

In [10]:
for lab in tqdm(changed_labs):
    questions = question_generator(lab)
    for question in questions.split('\n'): 
        post_fields = {"text" : question}
        r = requests.post("http://localhost:5005/model/parse", json=post_fields)
        dic = r.json()
        entities = dic['entities']

        for entity in entities:
            entity_extractors.append(entity['extractor'])

In [13]:
extractors = set(entity_extractors)
extractor_statistic = dict(zip(extractors, [0] * len(extractors)))

In [14]:
for extractor in extractors:
    extractor_statistic[extractor] = entity_extractors.count(extractor)

In [15]:
extractor_statistic

{'DIETClassifier': 3251,
 'SpacyEntityExtractor': 10560,
 'RegexEntityExtractor': 11661}

In [9]:
len(changed_labs)

2008