In [2]:
import pandas as pd
import numpy as np
import re
import json
from random import sample
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

In [3]:
# Load the lookup table used to fill question placeholders. Later cells index it
# by placeholder name and sample from the list stored under that name.
# The context manager guarantees the handle is closed even if parsing fails.
with open('./VAERS22/lookup/lookup22.json') as f:
    lookup22 = json.load(f)

In [106]:
# Read the question-template spreadsheet into a DataFrame (one row per template).
df = pd.read_excel("./Template/Template-Medium-Text2ESQ.xlsx")

In [152]:
# Preview template rows at positions 45-54 (all columns).
df.iloc[45:55,:]

Unnamed: 0,Question Template ID,Data File,Field,Question Template,Query Template ID,Difficulty
45,85,VAERSDATA;VAERSSYMPTOMS,HISTORY;SYMPTOM1/SYMPTOM2/SYMPTOM3,Find all records where people had a history of...,12,Medium
46,86,VAERSDATA;VAERSVAX,HISTORY;VAX_NAME,Retrieve all records that people took [VAX_NAM...,12,Medium
47,87,VAERSDATA;VAERSSYMPTOMS,ALLERGIES;SYMPTOM1/SYMPTOM2/SYMPTOM3,Return all records that people have an allergy...,12,Medium
48,88,VAERSDATA,ALLERGIES;SYMPTOM_TEXT,Return all records that people have an allergy...,12,Medium
49,89,VAERSDATA;VAERSVAX,ALLERGIES;VAX_NAME,Return all the records where people took [VAX_...,12,Medium
50,90,VAERSDATA,SYMPTOM_TEXT;VAX_NAME;OTHER_MEDS,Return a list of records of those who have [SY...,13,Medium
51,91,VAERSDATA;VAERSVAX,SYMPTOM_TEXT;HISTORY,How many cases where people had [SYMPTOM_TXT] ...,13,Medium
52,92,VAERSDATA;VAERSVAX,SYMPTOM_TEXT;VAX_NAME,Are there any cases where [SYMPTOM_TXT] and [S...,13,Medium
53,93,VAERSDATA,AGE_YRS,Return a list of all patients whose age range ...,14,Medium
54,94,VAERSDATA,VAX_DATE,How many patients got vaccines between [VAX_DA...,15,Medium


In [155]:
def find_key(tokens, pattern):
    """Locate the tokens that contain a placeholder.

    Each token is searched with ``re.findall(pattern, token)``. For every token
    with at least one match, the first match and the token's position are kept.

    Args:
        tokens: sequence of token strings.
        pattern: regular expression (string or compiled) passed to ``re.findall``.

    Returns:
        A list of ``(first_match, token_index)`` tuples in token order; empty
        when no token matches.
    """
    searched = ((re.findall(pattern, token), position)
                for position, token in enumerate(tokens))
    # Only the first match of a token is recorded, even if it holds several.
    return [(found[0], position) for found, position in searched if found]

def randsample(lookup_list, max_size=100):
    """Cap a list of lookup values at ``max_size`` entries.

    Args:
        lookup_list: sequence of candidate values for one field.
        max_size: largest number of values to return (default 100, the cap
            that was previously hard-coded).

    Returns:
        ``lookup_list`` itself (same object, original order) when it has at
        most ``max_size`` entries; otherwise a new list of ``max_size`` entries
        drawn without replacement by ``random.sample``.
    """
    if len(lookup_list) <= max_size:
        return lookup_list
    return sample(lookup_list, max_size)

def create_mini_lookup(keys_tmpl, lookup_table, num=150):
    """Draw ``num`` random value combinations for a multi-placeholder template.

    Args:
        keys_tmpl: list of ``(placeholder_name, token_index)`` tuples; only the
            name is used here.
        lookup_table: mapping from placeholder name to a list of candidate values.
        num: number of combinations to generate.

    Returns:
        A list of ``num`` rows. Each row holds one randomly drawn value per
        placeholder, in the order of ``keys_tmpl``. Rows are drawn
        independently, so duplicates are possible.
    """
    field_names = [entry[0] for entry in keys_tmpl]
    return [
        [sample(lookup_table[field], 1)[0] for field in field_names]
        for _ in range(num)
    ]

In [156]:
def BIO_tagging(question, values_tmpl, keys_tmpl, token_tmpl):
    """Label every token of a populated question with a BIO tag.

    The label sequence is built from the template layout, not by searching the
    question. Tokens before the first placeholder get ``'O'``. The tokens of
    each substituted value get ``'B-<name>'`` followed by ``'I-<name>'``. The
    template tokens between placeholders and after the last one get ``'O'``.

    Args:
        question: the populated question string; tokenized with ``word_tokenize``.
        values_tmpl: the substituted values, in placeholder order. Each is
            converted with ``str`` and tokenized to find its span length.
        keys_tmpl: list of ``(placeholder_name, template_token_index)`` tuples.
        token_tmpl: the tokens of the unpopulated template.

    Returns:
        A list of ``{'text': token, 'label': tag}`` dicts. Tokens and labels
        are paired with ``zip``, so the result stops at the shorter of the two.
        NOTE(review): the template and the question are tokenized by different
        tokenizers at the call site; if they disagree on a token count, the
        labels will be shifted without any error. Verify on real data.
    """
    token_question = word_tokenize(question)

    # A template without placeholders has nothing to tag: everything is outside.
    if not keys_tmpl:
        return [{'text': token, 'label': 'O'} for token in token_question]

    labels = ['O'] * keys_tmpl[0][1]
    last_key = len(keys_tmpl) - 1

    for idx, value in enumerate(values_tmpl):
        name, position = keys_tmpl[idx]

        span = len(word_tokenize(str(value)))
        if span:
            labels.append('B-{0}'.format(name))
            labels.extend(['I-{0}'.format(name)] * (span - 1))

        # An explicit bounds check replaces the former bare ``except``, which
        # used IndexError as control flow and hid every other error as well.
        if idx < last_key:
            # Template tokens strictly between this placeholder and the next.
            gap = keys_tmpl[idx + 1][1] - position - 1
        else:
            # Template tokens after the final placeholder.
            gap = len(token_tmpl) - 1 - position
        labels.extend(['O'] * gap)  # a non-positive gap adds nothing

    return [{'text': token, 'label': label}
            for token, label in zip(token_question, labels)]


def sub_BIO(q, lookup_table):
    """Populate one question template with lookup values and BIO-tag the results.

    Args:
        q: template string with placeholders written as ``[NAME]``.
        lookup_table: mapping from placeholder name to a list of candidate values.

    Returns:
        ``{'Questions': [...], 'BIO_tagging': [...]}`` with one populated
        question and one tag list per sampled value set. A template with a
        single placeholder uses the values from ``randsample``. A template with
        several placeholders uses the combinations from ``create_mini_lookup``.
        Both lists are empty when the template has no placeholder.
    """
    # Tokens never contain whitespace, so the DOTALL flag is unnecessary here
    # and one compiled pattern serves both key detection and substitution.
    placeholder_re = re.compile(r'\[(.*?)\]')
    questions = []
    tokens = []

    # Raw string: '\w', '\[' and '\S' are invalid escapes in a normal literal.
    tokenizer = RegexpTokenizer(r'\w+|\[\w+\]|\S+')
    token_tmpl = tokenizer.tokenize(q)
    keys_tmpl = find_key(token_tmpl, placeholder_re)  # [(placeholder1, idx1), (placeholder2, idx2)]

    if len(keys_tmpl) == 1:
        for v in randsample(lookup_table[keys_tmpl[0][0]]):
            # A callable replacement keeps backslashes in the value literal.
            question = placeholder_re.sub(lambda _match, v=v: str(v), q)
            questions.append(question)
            tokens.append(BIO_tagging(question, [v], keys_tmpl, token_tmpl))

    elif len(keys_tmpl) > 1:
        for vals in create_mini_lookup(keys_tmpl, lookup_table):
            # Fill all placeholders in a single left-to-right pass over the
            # template. Substituting one placeholder at a time would re-scan
            # text that was just inserted, so a value that itself contains
            # "[...]" would be overwritten by the next value.
            replacements = iter(vals)
            question = placeholder_re.sub(
                lambda _match: str(next(replacements)), q, count=len(vals))
            questions.append(question)
            tokens.append(BIO_tagging(question, vals, keys_tmpl, token_tmpl))

    # Need to add BIO tagging manually for questions that don't have a key/placeholder.
    # Will update this part later

    output = {'Questions': questions, 'BIO_tagging': tokens}

    return output
                   

In [157]:
def create_id(question, query, index):
    """Build a record id by concatenating three string parts.

    Layout: ``<question template id><query template id><index>``.

    * The question template id is prefixed with one ``'0'`` when it is a
      single character (``1`` -> ``"01"``).
    * The query template id is appended as-is, so it occupies as many
      characters as it has digits (``1`` -> ``"1"``, ``17`` -> ``"17"``).
    * A non-empty index is left-padded with ``'0'`` to at least five
      characters (``1`` -> ``"00001"``); longer indexes are kept in full.

    Example: ``create_id(1, 1, 1)`` -> ``"01100001"``.
    """
    question_part, query_part, index_part = str(question), str(query), str(index)

    if len(question_part) == 1:
        question_part = '0' + question_part

    if index_part:
        index_part = index_part.rjust(5, '0')

    return ''.join((question_part, query_part, index_part))

def populate_questions(dataset, lookup_table, difficulty='Medium'):
    """Expand the question templates of one difficulty into populated, tagged questions.

    Args:
        dataset: DataFrame with at least the columns 'Question Template',
            'Difficulty', 'Question Template ID' and 'Query Template ID'.
            It is copied, not modified.
        lookup_table: mapping from placeholder name to candidate values,
            forwarded to ``sub_BIO``.
        difficulty: value of the 'Difficulty' column to keep (default
            ``'Medium'``, the value that was previously hard-coded).

    Returns:
        A DataFrame with one row per generated question. It has the original
        columns plus 'Questions' (populated text), 'Tokens' (BIO-tagged
        tokens) and 'id' (built by ``create_id`` from the two template ids
        and the row position).
    """
    templates = dataset.copy()

    # One cell may hold several phrasings separated by "##": one row per phrasing.
    templates['Question Template'] = templates['Question Template'].apply(lambda x: x.split("##"))
    # drop=True avoids materialising an 'index' column, which would collide
    # with a column of that name in the input.
    templates = templates.explode('Question Template').reset_index(drop=True)
    # Copy so the column assignments below do not target a filtered view.
    templates = templates[templates['Difficulty'] == difficulty].copy()

    generated = templates['Question Template'].apply(lambda x: sub_BIO(x, lookup_table))
    templates['Questions'] = generated.apply(lambda x: x['Questions'])
    templates['Tokens'] = generated.apply(lambda x: x['BIO_tagging'])

    # Explode every column: the two list columns expand in parallel while the
    # scalar columns are repeated alongside them.
    populated = templates.apply(lambda x: x.explode()).reset_index(drop=True)

    populated['Query Template ID'] = populated['Query Template ID'].apply(lambda x: int(x))
    # The id suffix is the row's position in the final table.
    populated['id'] = [
        create_id(question_id, query_id, position)
        for position, (question_id, query_id) in enumerate(
            zip(populated['Question Template ID'], populated['Query Template ID']))
    ]

    return populated

In [153]:
# 51, 51
# Scratch check: run the multi-placeholder branch of sub_BIO by hand on the
# template at row position 50, with one sampled value set, to inspect the tags.
q = df['Question Template'].iloc[50]  # select by name rather than column position -3
q = q.split('##')[0]
print('q1',q)

pattern = re.compile(r'\[(.*?)\]', re.S)
questions= []
tokens = []

# Raw string: '\w', '\[' and '\S' are invalid escapes in a normal literal.
tokenizer = RegexpTokenizer(r'\w+|\[\w+\]|\S+')
token_tmpl = tokenizer.tokenize(q)
keys_tmpl = find_key(token_tmpl, pattern) #[(placeholder1, idx1), (placeholder2, idx2)]
print('keys_tmpl: ',keys_tmpl)

mini_lookup = create_mini_lookup(keys_tmpl, lookup22, 1)
print(mini_lookup)

for vals in mini_lookup:
    # Fill all placeholders in one left-to-right pass, so a value that itself
    # contains "[...]" is not overwritten by the next substitution.
    replacements = iter(vals)
    question = re.sub(r"\[(.*?)\]", lambda m: str(next(replacements)), q, count=len(vals))
    questions.append(question)
    token_bio = BIO_tagging(question, vals, keys_tmpl, token_tmpl)
    tokens.append(token_bio)
    
tokens[0]

q1 Return a list of records of those who have [SYMPTOM_TXT] after taking [VAX_NAME] vaccine while taking [OTHER_MEDS] at the time of vaccination. 
keys_tmpl:  [('SYMPTOM_TXT', 9), ('VAX_NAME', 12), ('OTHER_MEDS', 16)]
[['Hepatitis B surface antibody negative', 'ANTHRAX (NO BRAND NAME)', 'goli']]
9
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SYMPTOM_TXT', 'I-SYMPTOM_TXT', 'I-SYMPTOM_TXT', 'I-SYMPTOM_TXT', 'I-SYMPTOM_TXT', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SYMPTOM_TXT', 'I-SYMPTOM_TXT', 'I-SYMPTOM_TXT', 'I-SYMPTOM_TXT', 'I-SYMPTOM_TXT', 'O', 'O', 'B-VAX_NAME', 'I-VAX_NAME', 'I-VAX_NAME', 'I-VAX_NAME', 'I-VAX_NAME', 'I-VAX_NAME', 'O', 'O', 'O']


[{'text': 'Return', 'label': 'O'},
 {'text': 'a', 'label': 'O'},
 {'text': 'list', 'label': 'O'},
 {'text': 'of', 'label': 'O'},
 {'text': 'records', 'label': 'O'},
 {'text': 'of', 'label': 'O'},
 {'text': 'those', 'label': 'O'},
 {'text': 'who', 'label': 'O'},
 {'text': 'have', 'label': 'O'},
 {'text': 'Hepatitis', 'label': 'B-SYMPTOM_TXT'},
 {'text': 'B', 'label': 'I-SYMPTOM_TXT'},
 {'text': 'surface', 'label': 'I-SYMPTOM_TXT'},
 {'text': 'antibody', 'label': 'I-SYMPTOM_TXT'},
 {'text': 'negative', 'label': 'I-SYMPTOM_TXT'},
 {'text': 'after', 'label': 'O'},
 {'text': 'taking', 'label': 'O'},
 {'text': 'ANTHRAX', 'label': 'B-VAX_NAME'},
 {'text': '(', 'label': 'I-VAX_NAME'},
 {'text': 'NO', 'label': 'I-VAX_NAME'},
 {'text': 'BRAND', 'label': 'I-VAX_NAME'},
 {'text': 'NAME', 'label': 'I-VAX_NAME'},
 {'text': ')', 'label': 'I-VAX_NAME'},
 {'text': 'vaccine', 'label': 'O'},
 {'text': 'while', 'label': 'O'},
 {'text': 'taking', 'label': 'O'},
 {'text': 'goli', 'label': 'B-OTHER_MEDS'},
 

In [158]:
# Generate the populated, BIO-tagged questions from the template table and the lookup values.
medium = populate_questions(df, lookup22)

In [165]:
# Save the populated table as CSV. index=False is not passed, so the DataFrame
# index is written as the first (unnamed) column.
medium.to_csv('./VAERS22/medium.csv')

In [166]:
# Display the populated table (pandas shows only the first and last rows).
medium

Unnamed: 0,Question Template ID,Data File,Field,Question Template,Query Template ID,Difficulty,Questions,Tokens,id
0,40,VAERSDATA,STATE;AGE_YRS,Retrieve all the records where patients are fr...,7,Medium,Retrieve all the records where patients are fr...,"[{'text': 'Retrieve', 'label': 'O'}, {'text': ...",40700000
1,40,VAERSDATA,STATE;AGE_YRS,Retrieve all the records where patients are fr...,7,Medium,Retrieve all the records where patients are fr...,"[{'text': 'Retrieve', 'label': 'O'}, {'text': ...",40700001
2,40,VAERSDATA,STATE;AGE_YRS,Retrieve all the records where patients are fr...,7,Medium,Retrieve all the records where patients are fr...,"[{'text': 'Retrieve', 'label': 'O'}, {'text': ...",40700002
3,40,VAERSDATA,STATE;AGE_YRS,Retrieve all the records where patients are fr...,7,Medium,Retrieve all the records where patients are fr...,"[{'text': 'Retrieve', 'label': 'O'}, {'text': ...",40700003
4,40,VAERSDATA,STATE;AGE_YRS,Retrieve all the records where patients are fr...,7,Medium,Retrieve all the records where patients are fr...,"[{'text': 'Retrieve', 'label': 'O'}, {'text': ...",40700004
...,...,...,...,...,...,...,...,...,...
9895,97,VAERSDATA;VAERSVAX,AGE_YRS;VAX_NAME,Give me a list of patients who range from [AGE...,17,Medium,Give me a list of patients who range from 37.0...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",971709895
9896,97,VAERSDATA;VAERSVAX,AGE_YRS;VAX_NAME,Give me a list of patients who range from [AGE...,17,Medium,Give me a list of patients who range from 0.17...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",971709896
9897,97,VAERSDATA;VAERSVAX,AGE_YRS;VAX_NAME,Give me a list of patients who range from [AGE...,17,Medium,Give me a list of patients who range from 29.0...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",971709897
9898,97,VAERSDATA;VAERSVAX,AGE_YRS;VAX_NAME,Give me a list of patients who range from [AGE...,17,Medium,Give me a list of patients who range from 73.0...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",971709898


In [159]:
# Spot-check a single generated row by position.
medium.iloc[7888]

Question Template ID                                                   90
Data File                                                       VAERSDATA
Field                                    SYMPTOM_TEXT;VAX_NAME;OTHER_MEDS
Question Template       Return a list of records of those who have [SY...
Query Template ID                                                      13
Difficulty                                                         Medium
Questions               Return a list of records of those who have Voc...
Tokens                  [{'text': 'Return', 'label': 'O'}, {'text': 'a...
id                                                              901307888
Name: 7888, dtype: object

In [115]:
# List the column names. In the recorded output, 'Field ' has a trailing space.
list(medium)

['Question Template ID',
 'Data File',
 'Field ',
 'Question Template',
 'Query Template ID',
 'Difficulty',
 'Questions',
 'Tokens',
 'id']

In [162]:
# Flatten the populated table into JSON-ready records: the question text, the
# query template id as a string, and the BIO-tagged tokens.
record_fields = zip(medium['Questions'], medium['Query Template ID'], medium['Tokens'])
out = [
    {'question': question_text, 'query': str(query_id), 'token': tagged_tokens}
    for question_text, query_id, tagged_tokens in tqdm(record_fields, total=len(medium))
]

100%|████████████████████████████████████████████████████████████████████████████| 9900/9900 [00:01<00:00, 5013.98it/s]


In [167]:
# Write the records to JSON. The context manager closes (and flushes) the file
# even if serialization raises part-way through.
with open('./VAERS22/medium_query.json', 'w') as f:
    json.dump(out, f)

In [164]:
# Spot-check a single exported record by position.
out[7888]

{'question': 'Return a list of records of those who have Vocal cord paralysis after taking TETANUS DIPHTHERIA (NO BRAND NAME) vaccine while taking coreg at the time of vaccination. ',
 'query': '13',
 'token': [{'text': 'Return', 'label': 'O'},
  {'text': 'a', 'label': 'O'},
  {'text': 'list', 'label': 'O'},
  {'text': 'of', 'label': 'O'},
  {'text': 'records', 'label': 'O'},
  {'text': 'of', 'label': 'O'},
  {'text': 'those', 'label': 'O'},
  {'text': 'who', 'label': 'O'},
  {'text': 'have', 'label': 'O'},
  {'text': 'Vocal', 'label': 'B-SYMPTOM_TXT'},
  {'text': 'cord', 'label': 'I-SYMPTOM_TXT'},
  {'text': 'paralysis', 'label': 'I-SYMPTOM_TXT'},
  {'text': 'after', 'label': 'O'},
  {'text': 'taking', 'label': 'O'},
  {'text': 'TETANUS', 'label': 'B-VAX_NAME'},
  {'text': 'DIPHTHERIA', 'label': 'I-VAX_NAME'},
  {'text': '(', 'label': 'I-VAX_NAME'},
  {'text': 'NO', 'label': 'I-VAX_NAME'},
  {'text': 'BRAND', 'label': 'I-VAX_NAME'},
  {'text': 'NAME', 'label': 'I-VAX_NAME'},
  {'text'