In [1]:
import pandas as pd
import numpy as np
import re
import json
from random import sample
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

# Generate Lookup table

**Fields that only need their unique values:**
 1. DATA: 'RECVDATE', 'STATE', 'AGE_YRS', 'CAGE_YR', 'CAGE_MO', 'SEX', 'RPT_DATE','DIED', 'DATEDIED', 'L_THREAT', 'ER_VISIT', 'HOSPITAL', 'HOSPDAYS', 'X_STAY', 'DISABLE', 'RECOVD', 'VAX_DATE', 'ONSET_DATE', 'NUMDAYS','V_ADMINBY', 'V_FUNDBY','SPLTTYPE', 'FORM_VERS', 'TODAYS_DATE', 'BIRTH_DEFECT', 'OFC_VISIT', 'ER_ED_VISIT'
 2. VAX: 'VAX_TYPE','VAX_MANU','VAX_LOT','VAX_DOSE_SERIES','VAX_ROUTE','VAX_SITE','VAX_NAME'

**Text fields that need regex/tokenization:**
 1. DATA: 'LAB_DATA','OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX','ALLERGIES'

**Equivalent:**
 1. DATA: 'SYMPTOM_TEXT' is treated as equivalent to the SYMPTOMS file's symptom list (SYMPTOM1 to SYMPTOM5)



In [2]:
def vax_name(field):
    """Collect the distinct component names found in a list of vaccine names.

    For every entry, each run of "one whitespace character followed by a
    parenthesised span" is removed (the match is greedy, so it extends to the
    last ")" available), and what is left is split on " + ".

    Args:
        field: Iterable of vaccine-name strings.

    Returns:
        list: The unique name components. The order is unspecified because
        duplicates are removed by going through a ``set``.
    """
    components = []
    for entry in field:
        base_name = re.sub(r'\s{1}\(.*\)', '', entry)
        components.extend(base_name.split(' + '))
    return list(set(components))

In [3]:
def build_lookup(data, vax, symp):
    """Build a lookup table of the distinct values of each VAERS field.

    The three CSV files are read one after another. For every column except
    'VAERS_ID', the free-text fields and 'SYMPTOM_TEXT', the unique non-null
    values are stored under the column name. The values of the five SYMPTOMn
    columns are additionally pooled into one de-duplicated list.

    Args:
        data: Path of the VAERSDATA CSV file.
        vax: Path of the VAERSVAX CSV file.
        symp: Path of the VAERSSYMPTOMS CSV file.

    Returns:
        dict: Column name -> list of unique values, plus
            'SYMPTOM' / 'SYMPTOM_TXT': the pooled symptom list (same list
                object under both keys),
            'PRIOR_VAX': the name components derived from the 'VAX_NAME'
                values by ``vax_name``.

    Raises:
        KeyError: If none of the files provides a 'VAX_NAME' column.
    """
    text_fields = ['LAB_DATA', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES']
    symp_list = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
    lookup_dict = {}
    symptoms = set()

    for path in (data, vax, symp):
        # low_memory=False lets pandas infer each column's dtype from the whole
        # file instead of chunk by chunk, so a column cannot end up holding a
        # mix of numbers and strings (the situation pandas' DtypeWarning
        # reports) and the lookup values of a field share one type.
        dataset = pd.read_csv(path, encoding='Windows-1252', low_memory=False)

        for col in dataset.columns:
            # The report id is not a queryable field; skipping it here (rather
            # than list.remove) also tolerates a file without that column.
            if col == 'VAERS_ID':
                continue
            if col not in text_fields and col != 'SYMPTOM_TEXT':
                lookup_dict[col] = dataset[col].dropna().unique().tolist()
            if col in symp_list:
                symptoms.update(dataset[col].dropna().unique().tolist())

    SYMPTOM = list(symptoms)
    lookup_dict['SYMPTOM'] = SYMPTOM
    # NOTE(review): the key is spelled 'SYMPTOM_TXT' although the column is
    # 'SYMPTOM_TEXT'; kept as-is because existing templates may reference
    # it - confirm against the template sheet.
    lookup_dict['SYMPTOM_TXT'] = SYMPTOM
    lookup_dict['PRIOR_VAX'] = vax_name(lookup_dict['VAX_NAME'])
    return lookup_dict

In [4]:
# Build the lookup table from the three 2021 VAERS CSV files.
lookup2021 = build_lookup(
    data='2021VAERSDATA.csv',
    vax='2021VAERSVAX.csv',
    symp='2021VAERSSYMPTOMS.csv',
)

  lookup2021 = build_lookup(data='2021VAERSDATA.csv', vax='2021VAERSVAX.csv', symp='2021VAERSSYMPTOMS.csv')


In [5]:
# Persist the lookup table as JSON. The context manager closes the file even
# when json.dump raises part-way through.
with open("lookup2021.json", "w") as fout:
    json.dump(lookup2021, fout)

# Populate Questions & BIO Tagging

In [6]:
# Load the template sheet that is later passed to populate_questions.
df_lt = pd.read_excel(io="Text2ESQ_lt.xlsx")

In [7]:
# Reload the lookup table written earlier; the context manager makes sure the
# file handle is closed once the JSON has been parsed.
with open('lookup2021.json') as f:
    lookup = json.load(f)

In [8]:
def find_key(tokens, pattern):
    key_idx = []
    for idx, x in enumerate(tokens):
        token_key = re.findall(pattern, x)
        if token_key != []:
            key_idx.append((token_key[0], idx))
    return key_idx

def BIO_tagging(question, values_tmpl, keys_tmpl, token_tmpl):
    """Pair every token of a populated question with a BIO label.

    The label sequence is built from the template layout: template tokens
    outside a placeholder become 'O', and each placeholder is expanded to one
    'B-<name>' followed by 'I-<name>' labels, one label per token of the
    substituted value.

    Args:
        question: The populated question text.
        values_tmpl: The substituted values, in placeholder order.
        keys_tmpl: ``[(placeholder_name, template_token_index), ...]`` in
            template order, one entry per value.
        token_tmpl: The tokens of the question template.

    Returns:
        list[dict]: One ``{'text': token, 'label': label}`` per question
        token. ``zip`` stops at the shorter sequence, so if the question
        tokens and the derived labels differ in length the surplus is dropped.

    Raises:
        IndexError: If ``keys_tmpl`` is empty or shorter than ``values_tmpl``.
    """
    token_question = word_tokenize(question)
    value_lengths = [len(word_tokenize(str(value))) for value in values_tmpl]

    # Template tokens in front of the first placeholder.
    BIO = ['O'] * keys_tmpl[0][1]

    for idx, length in enumerate(value_lengths):
        key_name, key_pos = keys_tmpl[idx]

        if length > 0:
            BIO.append('B-{0}'.format(key_name))
            BIO.extend(['I-{0}'.format(key_name)] * (length - 1))

        # Count the template tokens up to the next placeholder, or up to the
        # end of the template after the last one. An explicit bounds check
        # replaces the former bare ``except:``, which would also have hidden
        # unrelated errors.
        if idx + 1 < len(keys_tmpl):
            gap = keys_tmpl[idx + 1][1] - key_pos - 1
        else:
            gap = len(token_tmpl) - 1 - key_pos
        BIO.extend(['O'] * gap)  # a non-positive gap adds nothing

    return [{'text': text, 'label': label}
            for text, label in zip(token_question, BIO)]


def sub_BIO(q, lookup_table):
    """Populate a question template and BIO-tag every generated question.

    Placeholders are written as ``[NAME]`` in the template. For each
    placeholder a list of candidate values is drawn with ``randsample`` from
    ``lookup_table[NAME]``; the lists are combined position-wise with ``zip``
    (so the shortest list determines how many questions are produced).

    Args:
        q: Question template text.
        lookup_table: Mapping from placeholder name to its list of values.

    Returns:
        dict: ``{'Questions': [...], 'BIO_tagging': [...]}`` with one tagged
        token list per question. Both lists are empty when the template has
        no placeholder.

    Raises:
        KeyError: If a placeholder name is missing from ``lookup_table``.
    """
    placeholder = r"\[(.*?)\]"
    pattern = re.compile(r'\[(.*?)\]', re.S)
    questions = []
    tokens = []

    # Raw string: \w, \[ and \S are regex escapes, not Python string escapes.
    tokenizer = RegexpTokenizer(r'\w+|\[\w+\]|\S+')
    token_tmpl = tokenizer.tokenize(q)
    keys_tmpl = find_key(token_tmpl, pattern)  # [(placeholder1, idx1), (placeholder2, idx2), ...]

    if keys_tmpl:
        # One candidate list per placeholder, in template order. This handles
        # any number of placeholders instead of only one or two.
        candidates = [randsample(lookup_table[name]) for name, _ in keys_tmpl]

        for combination in zip(*candidates):
            values_tmpl = list(combination)

            # Fill all placeholders in a single pass so that brackets inside
            # an inserted value are never mistaken for a placeholder. Bracket
            # groups beyond the detected placeholders receive the last value.
            pending = iter(values_tmpl)
            last_value = values_tmpl[-1]
            question = re.sub(placeholder,
                              lambda match: str(next(pending, last_value)),
                              q)

            questions.append(question)
            tokens.append(BIO_tagging(question, values_tmpl, keys_tmpl, token_tmpl))

    # Need to add BIO tagging manually for questions that don't have a key/placeholder.
    # Will update this part later

    output = {'Questions': questions, 'BIO_tagging': tokens}

    return output
                   

In [9]:
def randsample(lookup_list):
    """Cap the number of lookup values used for one field.

    Args:
        lookup_list: List of candidate values.

    Returns:
        list: ``lookup_list`` itself when it has at most 100 entries; a random
        sample of 100 entries when it has 101 to 300; a random sample of 200
        entries when it has more than 300.
    """
    total = len(lookup_list)
    if total <= 100:
        return lookup_list
    if total <= 300:
        return sample(lookup_list, 100)
    return sample(lookup_list, 200)


def create_id(question, query, index):
    """Compose a question id from its three numeric parts.

    Layout: question template id, query template id, running index.
    Example: "01100001"
      * "01"    - question template id 1 (a single character is padded to two)
      * "1"     - query template id 1 (used as given)
      * "00001" - question index 1 (left-padded with zeros to five characters)

    Args:
        question: Question template id.
        query: Query template id.
        index: Index of the generated question.

    Returns:
        str: The concatenated id.
    """
    question_part, query_part, index_part = (
        str(part) for part in (question, query, index)
    )

    # Only a one-character template id receives a leading zero.
    if len(question_part) == 1:
        question_part = question_part.rjust(2, '0')

    # Zero-pad up to five characters; longer indices and an empty string are
    # left untouched.
    if index_part:
        index_part = index_part.rjust(5, '0')

    return ''.join((question_part, query_part, index_part))


def populate_questions(dataset, lookup_table):
    """Expand the 'Easy' question templates into tagged questions with ids.

    Steps:
      1. Split each 'Question Template' cell on "##" into one row per template.
      2. Keep the rows whose 'Difficulty' is 'Easy'.
      3. Run ``sub_BIO`` on every template and store the generated questions
         and token tags in the 'Questions' and 'Tokens' columns.
      4. Explode those lists into one row per generated question.
      5. Add an 'id' column built by ``create_id`` from the question template
         id, the query template id and the row number.

    Args:
        dataset: DataFrame with at least the columns 'Question Template',
            'Difficulty', 'Question Template ID' and 'Query Template ID'.
        lookup_table: Mapping from placeholder name to its list of values.

    Returns:
        pandas.DataFrame: One row per generated question. ``dataset`` itself
        is not modified.
    """
    templates = dataset.copy()
    templates['Question Template'] = templates['Question Template'].apply(lambda x: x.split("##"))
    templates = templates.explode('Question Template').reset_index(drop=True)

    # Take an explicit copy of the filtered rows: the column assignments below
    # then write to an independent frame instead of a slice of ``templates``
    # (which is what triggers pandas' SettingWithCopyWarning).
    easy = templates[templates['Difficulty'] == 'Easy'].copy()

    generated = easy['Question Template'].apply(lambda x: sub_BIO(x, lookup_table))
    easy['Questions'] = generated.apply(lambda x: x['Questions'])
    easy['Tokens'] = generated.apply(lambda x: x['BIO_tagging'])

    # Explode column by column; 'Questions' and 'Tokens' hold lists of equal
    # length per row, the remaining columns are repeated for every question.
    df = easy.apply(lambda x: x.explode()).reset_index(drop=True)

    df['Query Template ID'] = df['Query Template ID'].apply(int)
    # After the reset the index is the running number of the question.
    df['id'] = [
        create_id(question_id, query_id, row_number)
        for question_id, query_id, row_number in zip(
            df['Question Template ID'], df['Query Template ID'], df.index
        )
    ]

    return df

In [10]:
# Generate the questions and their BIO tags from the template sheet.
df = populate_questions(dataset=df_lt, lookup_table=lookup)

In [11]:
# Display the populated frame: templates, generated questions, token tags and ids.
df

Unnamed: 0,Question Template ID,Data File,Field,Question Template,Query Template ID,Difficulty,Questions,Tokens,id
0,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'BIO': 'O'}, {'text': 'me', ...",01100000
1,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'BIO': 'O'}, {'text': 'me', ...",01100001
2,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'BIO': 'O'}, {'text': 'me', ...",01100002
3,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'BIO': 'O'}, {'text': 'me', ...",01100003
4,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'BIO': 'O'}, {'text': 'me', ...",01100004
...,...,...,...,...,...,...,...,...,...
8366,49,VAERSSYMPTOMS,"SYMPTOM1，SYMPTOM2，SYMPTOM3,SYMPTOM4，SYMPTOM5",Is there any person have [SYMPTOM] after vacc...,3,Easy,Is there any person have Application site war...,"[{'text': 'Is', 'BIO': 'O'}, {'text': 'there',...",49308366
8367,49,VAERSSYMPTOMS,"SYMPTOM1，SYMPTOM2，SYMPTOM3,SYMPTOM4，SYMPTOM5",Is there any person have [SYMPTOM] after vacc...,3,Easy,Is there any person have Infective pulmonary ...,"[{'text': 'Is', 'BIO': 'O'}, {'text': 'there',...",49308367
8368,49,VAERSSYMPTOMS,"SYMPTOM1，SYMPTOM2，SYMPTOM3,SYMPTOM4，SYMPTOM5",Is there any person have [SYMPTOM] after vacc...,3,Easy,Is there any person have Tongue geographic af...,"[{'text': 'Is', 'BIO': 'O'}, {'text': 'there',...",49308368
8369,49,VAERSSYMPTOMS,"SYMPTOM1，SYMPTOM2，SYMPTOM3,SYMPTOM4，SYMPTOM5",Is there any person have [SYMPTOM] after vacc...,3,Easy,Is there any person have Heart rate irregular...,"[{'text': 'Is', 'BIO': 'O'}, {'text': 'there',...",49308369


In [14]:
# Wrap the per-question token/label lists under a single 'tokens' key.
tokens = dict(tokens=df['Tokens'].tolist())

In [17]:
# Write the BIO-tagged tokens as JSON. The context manager closes the file
# even when json.dump raises part-way through.
with open("bio2021.json", "w") as fout:
    json.dump(tokens, fout)