In [28]:
import pandas as pd
import numpy as np
import re
import json
from random import sample
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

# Generate Lookup table

**Fields only need unqiue:**
 1. DATA: 'RECVDATE', 'STATE', 'AGE_YRS', 'CAGE_YR', 'CAGE_MO', 'SEX', 'RPT_DATE','DIED', 'DATEDIED', 'L_THREAT', 'ER_VISIT', 'HOSPITAL', 'HOSPDAYS', 'X_STAY', 'DISABLE', 'RECOVD', 'VAX_DATE', 'ONSET_DATE', 'NUMDAYS','V_ADMINBY', 'V_FUNDBY','SPLTTYPE', 'FORM_VERS', 'TODAYS_DATE', 'BIRTH_DEFECT', 'OFC_VISIT', 'ER_ED_VISIT'
 2. VAX: 'VAX_TYPE','VAX_MANU','VAX_LOT','VAX_DOSE_SERIES','VAX_ROUTE','VAX_SITE','VAX_NAME'

**Text fields need regx/tokenize:**
 1. DATA: 'LAB_DATA','OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX','ALLERGIES'

**Equivalent:**
 1. DATA:'SYMPTOM_TEXT' = SYMPTOMS: Symptoms list



In [29]:
def vax_name(field):
    value = []
    for x in field:
        x = re.sub(r'\s{1}\(.*\)', '', x)
        x = x.split(' + ')
        value += x[:]
    value = list(set(value))
    return value

In [30]:
def build_lookup(data, vax, symp):
    datasets = []
    for f in [data, vax, symp]:
        df = pd.read_csv(f, encoding = 'Windows-1252')
        datasets.append(df)
        
    text_fields = ['LAB_DATA','OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX','ALLERGIES']
    symp_list = ['SYMPTOM1', 'SYMPTOM2','SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
    lookup_dict = {}
    SYMPTOM = []
    
    for dataset in datasets:
        cols = list(dataset)
        cols.remove('VAERS_ID')
        for x in cols:
            value = []
            if (x not in text_fields) and (x != 'SYMPTOM_TEXT'):
                value = dataset[x].dropna().unique().tolist()
                lookup_dict[x] = value
            if x in symp_list:
                temp = dataset[x].dropna().unique().tolist()
                SYMPTOM += temp 
        SYMPTOM = list(set(SYMPTOM))
    lookup_dict['SYMPTOM'] = SYMPTOM
    lookup_dict['SYMPTOM_TXT'] = SYMPTOM
    lookup_dict['PRIOR_VAX'] = vax_name(lookup_dict['VAX_NAME'])
    return lookup_dict          

In [32]:
lookup2021 = build_lookup(data='./VAERS22/2022VAERSDATA.csv', vax='./VAERS22/2022VAERSVAX.csv', symp='./VAERS22/2022VAERSSYMPTOMS.csv')

  df = pd.read_csv(f, encoding = 'Windows-1252')


In [33]:
fout = open("lookup2022.json", "w")
json.dump(lookup2021, fout)
fout.close()

# Populate Questions & BIO Tagging

In [35]:
df_lt = pd.read_excel("./Template/Template-Easy-Text2ESQ.xlsx")

In [36]:
f = open('./VAERS22/lookup2022.json',) 
lookup = json.load(f)

In [37]:
def find_key(tokens, pattern):
    key_idx = []
    for idx, x in enumerate(tokens):
        token_key = re.findall(pattern, x)
        if token_key != []:
            key_idx.append((token_key[0], idx))
    return key_idx

def BIO_tagging(question, values_tmpl, keys_tmpl, token_tmpl):
    
    BIO = []
    token_question = word_tokenize(question)
    v_len = [len(word_tokenize(str(x))) for x in values_tmpl]
    BIO = ['O' for x in range(keys_tmpl[0][1])]
    for idx, l in enumerate(v_len):
        for x in range(l):
            if x == 0:
                beginning = 'B-{0}'.format(keys_tmpl[idx][0])
                BIO.append(beginning)
            else: 
                inside = 'I-{0}'.format(keys_tmpl[idx][0])
                BIO.append(inside)
        try:
            ph_idx_diff = keys_tmpl[idx+1][1] - keys_tmpl[idx][1]
            if ph_idx_diff > 1:
                for x in range(ph_idx_diff -1):
                    BIO.append("O")
        except:
            sen_idx_diff = len(token_tmpl) - 1 - keys_tmpl[idx][1]
            if sen_idx_diff > 0:
                for x in range(sen_idx_diff):
                    BIO.append("O")
                    
    output = []
    for x in list(zip(token_question, BIO)):
        temp_dict = {'text': x[0], 'label':x[1]}
        output.append(temp_dict)
            
    return output


def sub_BIO(q, lookup_table):
    pattern = re.compile(r'\[(.*?)\]', re.S)
    questions= []
    tokens = []
    
    tokenizer = RegexpTokenizer('\w+|\[\w+\]|\S+')
    token_tmpl = tokenizer.tokenize(q)
    keys_tmpl = find_key(token_tmpl, pattern) #[(placeholder1, idx1), (placeholder2, idx2)]
    
    if len(keys_tmpl) == 1:
        for v in randsample(lookup_table[keys_tmpl[0][0]]):
            values_tmpl = [v]
            question = re.sub(r"\[(.*?)\]", lambda x: str(v), q)
            questions.append(question)
            token_bio = BIO_tagging(question, values_tmpl, keys_tmpl, token_tmpl)
            tokens.append(token_bio)
            
    elif len(keys_tmpl) == 2:
        for v, vs in zip(randsample(lookup_table[keys_tmpl[0][0]]), randsample(lookup_table[keys_tmpl[1][0]])):
            values_tmpl = [v, vs]
            question = re.sub(r"\[(.*?)\]", lambda x: str(v), q, count=1)
            question = re.sub(r"\[(.*?)\]", lambda x: str(vs), question)
            questions.append(question)
            token_bio = BIO_tagging(question, values_tmpl, keys_tmpl, token_tmpl)
            tokens.append(token_bio)
    
    # Need to add BIO tagging manually for questions that don't have a key/placeholder.
    # Will update this part later

    output = {'Questions':questions, 'BIO_tagging':tokens}
            
    return output
                   

In [38]:
def randsample(lookup_list):
    # random sample the lookup table value based on the total number of value in one field 
#     if len(lookup_list) <= 100:
#         value_list = lookup_list
#     elif len(lookup_list) > 100 and len(lookup_list) <= 300 :
#         value_list = sample(lookup_list, 100)
#     elif len(lookup_list) > 300:
#         value_list = sample(lookup_list, 150)
#     return value_list
    return lookup_list

def create_id(question, query, index):
    
    # generate id with a format of "Question template id" + "Query template id" + "Index"
    # example: "01100001" 
    # the first two digits "01" indicates the question template id 1
    # the third digit "1" indicates the query template id 1
    # the last five digits "00001" indicates the index of the question 1
    
    question = str(question)
    query = str(query)
    index = str(index)
    
    if len(question) == 1:
        question = '0' + question
    
    diff = 5 - len(index)
    if diff < 5:
        index = '0'*diff + index
    
    output = question + query + index
    
    return output


def populate_questions(dataset, lookup_table):
    df = dataset.copy()
    df['Question Template'] = df['Question Template'].apply(lambda x: x.split("##"))
    df = df.explode('Question Template').reset_index().drop(columns='index')
    df = df[df['Difficulty'] == 'Easy']
    
    df['output'] = df['Question Template'].apply(lambda x: sub_BIO(x, lookup_table))
    df['Questions'] = df.output.apply(lambda x: x['Questions'])
    df['Tokens'] = df.output.apply(lambda x: x['BIO_tagging'])
    df = df.drop(columns='output')
    
    df = df.apply(lambda x: x.explode()).reset_index()

    df['index'] = df.index.values
    df['Query Template ID'] = df['Query Template ID'].apply(lambda x: int(x))
    df['id'] = list(map(lambda x, y, z: create_id(x, y, z), \
                        df['Question Template ID'], \
                        df['Query Template ID'], \
                        df['index']))
    df = df.drop(columns='index')

    return df

In [39]:
df = populate_questions(df_lt, lookup)

In [41]:
df

Unnamed: 0,Question Template ID,Data File,Field,Question Template,Query Template ID,Difficulty,Questions,Tokens,id
0,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",01100000
1,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",01100001
2,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",01100002
3,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",01100003
4,1,VAERSDATA,RECVDATE,Give me all the patients whose information are...,1,Easy,Give me all the patients whose information are...,"[{'text': 'Give', 'label': 'O'}, {'text': 'me'...",01100004
...,...,...,...,...,...,...,...,...,...
272854,50,VAERSSYMPTOMS,SYMPTOM1，SYMPTOM2，SYMPTOM3,which [SYMPTOM] is the most common after vac...,3,Easy,which Liver transplant rejection is the most...,"[{'text': 'which', 'label': 'O'}, {'text': 'Li...",503272854
272855,50,VAERSSYMPTOMS,SYMPTOM1，SYMPTOM2，SYMPTOM3,which [SYMPTOM] is the most common after vac...,3,Easy,which Device loosening is the most common af...,"[{'text': 'which', 'label': 'O'}, {'text': 'De...",503272855
272856,50,VAERSSYMPTOMS,SYMPTOM1，SYMPTOM2，SYMPTOM3,which [SYMPTOM] is the most common after vac...,3,Easy,which Myasthenic syndrome is the most common...,"[{'text': 'which', 'label': 'O'}, {'text': 'My...",503272856
272857,50,VAERSSYMPTOMS,SYMPTOM1，SYMPTOM2，SYMPTOM3,which [SYMPTOM] is the most common after vac...,3,Easy,which Compartment pressure test is the most ...,"[{'text': 'which', 'label': 'O'}, {'text': 'Co...",503272857


In [40]:
questions = df.set_index(['id'])['Questions'].to_dict()

In [47]:
questions

{'01100000': 'Give me all the patients whose information are received on 03/26/2022. ',
 '01100001': 'Give me all the patients whose information are received on 01/11/2022. ',
 '01100002': 'Give me all the patients whose information are received on 02/17/2022. ',
 '01100003': 'Give me all the patients whose information are received on 03/31/2022. ',
 '01100004': 'Give me all the patients whose information are received on 04/17/2022. ',
 '01100005': 'Give me all the patients whose information are received on 03/24/2022. ',
 '01100006': 'Give me all the patients whose information are received on 02/06/2022. ',
 '01100007': 'Give me all the patients whose information are received on 03/04/2022. ',
 '01100008': 'Give me all the patients whose information are received on 05/27/2022. ',
 '01100009': 'Give me all the patients whose information are received on 04/01/2022. ',
 '01100010': 'Give me all the patients whose information are received on 04/08/2022. ',
 '01100011': 'Give me all the pa

### Check query num

In [94]:
# save all questions an corresponding query number
query_q = []
for x in tqdm(range(len(df))):
    temp = {}
    temp['question'] = df.iloc[x]['Questions']
    temp['query'] = str(df.iloc[x]['Query Template ID'])
    query_q.append(temp)

100%|████████████████████████████████████████████████████████████████████████| 272859/272859 [00:31<00:00, 8609.93it/s]


In [53]:
f = open('./VAERS22/Questions.json', 'r')
zkp_q = json.load(f)

In [96]:
zkq_query = []
for x in zkp_q: 
    temp = {}
    temp['question'] = x['Question']
    if x['Field'] == 'PRIOR_VAX': 
        temp['query'] = '1'
    else: 
        temp['query'] = '3'
    zkq_query.append(temp)

In [97]:
query_all = query_q + zkq_query

In [68]:
f = open('./VAERS22/nl_22.json')
data = json.load(f)
questions22 =data['en']

In [99]:
query_dict = {}
for x in query_all:
    if x['question'] in query_dict.keys():
        print("------------------------------------")
        print("1. ",x['question'], ": ", query_dict[x['question']])
        print("2. ",x['question'], ": ", x['query'])
    else: 
        query_dict[x['question']] = x['query']

------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  nan :  2
------------------------------------
1.  nan :  2
2.  n

In [104]:
out_query_qs = []
for x in questions22:
    temp = {}
    temp['question'] = x
    if x not in query_dict.keys():
        temp['query'] = '4'
    else:
        temp['query'] = query_dict[x]
    out_query_qs.append(temp)

In [105]:
len(out_query_qs)

11283

In [106]:
f = open('./VAERS22/query22.json', 'w')
json.dump(out_query_qs, f)
f.close()

### Save file

In [54]:
list(set([x['Field'] for x in zkp_q]))

['ALLERGIES', 'CUR_ILL', 'LAB_DATA', 'HISTORY', 'OTHER_MEDS', 'PRIOR_VAX']

In [15]:
key_frq = {}
for x in zkp_q:
    if x['Field'] not in key_frq.keys():
        key_frq[x['Field']] = 1
    else: 
        cnt = key_frq[x['Field']]
        cnt +=1
        key_frq[x['Field']] = cnt


In [16]:
key_frq

{'ALLERGIES': 300,
 'LAB_DATA': 300,
 'OTHER_MEDS': 300,
 'CUR_ILL': 400,
 'HISTORY': 400,
 'PRIOR_VAX': 300}

In [56]:
def key_list(var_name):
    out = []
    for x in zkp_q: 
        if x['Field'] == var_name:
            out.append(x['Question'])
    return out

In [57]:
ALLERGIES = key_list('ALLERGIES')
CUR_ILL = key_list('CUR_ILL')
LAB_DATA = key_list('LAB_DATA')
HISTORY = key_list('HISTORY')
OTHER_MEDS = key_list('OTHER_MEDS')
priorVax = key_list('PRIOR_VAX')

In [58]:
priorVax

['Give me all the patients who got 4/11/21 before. ',
 'Give me all the patients who got 2003/2011/2014 before. ',
 'Give me all the patients who got 5/30/2021 before. ',
 'Give me all the patients who got 2/2/21 before. ',
 'Give me all the patients who got 11/20/2021 before. ',
 'Give me all the patients who got 9/9/21 before. ',
 'Give me all the patients who got 10/13/21 before. ',
 'Give me all the patients who got 3/28/2022 before. ',
 'Give me all the patients who got 1/8/21 before. ',
 'Give me all the patients who got 10/12/21 before. ',
 'Give me all the patients who got 03/19/2021 before. ',
 'Give me all the patients who got 05/20/2021 before. ',
 'Give me all the patients who got 01/04/2021 before. ',
 'Give me all the patients who got 3/30/20 before. ',
 'Give me all the patients who got 10/29/2022 before. ',
 'Give me all the patients who got 12/20/2013 before. ',
 'Give me all the patients who got 4/2/21 before. ',
 'Give me all the patients who got 01/11/2021 before. '

In [53]:
q1 = list(questions.values())
q2 = [x['Question'] for x in temp]

In [54]:
out = q1 + q2

In [56]:
fout = open('questions22.json', "w")
json.dump(out, fout)
fout.close()

In [20]:
tokens = {'tokens': df.Tokens.tolist()}

In [17]:
fout = open("bio2021.json", "w")
json.dump(tokens, fout)
fout.close()