# Install libraries, run utility functions, and import the queries (raw topics)

In [None]:
# Install the two packages the notebook imports below: torch and transformers.
# NOTE(review): no versions are pinned, so the installed releases depend on when this cell runs.
!pip install torch
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from google.colab import drive
drive.mount('/content/drive')
import os

import numpy as np
import pandas as pd
import torch
from bs4 import BeautifulSoup
from pandas._libs.algos import diff_2d

# Single checkpoint name shared by the tokenizer and the token-classification model,
# so the two can never be loaded from different checkpoints by accident.
ner_checkpoint = "alvaroalon2/biobert_diseases_ner"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

## Select the collection and its related topics to create the reformulated topics

In [None]:
# Collection to process: 'trec_clc' for TREC 2021 clinical, or 'cds_clinical' for the
# remaining 3 collections, which share the same topics.
collection = 'cds_clinical'

# For every supported collection: (topics file, directory for the extracted entities).
collection_paths = {
  'trec_clc': (
    './experiments/topics/trec_clc/topics2021.txt',
    './experiments/topics/trec_clc/extracted_med_entities/',
  ),
  'cds_clinical': (
    './topics-2014_2015-description.topics',
    './experiments/topics/cds_clinical/extracted_med_entities/',
  ),
}

# Fail early on an unknown collection name instead of running with undefined paths.
if collection not in collection_paths:
  raise Exception("Invalid selection of topics. Use appropriate collections (trec_clc for TREC 2021 clinical. cds_clinical for cds and clinical collections)")

path_to_topics, path_to_save_entities = collection_paths[collection]

## Loading the associated topics

In [None]:
# getting trec 2021 Data
def get_trec(path):
  """Read the topics table stored as CSV at `path` and return it as a DataFrame."""
  return pd.read_csv(path)

# parsing clinical and cds data
def parse_xml(path):
  """Parse a topics file into a DataFrame with the columns 'qid' and 'query'.

  The file content is handed to BeautifulSoup with the 'xml' parser. The text of
  every NUM element becomes one 'qid' value and the text of every TITLE element
  one 'query' value, each in the order `find_all` returns them.

  NOTE(review): NUM and TITLE elements are collected independently, so the two
  columns only line up when every topic has exactly one of each - confirm for
  the topic files in use.
  """
  with open(path, 'r') as handle:
    markup = BeautifulSoup(handle.read(), 'xml')
  topic_ids = [node.text for node in markup.find_all('NUM')]
  topic_texts = [node.text for node in markup.find_all('TITLE')]
  return pd.DataFrame({'qid': topic_ids, 'query': topic_texts})

# Load the topics of the selected collection from the mounted Google Drive.
if collection == 'trec_clc':
  topics = get_trec('/content/drive/MyDrive/data_colab/table_topics2021.csv')
elif collection == 'cds_clinical':
  topics = parse_xml('/content/drive/MyDrive/data_colab/topics-2014_2015-description.topics')
else:
  # Without this branch an unknown name would leave `topics` undefined (or stale from
  # an earlier run) and only fail later, further away from the actual cause.
  raise ValueError(f"Unknown collection: {collection!r}")

# Preview as the cell's last top-level expression: a `topics.head()` nested inside the
# if/elif branches is evaluated but never rendered by Jupyter.
topics.head()

# BioBert Implementation

## Example

In [None]:
# Example clinical description used to try the model on a single text.
example_text = "The patient is a 24-year-old man who has had type 1 diabetes for 11 years. He presents to the emergency room with hyperglycemia and concern for possible diabetic ketoacidosis after not taking his insulin for 3 days. The patient reports that he is currently homeless and has lost his supply of insulin, syringes, glucometer, and glucose testing supplies. The patient states that at the time of his initial diagnosis with type 1 diabetes he was hospitalized with a glucose value >1000 mg/dL. At the time, he was experiencing polyuria, polydipsia, and polyphagia."

# Tokenize without special tokens and return PyTorch tensors ("pt").
inputs = tokenizer(example_text, add_special_tokens=False, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest-scoring class along the last dimension of the logits.
predicted_token_class_ids = logits.argmax(-1)

# Note that tokens are classified rather than input words, which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word.
predicted_tokens_classes = [
    model.config.id2label[class_id.item()] for class_id in predicted_token_class_ids[0]
]
print(predicted_tokens_classes)

## Implementation

In [None]:
def tok_to_words(tokens_list,labels_list):
  """Merge word-piece tokens labelled as disease into whole entity strings.

  Args:
    tokens_list: word-piece tokens; a token starting with "##" continues the
      previous token of the same word.
    labels_list: one label per token. Labels containing 'B-DISEASE' or
      'I-DISEASE' mark entity tokens; any other label is treated as outside.
      Tokens and labels are paired with zip, so surplus items in the longer
      list are ignored.

  Returns:
    A list with one string per entity, in order of appearance. Words inside an
    entity are separated by a single space and word pieces are glued together
    without the "##" marker. The list is empty when no entity is found.

  Fixes over the previous version:
    * an entity that ends on the very last token is no longer dropped;
    * a new word labelled 'B-DISEASE' starts a new entity instead of being
      merged into the previous one;
    * entities no longer start with a spurious space;
    * the debug print of every (token, label) pair is gone.
  """
  entities = []
  words = []  # words of the entity currently being assembled
  for token, label in zip(tokens_list, labels_list):
    is_piece = token.startswith("##")
    text = token[2:] if is_piece else token
    is_begin = 'B-DISEASE' in label

    if is_begin or 'I-DISEASE' in label:
      if is_piece and words:
        # Continuation piece: glue it to the word it belongs to.
        words[-1] += text
      else:
        if is_begin and not is_piece and words:
          # A fresh B- word closes the entity that was still open.
          entities.append(" ".join(words))
          words = []
        words.append(text)
    elif is_piece and words:
      # Trailing piece of an entity word that did not get an entity label itself:
      # keep it so the word is not cut in half.
      words[-1] += text
    elif words:
      # Outside token: the open entity is complete.
      entities.append(" ".join(words))
      words = []

  # Flush an entity that runs up to the end of the sequence.
  if words:
    entities.append(" ".join(words))
  return entities

# Word-piece tokens of the example text and the labels predicted for them; pass both to
# tok_to_words to see the entities extracted from the example.
t_l = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
p_l = predicted_tokens_classes

# Hand-written (token, label) pairs used as a quick check of tok_to_words.
_demo_pairs = [
  ('di', 'B-DISEASE'),
  ('##arr', 'I-DISEASE'),
  ("##ea", "I-DISEASE"),
  ("po", "B-DISEASE"),
  ("##ly", 'I-DISEASE'),
  ("##ps", "I-DISEASE"),
  ("and", "o"),
]
l1, l2 = (list(column) for column in zip(*_demo_pairs))
print(tok_to_words(l1, l2))

In [None]:
# Run the model on every topic text, collecting per topic its word-piece tokens (l_dec)
# and the label predicted for each of those tokens (l_top_pred).
list_topics = topics['query'].to_list()
l_top_pred = []
l_dec = []
for topic_text in list_topics:
  inputs = tokenizer(topic_text, add_special_tokens=False, return_tensors="pt")

  # Inference only, so gradient tracking is switched off.
  with torch.no_grad():
    logits = model(**inputs).logits
  predicted_token_class_ids = logits.argmax(-1)

  # Note that tokens are classified rather than input words, which means that
  # there might be more predicted token classes than words.
  # Multiple token classes might account for the same word.
  predicted_tokens_classes = [
    model.config.id2label[class_id.item()] for class_id in predicted_token_class_ids[0]
  ]
  l_dec.append(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]))
  l_top_pred.append(predicted_tokens_classes)

In [None]:
# One list of extracted entity strings per topic, built from that topic's tokens and labels.
li_top = [tok_to_words(tokens, labels) for tokens, labels in zip(l_dec, l_top_pred)]

In [None]:
print(li_top)

# Store the extracted entities next to their topic; a topic for which nothing was
# extracted gets NaN instead of an empty list.
topics['DISEASE'] = pd.Series(li_top)
disease_or_nan = topics['DISEASE'].apply(lambda found: found if len(found) != 0 else np.nan)
topics['DISEASE'] = disease_or_nan
topics.head()

In [None]:
# Make sure the output directory exists; to_csv does not create missing directories.
os.makedirs(path_to_save_entities, exist_ok=True)

# Keep the file path in its own variable. Overwriting `path_to_save_entities` made this
# cell non-idempotent: every re-run appended the file name to the path once more.
entities_csv_path = os.path.join(path_to_save_entities, 'dis_chem_bio_bert' + '.csv')
topics.to_csv(entities_csv_path, index=False)