In [None]:
!pip install tensorboardX
!pip install transformers
!pip install jedi
!pip install simpletransformers

In [2]:
# Mount Google Drive at /content/drive; the project folder and data files used
# by the later cells are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import sys
# Put the project folder on the import path.
# NOTE(review): presumably this is where seq2seq_model and utils_metrics
# (imported in a later cell) live — confirm.
sys.path.append('/content/drive/MyDrive/Third-Year-Project')

In [4]:
import torch
# Shown as the cell result: whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

Load Data

In [5]:
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
import sys
from seq2seq_model import Seq2SeqModel
import logging
# Log at INFO level by default, but only WARNING and above for the
# "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
from utils_metrics import get_entities_bio, f1_score, classification_report, precision_score, recall_score
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
import torch
import time
import math
import ast
from string import punctuation

In [None]:
# Download the NLTK 'punkt' and 'averaged_perceptron_tagger' resources
# (prediction() below tokenizes sentences with nltk.word_tokenize).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [7]:
# Locations of the training and development CSV files on the mounted Drive.
train_path = '/content/drive/MyDrive/Third-Year-Project/data/train.csv'
dev_path = '/content/drive/MyDrive/Third-Year-Project/data/dev.csv'

# Read each CSV, turn its rows into plain Python lists, and rebuild a DataFrame
# with the column names "input_text" / "target_text", so the column names used
# below do not depend on the header row stored in the file.
train_data = pd.read_csv(train_path, sep=',').values.tolist()
train_df = pd.DataFrame(train_data, columns=["input_text", "target_text"])

eval_data = pd.read_csv(dev_path, sep=',').values.tolist()
eval_df = pd.DataFrame(eval_data, columns=["input_text", "target_text"])

Model

In [8]:
# Options handed to Seq2SeqModel through `args=` in the next cell.
# NOTE(review): the exact meaning of each key is defined by the Seq2SeqModel
# implementation (imported from seq2seq_model, not shown here) — confirm against
# it before tuning any value.
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 40,
    # The training log below shows 762 steps per epoch for 12183 examples.
    "train_batch_size": 16,
    "num_train_epochs": 5,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    # train_model() below is given eval_df for this reason.
    "evaluate_during_training": True,
    "evaluate_generated_text": True,
    "evaluate_during_training_verbose": True,
    "use_multiprocessing": False,
    "max_length": 25,
    "manual_seed": 4,
    # Larger than the 5 x 762 = 3810 steps of this run; presumably never
    # reached — TODO confirm the intent.
    "save_steps": 11898,
    # "weight_decay": 0.01,
    # "learning_rate":1e-4,
    "gradient_accumulation_steps": 1,
    "output_dir": "./exp/template",
}

In [9]:
# Initialize model
# Builds a Seq2SeqModel of type "bart" from the facebook/bart-large checkpoint,
# configured with model_args; use_cuda=True requests GPU execution.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
    use_cuda=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

In [10]:
# Train the model
# eval_df is passed because model_args sets "evaluate_during_training": True.
model.train_model(train_df, eval_data=eval_df)

  0%|          | 0/12183 [00:00<?, ?it/s]



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/762 [00:00<?, ?it/s]



  0%|          | 0/2128 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/762 [00:00<?, ?it/s]

  0%|          | 0/2128 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/762 [00:00<?, ?it/s]

  0%|          | 0/2128 [00:00<?, ?it/s]

  0%|          | 0/2128 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/762 [00:00<?, ?it/s]

  0%|          | 0/2128 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/762 [00:00<?, ?it/s]

  0%|          | 0/2128 [00:00<?, ?it/s]

In [11]:
# Evaluate the model
# Runs evaluation on the dev set; the return value is stored in `results`
# and is not displayed by this cell.
results = model.eval_model(eval_df)

  0%|          | 0/2128 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/266 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/266 [00:00<?, ?it/s]

In [12]:
# Use the model for prediction
# Sanity check: print the text the trained model generates for one example sentence.

print(model.predict(["There is an increased risk of endometrial cancer in a woman with a uterus who uses unopposed estrogens."]))

['endometrial cancer is an adverse reaction entity']


Inference

In [13]:
class InputExample:
    """Simple record that stores a `words` value and a `labels` value as attributes."""

    def __init__(self, words, labels):
        self.words, self.labels = words, labels

In [14]:
def template_entity(words, input_TXT, start):
    """
    Score candidate spans against the two templates and return the best (span, label) pair.

    Every candidate in `words` is combined with both template suffixes
    (" is an adverse reaction entity ." -> 'ADR', " is not a named entity ." -> 'O').
    Each resulting sentence is scored as a product of the model's token
    probabilities for that sentence given `input_TXT`, and the highest-scoring
    (candidate, template) combination is returned.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Input: A list of candidate text spans, sentence, the index of the start of the span.
        The end index is computed as start + (position of the winning candidate in
        `words`), so `words[k]` is expected to be the span covering k+1 words
        (this is how prediction() builds the list).
    Output: A list: [the start index of the highest scored span, the end index of the highest scored span,
              predicted label for the span, the score of that span]

    NOTE(review): an empty `words` list is not handled; at the latest, max() on the
    empty score list raises ValueError.
    """


    # input text -> template
    words_length = len(words)
    # NOTE(review): words_length_list is never read afterwards.
    words_length_list = [len(i) for i in words]
    # One copy of the sentence per (candidate, template) pair, so every target
    # sentence below has its own source row.
    input_TXT = [input_TXT]*(2*words_length)


    input_ids = tokenizer(input_TXT, return_tensors='pt')['input_ids']
    # NOTE(review): executed on every call; could be done once after loading the model.
    model.to(device)
    template_list = [" is an adverse reaction entity .",  " is not a named entity ."]
    # Index of the template in template_list -> label assigned to the span.
    entity_dict = {0: 'ADR', 1: 'O'}
    temp_list = []

    # Row order: candidate 0 with template 0, candidate 0 with template 1,
    # candidate 1 with template 0, ... so row r belongs to candidate r//2, template r%2.
    for i in range(words_length):

        for j in range(len(template_list)):

            temp_list.append(words[i]+template_list[j])

    output_ids = tokenizer(temp_list, return_tensors='pt', padding=True, truncation=True)['input_ids']
    # Overwrite the first token id of every target row with 2.
    # NOTE(review): presumably the token id BART expects at the start of the
    # decoder input — confirm against the tokenizer/model config.
    output_ids[:, 0] = 2
    # Number of decoder positions that contribute to each row's score.
    output_length_list = [0]*2*words_length


    for i in range(len(temp_list)//2):

        # Token length of the candidate's first ('ADR') template sentence minus 4,
        # used for both rows of the candidate; the 'O' row then gets one extra position.
        # NOTE(review): the -4 / +1 offsets presumably account for special tokens and a
        # token-count difference between the two templates — TODO confirm for these templates.
        base_length = ((tokenizer(temp_list[i * 2], return_tensors='pt', padding=True, truncation=True)['input_ids']).shape)[1] - 4
        output_length_list[i*2:i*2+ 2] = [base_length]*2
        output_length_list[i*2+1] += 1

    # Scores start at 1 and are multiplied by one probability per scored position.
    score = [1]*2*words_length

    with torch.no_grad():

        # The decoder is fed each target row without its last two columns.
        # NOTE(review): assumes element [0] of the model output holds the logits
        # indexed as (row, position, vocabulary) — TODO confirm.
        output = model(input_ids=input_ids.to(device), decoder_input_ids=output_ids[:, :output_ids.shape[1] - 2].to(device))[0]

        for i in range(output_ids.shape[1] - 3):

            # Probability distribution produced at decoder position i, for every row.
            logits = output[:, i, :]
            logits = logits.softmax(dim=1)
            logits = logits.to('cpu').numpy()

            for j in range(0, 2*words_length):

                # Positions at or beyond the row's scored length are skipped.
                if i < output_length_list[j]:

                    # Multiply in the probability assigned to the token that actually
                    # follows in the target row (column i + 1).
                    score[j] = score[j] * logits[j][int(output_ids[j][i + 1])]
    

    # score.index(max(score)) is the winning row; //2 gives the candidate index
    # (= extra words beyond the first), %2 gives the template index.
    end = start+(score.index(max(score))//2)

    return [start, end, entity_dict[(score.index(max(score))%2)], max(score)] #[start_index,end_index,label,score]

In [15]:
def prediction(input_TXT, n_span=3):
    """
    Predict one labelled span per start position of a sentence.

    The sentence is tokenized with nltk.word_tokenize. For every token position,
    the candidate spans starting there are built and passed to template_entity(),
    which returns the best-scoring candidate and its label.

    Input: sentence; n_span, the exclusive upper bound on the span width.
        Candidates have 1 to n_span - 1 words (e.g. n_span = 3 yields 1-grams and
        2-grams), truncated at the end of the sentence. generate_true_label() uses
        the same bound, so the keys of both dictionaries line up.
    Output: A dictionary. The key is (start index, end index) of the chosen span,
        the value is a list: [predicted label, [score], span text].
        Note that the score is wrapped in a one-element list.
    """
    tokens = [tok.strip() for tok in nltk.word_tokenize(input_TXT) if tok.strip() != '']
    n_tokens = len(tokens)

    predict = {}

    for begin in range(n_tokens):
        # Exclusive bound on the width: limited by n_span and by the tokens left.
        width_stop = min(n_span, n_tokens - begin + 1)
        candidates = [' '.join(tokens[begin:begin + width]) for width in range(1, width_stop)]

        # template_entity returns [start_index, end_index, label, score]
        span_start, span_end, label, score = template_entity(candidates, input_TXT, begin)

        predict[(span_start, span_end)] = [label, [score], ' '.join(tokens[span_start:span_end + 1])]

    return predict


In [16]:
def generate_true_label(label_list, n_span):
    """
    Build the gold label ('ADR' or 'O') for every candidate span of a sentence.

    Spans are enumerated exactly as in prediction(): for each start index, the
    widths 1 to n_span - 1 are used (e.g. n_span = 3 yields 1-word and 2-word
    spans), truncated at the end of the sentence.

    Labelling rules:
      * one-word span: 'ADR' only if the word is tagged 'B-ADR' and the next
        word (if any) is not tagged 'I-ADR'; otherwise 'O'.
      * multi-word span: 'ADR' only if the first word is tagged 'B-ADR' and
        every following word in the span is tagged 'I-ADR'; otherwise 'O'.
        Any other tag inside the span (not just 'B-ADR' or 'O') makes it 'O'.

    NOTE(review): unlike the one-word rule, the multi-word rule does not check
    whether the entity continues after the span, so a proper prefix of a longer
    entity is still labelled 'ADR'.

    Input: A list of sentence content, each item indexable as (word, BIO tag);
        n_span, the exclusive upper bound on the span width.
    Output: A dictionary. The key is (start index, end index) of the span.
        The value is [span text, true label of the span].
    """
    true_labels = {}
    n_tokens = len(label_list)

    for i in range(n_tokens):

        for j in range(1, min(n_span, n_tokens - i + 1)):

            span = label_list[i:i + j]
            starts_entity = span[0][1] == 'B-ADR'

            if j == 1:
                # A single word is a full entity only if no 'I-ADR' continues it.
                continued = i + 1 < n_tokens and label_list[i + 1][1] == 'I-ADR'
                is_adr = starts_entity and not continued
                text = span[0][0]
            else:
                # Require an explicit 'I-ADR' on every following word.
                is_adr = starts_entity and all(item[1] == 'I-ADR' for item in span[1:])
                text = ' '.join(item[0] for item in span)

            true_labels[(i, i + j - 1)] = [text, 'ADR' if is_adr else 'O']

    return true_labels




In [17]:
def evaluation(true_list, predict_list, c):
    """
    Compute the overall span-level accuracy of the predictions.

    For each of the first `c` sentences, every predicted span is compared with
    the gold label stored under the same (start, end) key.

    Input: true label list, predict label list, the number of test sentences.
        true_list[i] maps (start, end) -> [span text, true label];
        predict_list[i] maps (start, end) -> [predicted label, ...].
    Output: Accuracy (correct / compared spans); 0.0 when no span was compared.
        The number of correct spans and the number of compared spans are printed.
    Raises: KeyError if a predicted span has no entry in the true labels.
    """
    count = 0
    correct = 0

    for i in range(c):
        gold = true_list[i]

        for key, predicted in predict_list[i].items():

            if predicted[0] == gold[key][1]:
                correct += 1

            count += 1

    print(correct)
    print(count)

    # Guard against an empty evaluation set instead of raising ZeroDivisionError.
    return correct / count if count else 0.0


In [18]:
def p_r_f1_evaluation(true_list, predict_list, c):
    """
    Compute span-level precision, recall and F1 score for the 'ADR' label.

    For each of the first `c` sentences, every predicted span is compared with
    the gold label stored under the same (start, end) key and counted as a
    true/false positive/negative. Any combination other than the three
    explicit ADR cases is counted as a true negative.

    Input: true label list, predict label list, the number of test sentences.
        true_list[i] maps (start, end) -> [span text, true label];
        predict_list[i] maps (start, end) -> [predicted label, ...].
    Output: precision, recall, F1 score. A metric whose denominator is zero
        (e.g. no span predicted as 'ADR') is reported as 0.0.
        The four confusion-matrix counts are printed.
    Raises: KeyError if a predicted span has no entry in the true labels.
    """
    tp = 0
    tn = 0
    fp = 0
    fn = 0

    for i in range(c):
        gold = true_list[i]

        for key, predicted in predict_list[i].items():
            predicted_label = predicted[0]
            true_label = gold[key][1]

            if predicted_label == 'ADR' and true_label == 'ADR':
                tp += 1
            elif predicted_label == 'ADR' and true_label == 'O':
                fp += 1
            elif predicted_label == 'O' and true_label == 'ADR':
                fn += 1
            else:
                tn += 1

    print('True Positive: ', tp)
    print('False Positive: ', fp)
    print('False Negative: ', fn)
    print('True Negative: ', tn)

    # Each ratio falls back to 0.0 instead of raising ZeroDivisionError.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (2 * p * r) / (p + r) if (p + r) else 0.0

    return p, r, f1


In [19]:
# Load the tokenizer and the saved best checkpoint for inference.
# NOTE: this rebinds `model` from the Seq2SeqModel used for training to a plain
# BartForConditionalGeneration; template_entity() reads the module-level
# `tokenizer`, `model` and `device` defined here.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# model = BartForConditionalGeneration.from_pretrained('/content/drive/MyDrive/Training Results/2023-03-20-23-02/outputs/best_model')
model = BartForConditionalGeneration.from_pretrained('/content/outputs/best_model')

model.eval()
model.config.use_cache = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Absolute path, like the train/dev paths, so the cell does not depend on the
# current working directory.
file_path = '/content/drive/MyDrive/Third-Year-Project/data/test.csv' # file for testing

predict_labels = []
true_labels = []
n_span = 3

# pd.read_csv opens the file itself, so no separate open() handle is needed.
test_data = pd.read_csv(file_path).values.tolist()
test_df = pd.DataFrame(test_data, columns=["input_text", "target_text", "BIO_tags"])

c = 0  # number of sentences processed; reused by the evaluation cell
true_list = []
predict_list = []

for index, row in test_df.iterrows():

    # BIO_tags holds the Python-literal text of the (word, tag) list.
    BIO_tag = row['BIO_tags']
    lst = ast.literal_eval(BIO_tag)

    # Progress report every 20 sentences.
    if c % 20 == 0:
        print('Current',c)

    true_list.append(generate_true_label(lst, n_span))

    predict_list.append(prediction(row['input_text'], n_span))

    c += 1
  


Current 0
Current 20
Current 40
Current 60
Current 80
Current 100
Current 120
Current 140
Current 160
Current 180
Current 200
Current 220
Current 240
Current 260
Current 280
Current 300
Current 320
Current 340
Current 360
Current 380
Current 400
Current 420
Current 440
Current 460
Current 480
Current 500
Current 520
Current 540
Current 560
Current 580
Current 600
Current 620
Current 640
Current 660
Current 680
Current 700
Current 720
Current 740
Current 760
Current 780
Current 800
Current 820
Current 840
Current 860
Current 880
Current 900
Current 920


In [20]:
# Span-level accuracy over all predicted spans of the test set.
accuracy = evaluation(true_list, predict_list, c)
print(accuracy)
# The F1 value is bound to `f1`, not `f1_score`, so the f1_score function
# imported from utils_metrics is not overwritten.
precision, recall, f1 = p_r_f1_evaluation(true_list, predict_list, c)
print('p = ',precision, 'r = ',recall, 'f1 = ',f1)

35061
36257
0.967013266403729
True Positive:  4074
False Positive:  724
False Negative:  472
True Negative:  30987
p =  0.8491037932471863 r =  0.8961724593048834 f1 =  0.8720034246575342


Mapping to terminology

In [21]:
import json
import requests

In [22]:
def get_information(data):
    """
    Extract the fields of interest from one annotation object of the JSON response.

    Input: json object (a dict) with the keys 'annotatedProperty' (holding
        'propertyType' and 'propertyValue'), 'confidence', 'semanticTags'
        (an iterable of '/'-separated strings) and 'provenance' -> 'source' -> 'uri'.
    Output: A list: [Ontology type, Ontology value, confidence level, ontology_ids, evidence],
        where ontology_ids holds the last '/'-separated segment of every semantic tag.
    """
    annotated = data['annotatedProperty']
    property_type = annotated['propertyType']
    property_value = annotated['propertyValue']
    confidence_level = data['confidence']

    # Keep only the trailing identifier of each tag.
    tag_ids = []
    for tag in data['semanticTags']:
        tag_ids.append(tag.split('/')[-1])

    source_uri = data['provenance']['source']['uri']

    return [property_type, property_value, confidence_level, tag_ids, source_uri]


In [23]:
def retrieve(txt, n_span, timeout=30):
    """
    Predict the ADR spans of a sentence and look each one up in the ZOOMA annotation service.

    Input: sentence; n_span, passed on to prediction(); timeout, the number of
        seconds to wait for each HTTP request (default 30) so an unresponsive
        service cannot hang the notebook.
    Output: A 2 dimension list. Each row is
        [sentence, predicted ADR span, ontology type, ontology value, confidence,
         ontology id, source]; one row per ontology id returned for a span.
        If the request fails or the service answers with a status other than 200,
        a single row with 'Error' in the last five fields is added for that span.
    """
    url = 'https://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate'
    table_list = []

    # prediction() values are [predicted label, [score], span text].
    for label, _score, words in prediction(txt, n_span).values():

        if label != 'ADR':
            continue

        params = {'propertyValue': [words]}

        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException:
            # Connection problems and timeouts are reported like a bad status code.
            response = None

        if response is not None and response.status_code == 200:
            # Process response data here
            for content in response.json():
                informations = get_information(content)

                # One output row per ontology id attached to the annotation.
                for ontology_id in informations[3]:
                    table_list.append([txt, words, informations[0], informations[1], informations[2], ontology_id, informations[4]])
        else:
            print('Error getting data from API.')
            table_list.append([txt, words, 'Error', 'Error', 'Error', 'Error', 'Error'])

    return table_list


In [24]:
# Load the tokenizer and the saved best checkpoint; template_entity() reads the
# module-level `tokenizer`, `model` and `device` defined here.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# model = BartForConditionalGeneration.from_pretrained('/content/drive/MyDrive/Training Results/2023-03-20-01-04/outputs/best_model')
model = BartForConditionalGeneration.from_pretrained('/content/outputs/best_model')

model.eval()
model.config.use_cache = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Named `input_sentence` rather than `input` so the builtin input() is not shadowed.
input_sentence = "Other ocular adverse reactions occurring in 1-5% of subjects included reduced visual acuity, punctate keratitis, eye inflammation, and iritis."

n_span = 4

# One row per ontology id found for each predicted ADR span.
table_list = retrieve(input_sentence, n_span)

In [25]:
# Column headers for the mapping table; the order matches the rows built by retrieve().
names = ['Text', 'Target term', 'Ontology term type', 'Ontology term value', 'confidence', 'Ontology Id', 'Source']

# Build the table, write it to Mapping_result.csv without the index column,
# and display it as the cell result.
test = pd.DataFrame(columns = names,  data=table_list)
test.to_csv('Mapping_result.csv',index = False)
test

Unnamed: 0,Text,Target term,Ontology term type,Ontology term value,confidence,Ontology Id,Source
0,Other ocular adverse reactions occurring in 1-...,ocular adverse reactions,,Drug-Related Side Effects and Adverse Reactions,MEDIUM,OMIT_0028442,www.ebi.ac.uk/spot/zooma
1,Other ocular adverse reactions occurring in 1-...,ocular adverse reactions,,Assessment of adverse drug reactions (procedure),MEDIUM,396079007,www.ebi.ac.uk/spot/zooma
2,Other ocular adverse reactions occurring in 1-...,reduced visual acuity,disease,reduced visual acuity,HIGH,HP_0007663,www.ebi.ac.uk/spot/zooma
3,Other ocular adverse reactions occurring in 1-...,punctate keratitis,disease,Keratitis,MEDIUM,EFO_0009449,www.ebi.ac.uk/spot/zooma
4,Other ocular adverse reactions occurring in 1-...,punctate keratitis,disease,Autosomal dominant keratitis,MEDIUM,Orphanet_2334,www.ebi.ac.uk/spot/zooma
5,Other ocular adverse reactions occurring in 1-...,punctate keratitis,disease,Autosomal dominant keratitis,MEDIUM,148190,www.ebi.ac.uk/spot/zooma
6,Other ocular adverse reactions occurring in 1-...,punctate keratitis,disease,Autosomal dominant keratitis,MEDIUM,C1835698,www.ebi.ac.uk/spot/zooma
7,Other ocular adverse reactions occurring in 1-...,punctate keratitis,disease,Autosomal dominant keratitis,MEDIUM,MONDO_0007848,www.ebi.ac.uk/spot/zooma
8,Other ocular adverse reactions occurring in 1-...,punctate keratitis,disease,autosomal dominant keratitis,MEDIUM,MONDO_0007848,www.ebi.ac.uk/spot/zooma
9,Other ocular adverse reactions occurring in 1-...,eye inflammation,,eye inflammation,MEDIUM,EFO_0005752,www.ebi.ac.uk/spot/zooma
