In [None]:
# for references
#https://spacy.io/usage/processing-pipelines#sourced-components
#https://spacy.io/api/doc
#https://spacy.io/usage/training
#https://spacy.io/usage/saving-loading
# https://stackoverflow.com/questions/69181078/spacy-how-do-you-add-custom-ner-labels-to-a-pre-trained-model


In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by the cells below live under it.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# installing Med7 (GLOVE and roberta embeddings) and it's related libraries
!python -m pip install jedi
!python -m pip install -U wheel pip setuptools pip install spacy==3.4.4 pip install spacy-transformers==1.1.9
!python -m pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl
!python -m pip install https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl

In [None]:
# DO NOT FORGET TO CHANGE THE PATHS AND THE FILE NAMES

# This code reads the BERT data format (tokens, ner_tags, input_ids, attention_mask, labels)
# and converts it to the character-offset format used by the spaCy NER model.
# It generates two Excel files per data split:
#   - one with only the tokens of the discharge summaries that are labelled as entities, and
#   - one with all the tokens of the discharge summaries, whether they are entities or not.

import pandas as pd
def _bio_to_records(tokens, tags, file_id):
  """Turn one document's BIO tags into span records carrying character offsets.

  Character offsets refer to the text obtained with ' '.join(tokens).

  Returns a pair (entity_records, token_records): the first holds one record per
  entity, the second additionally holds one record per "O" token.
  """
  # Offset at which every token starts inside ' '.join(tokens). Computing it per
  # token keeps the offsets right even when a stray "I-" tag is skipped.
  token_offsets = []
  cursor = 0
  for tok in tokens:
    token_offsets.append(cursor)
    cursor += len(tok) + 1

  entity_records = []
  token_records = []
  for l, label in enumerate(tags):
    if label == "O":
      gold_label = label
      end = l + 1
    elif label.startswith("B-"):
      gold_label = label[2:]
      end = l + 1
      # Extend over the following "I-" tokens. Running into the end of the
      # document closes the entity as well (it used to be dropped).
      while end < len(tags) and tags[end].startswith("I-"):
        end += 1
    else:
      # "I-" tokens are consumed by the "B-" token that opens their entity.
      continue

    record = {
      "file_id": file_id,
      'gold_label': gold_label,
      'token_start': l,
      'token_end': end,
      'entity_text': " ".join(tokens[l:end]),
      'ch_start': token_offsets[l],
      'ch_end': token_offsets[end - 1] + len(tokens[end - 1]),
    }
    token_records.append(record)
    if label != "O":
      entity_records.append(record)
  return entity_records, token_records


def preprocess(df, name, path):
  """Convert BIO-tagged documents to character-offset spans and save them to Excel.

  Args:
    df: DataFrame with a 'tokens' column (list of token strings per document) and
      a 'ner_tags' column (list of BIO tags per document, aligned with the tokens).
    name: label of the data split, used in the output file names.
    path: folder prefix (with trailing separator) the Excel files are written to.

  Writes two files: '<path>gold_data_entity_CHoffsetEntitiesOnly_<name>.xlsx'
  (entities only) and '<path>gold_data_entity_CHoffset_all_token_<name>.xlsx'
  (entities plus every "O" token). The document position in df is the file_id.
  """
  columns = ["file_id", 'gold_label', 'token_start', 'token_end',
             'entity_text', 'ch_start', 'ch_end']
  gold_data_entities = []
  gold_data_tokens = []
  for i, (tokens, tags) in enumerate(zip(df.tokens.to_list(), df.ner_tags.to_list())):
    entity_records, token_records = _bio_to_records(tokens, tags, i)
    gold_data_entities.extend(entity_records)
    gold_data_tokens.extend(token_records)

  # Explicit columns keep the positional layout stable even when a list is empty.
  gold_data_entity = pd.DataFrame(gold_data_entities, columns=columns)
  gold_data_token = pd.DataFrame(gold_data_tokens, columns=columns)
  print(len(gold_data_entity), len(gold_data_token))
  print(gold_data_entity)
  print(gold_data_token)

  # this line to save the excel file of entities only
  gold_data_entity.to_excel(path+'gold_data_entity_CHoffsetEntitiesOnly_'+name+'.xlsx', index=False)

  # this line to save the excel file with all tokens
  gold_data_token.to_excel(path+'gold_data_entity_CHoffset_all_token_'+name+'.xlsx', index=False)




# Folder holding the JSON-lines data splits; the Excel outputs are written next to them.
path = "/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/Train-validation-test/August2023/"

# (file stem of the split, progress message), in the order the splits are converted.
split_jobs = (
  ("train_validation_429", "pre-processing train_validation set"),
  ("train_353", "pre-processing training set"),
  ("validation_76", "pre-processing validation set"),
  ("test_76", "pre-processing testing set"),
)

loaded_splits = {}
for split_stem, banner in split_jobs:
  loaded_splits[split_stem] = pd.read_json(path + split_stem + ".json", lines=True)
  print(banner)
  preprocess(loaded_splits[split_stem], split_stem, path)

# Keep every split reachable under the name the notebook has always used for it.
train_validation = loaded_splits["train_validation_429"]
train = loaded_splits["train_353"]
validation = loaded_splits["validation_76"]
test = loaded_splits["test_76"]



In [None]:
# test the results of en_core_med7_trf and en_core_med7_lg on the testing set WITHOUT fine-tuning

# 1- create one discharge summary with its NER labels
# 2- send the summary to Med7 for prediction NER labels
# 3- evaluate the results with Gold standard

import pandas as pd
import spacy


def testing(df, name, path):

  med7 = spacy.load(name)

  str_dic_lg = []
  predict_labels_lg_dic = []
  predict_labels_lg = []
  for i, token in enumerate(df.tokens.to_list()):
    token_str_lg = ' '.join(str(t) for t in token)
    labels_lg = df._get_value(i, 'ner_tags')
    str_dic_lg.append([token_str_lg, labels_lg])
    predicts_lg = []
    entities = med7(token_str_lg)
    for e in entities.ents:
      predict_dic = {}
      predict_dic["predict_file_id"] = i
      predict_dic['predict_label'] = e.label_
      predict_dic['predict_start'] = e.start
      predict_dic['predict_end'] = e.end
      predict_dic['predict_text'] = e.text
      predict_dic['start_char'] = e.start_char
      predict_dic['end_char'] = e.end_char

      #print("e.text", e.text)
      #print("e.label_", e.label_)
      #print('start', e.start)
      #print('end', e.end)
      #print('char_span', e.char_span(e.start_char, e.end_char))
      #print('start_char', e.start_char)
      #print('end_char', e.end_char)
      #print('ent_id', e.ent_id)
      #print('ent_id_', e.ent_id_)
      #print('ents', e.ents[0])
      #print('label', e.label)
      #print('id', e.id)
      #print('id_', e.id_)

      predict_labels_lg_dic.append(predict_dic)
      predicts_lg.append([e.start, e.end, e.text, e.label_])
    print(predict_dic)
    predict_labels_lg.append(predicts_lg)

  print(len(predict_labels_lg))
  print(len(predict_labels_lg_dic))
  predict_label_entity = pd.DataFrame.from_records(predict_labels_lg_dic)
  print(predict_label_entity)

  # uncomment the line below to save the output of MED7 prediction
  predict_label_entity.to_excel(path + 'predict_label_entity_76testDataset_'+name+'.xlsx', index=False)


# Evaluate both off-the-shelf Med7 models on the held-out test split.
path = "/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/Train-validation-test/August2023/"
df = pd.read_json(path+"test_76.json", lines=True)
model = ["en_core_med7_lg", "en_core_med7_trf"]
for name in model:
  print("testing the performance of ", name, " over the testing set", sep="")
  testing(df, name, path)


In [None]:
# this code is to save the results of prediction in suitable format to calculate the confusion matrix of TP, FN, FP, TN
# using type match (at least part of the token text is annotated with the correct entity type)
# and using strict match (the token text and the entity type has to be matched the gold data)
# COR: correct annotation of type
# INC: incorrect annotation of type
# MIS: missing annotation by Med7
# SPU: spurious, i.e. a token predicted by Med7 with an entity label that is not in the gold data

# see this website explains NER evaluation:
# https://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/


import pandas as pd

def _entity_side(row, label=None):
  """Return (file_id, label, ch_start, ch_end, text) read positionally from a table row.

  Columns are taken by position: 0 = file id, 1 = label, 4 = text, 5/6 = character
  start/end. `label` overrides the row's own (lower-cased) label, e.g. with "O".
  """
  shown_label = str(row[1]).lower() if label is None else label
  return (str(row[0]), shown_label, row[5], row[6], str(row[4]))


def _eval_record(true_side, predict_side, strict_label, type_label):
  """Build one row of the evaluation table from a gold side and a predicted side."""
  record = dict(zip(
    ('file_id', 'true_label', 'true_start', 'true_end', 'true_text'), true_side))
  record.update(zip(
    ('predict_file_id', 'predict_label', 'predict_start', 'predict_end', 'predict_text'),
    predict_side))
  record['strict_label'] = strict_label
  record['type_label'] = type_label
  return record


def processing(true_label_entity, predict_label_entity, name, path, suffix=''):
  """Align gold and predicted entities and save one evaluation row per pairing.

  Every gold entity is compared, in table order, with the not-yet-consumed
  predictions of the same file; the first prediction that matches wins:
    * same start, same end and same text -> strict and type are COR when the
      labels agree, INC otherwise; the prediction is consumed;
    * same start and gold text contained in predicted text -> strict INC, type
      COR/INC by label; the prediction is NOT consumed;
    * same end and gold text contained in predicted text -> strict INC, type
      COR/INC by label; the prediction is consumed.
  Gold entities without a match become MIS rows (predicted label "O");
  predictions never consumed become SPU rows (true label 'O').

  Args:
    true_label_entity: gold table; columns are read by position
      (0 file id, 1 label, 4 text, 5 char start, 6 char end).
    predict_label_entity: prediction table with the same positional layout.
    name: model name, used in the output file name.
    path: folder prefix (with trailing separator) for the Excel output.
    suffix: optional text appended to the output file name before '.xlsx'.

  Neither input table is modified. Predictions are grouped by file id, so the
  prediction table does not need to be sorted.
  """
  gold_rows = list(true_label_entity.itertuples(index=False, name=None))
  pred_rows = list(predict_label_entity.itertuples(index=False, name=None))

  # Positions of the predictions of each file, in table order.
  positions_by_file = {}
  for pos, pred in enumerate(pred_rows):
    positions_by_file.setdefault(pred[0], []).append(pos)
  consumed = set()

  Eval = []
  for n, gold in enumerate(gold_rows):
    if n % 500 == 0:
      print("batch: ", n)
    gold_side = _entity_side(gold)
    gold_text = gold_side[4]
    matched = False
    for pos in positions_by_file.get(gold[0], ()):
      if pos in consumed:
        continue
      pred = pred_rows[pos]
      pred_side = _entity_side(pred)
      pred_text = pred_side[4]
      same_start = gold[5] == pred[5]
      same_end = gold[6] == pred[6]
      if same_start and same_end and gold_text == pred_text:
        exact, consume = True, True
      elif same_start and gold_text in pred_text:
        # NOTE(review): this prediction stays available, so it can pair again or
        # end up as SPU; kept as in the original evaluation - confirm intent.
        exact, consume = False, False
      elif same_end and gold_text in pred_text:
        exact, consume = False, True
      else:
        continue

      type_label = 'COR' if gold_side[1] == pred_side[1] else 'INC'
      strict_label = type_label if exact else 'INC'
      Eval.append(_eval_record(gold_side, pred_side, strict_label, type_label))
      if consume:
        consumed.add(pos)
      matched = True
      # One pairing per gold entity: a second match used to raise KeyError.
      break

    if not matched:
      # The predicted side mirrors the gold entity (including its own file id).
      Eval.append(_eval_record(gold_side, _entity_side(gold, label="O"), 'MIS', 'MIS'))

  for pos, pred in enumerate(pred_rows):
    if pos not in consumed:
      Eval.append(_eval_record(_entity_side(pred, label='O'), _entity_side(pred), 'SPU', 'SPU'))

  true_predict_eval = pd.DataFrame.from_records(Eval)
  print(len(true_predict_eval))
  # DO NOT FORGET TO CHANGE PATH AND FILE NAME
  true_predict_eval.to_excel(path+'true_predict_label_entity_76testDataset_'+name+suffix+'.xlsx', index=False)


# Pair the gold entities of the test split with each model's saved predictions.
path = "/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/Train-validation-test/August2023/"

gold_file = path+'gold_data_entity_CHoffsetEntitiesOnly_test_76.xlsx'
true_label_entity = pd.read_excel(gold_file)
print(true_label_entity)

model = ["en_core_med7_lg", "en_core_med7_trf"]
for name in model:
  print("processing the output of ", name, " predictions over the testing set", sep="")
  # e.g. predict_label_entity_76testDataset_en_core_med7_trf.xlsx
  prediction_file = path + 'predict_label_entity_76testDataset_'+name+'.xlsx'
  predict_label_entity = pd.read_excel(prediction_file)
  print(predict_label_entity)
  processing(true_label_entity, predict_label_entity, name, path)




In [None]:

# This code is to calculate P, R, F1 scores based on matching the entity type between reference and candidate.
# support in the output is the number of reference per entity type.

import pandas as pd
from sklearn.metrics import classification_report
def class_report(y_true, y_pred, path, name):
  """Print and save two sklearn classification reports for the given labels.

  The first report covers all nine entity labels; the second leaves out
  'reason' and 'ade' and covers the remaining seven. Both reports are written,
  one after the other, to
  '<path>prediction_results_without_fine-tuning_<name>.txt'.

  Args:
    y_true: list of gold labels.
    y_pred: list of predicted labels, aligned with y_true.
    path: folder prefix (with trailing separator) for the text file.
    name: model name, used in the output file name.
  """
  all_labels = ['reason', 'ade', 'form', 'strength', 'dosage', 'drug', 'route', 'frequency', 'duration']
  # Same list without the 'reason' and 'ade' labels.
  reduced_labels = [label for label in all_labels if label not in ('reason', 'ade')]

  # The context manager closes the file even if a report cannot be computed.
  with open(path+"prediction_results_without_fine-tuning_"+name+".txt", 'w') as file:
    for labels in (all_labels, reduced_labels):
      print(type(labels), labels)
      report = classification_report(y_true, y_pred, labels = labels)
      print(report)
      file.write(report)


  #return report


# Score each model's gold/prediction pairing produced by the previous cell.
path = "/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/Train-validation-test/August2023/"
model = ["en_core_med7_lg", "en_core_med7_trf"]
for name in model:
  print("caluculating P, R, F1 scores of ", name, " predictions over the testing set", sep="")
  # e.g. true_predict_label_entity_76testDataset_en_core_med7_trf.xlsx
  pairing_file = path+'true_predict_label_entity_76testDataset_'+name+'.xlsx'
  eval_report_all = pd.read_excel(pairing_file)
  print("76 Test dataset "+name)
  eval_report_copy_all = eval_report_all.copy()
  y_true = eval_report_copy_all['true_label'].to_list()
  y_pred = eval_report_copy_all['predict_label'].to_list()
  print('type matching')
  class_report(y_true, y_pred, path, name)



In [None]:
# this code is to fine-tune Med7 with two versions
#en_core_med7_trf en_core_med7_lg

import random
import pandas as pd
import spacy
from spacy import util
from spacy.tokens import Doc
from spacy.training import Example
from spacy.tokens import DocBin
from spacy.language import Language


def customizing_pipeline_component(path, nlp: Language, offset, name, n_iter=30):
    """Continue training a loaded pipeline on character-offset annotations, then
    save it and its predictions on the test split.

    Args:
        path: folder prefix (with trailing separator) used for the loss log, the
            saved model, the test data ('test_76.json') and the prediction file.
        nlp: loaded spaCy pipeline to update.
        offset: list of [text, [(start_char, end_char, label), ...]] items.
        name: model name, used in log messages and output names.
        n_iter: number of passes over the training data (default 30).

    Side effects: appends to '<path>loss_log.txt', saves the model to
    '<path><name>_plus' and writes
    '<path>test_76_<name>_fine_tuned_<n_iter>iterations.xlsx'.
    """
    #optimizer = nlp.create_optimizer()
    optimizer = nlp.resume_training()
    print(type(offset), len(offset))
    print("   Training ...")

    # Shuffle a private copy so the caller's list keeps its order.
    training_data = list(offset)

    # Append instead of truncating: the same log file is used for every model,
    # so opening it with "w" erased the log of the previously trained model.
    with open(path+"loss_log.txt", "a") as log:
        log.write("post-training " + name + " for " + str(n_iter) + " iterations\n")
        for iteration in range(n_iter):
            print("iteration: " + str(iteration))
            random.shuffle(training_data)
            losses = {}
            for raw_text, entity_offsets in training_data: # character indexes
                doc = nlp.make_doc(raw_text)
                example = Example.from_dict(doc, {"entities": entity_offsets})
                nlp.update([example], sgd=optimizer, losses=losses)
            print(iteration, losses)
            log.write("iteration: " + str(iteration) + " " + str(losses) + "\n")

    # save the post-trained model
    nlp.to_disk(path + name +"_plus")

    # Result after training
    print("Result AFTER training:")
    df = pd.read_json(path + "test_76.json", lines=True)

    columns = ["predict_file_id", 'predict_label', 'predict_start', 'predict_end',
               'predict_text', 'start_char', 'end_char']
    predict_labels_lg_dic = []
    for i, token in enumerate(df.tokens.to_list()):
        token_str_lg = ' '.join(str(t) for t in token)
        doc = nlp(token_str_lg)
        for e in doc.ents:
            predict_labels_lg_dic.append({
                "predict_file_id": i,
                'predict_label': e.label_,
                'predict_start': e.start,
                'predict_end': e.end,
                'predict_text': e.text,
                'start_char': e.start_char,
                'end_char': e.end_char,
            })
        print('file_id:', i)

    print(len(predict_labels_lg_dic))
    # Explicit columns keep the table layout intact even when nothing was predicted.
    predict_label_entity = pd.DataFrame(predict_labels_lg_dic, columns=columns)
    print(predict_label_entity)

    # save the output of the prediction after fine-tuning MED7
    predict_label_entity.to_excel(path + 'test_76_'+name+'_fine_tuned_'+str(n_iter)+'iterations.xlsx', index=False)

def main():
    """Build the character-offset training data and post-train both Med7 models.

    Reads 'train_validation_429.json' (tokens per document) and the matching
    entities-only Excel file, rebuilds every document text by joining its tokens
    with single spaces, attaches that document's (start_char, end_char, label)
    annotations, and passes the result to customizing_pipeline_component for
    "en_core_med7_lg" and then "en_core_med7_trf".
    """
    input_dir = "/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/Train-validation-test/August2023/"
    print("read data")
    documents = pd.read_json(input_dir + "train_validation_429.json", lines=True)
    gold_entities = pd.read_excel(input_dir + 'gold_data_entity_CHoffsetEntitiesOnly_train_validation_429.xlsx')

    print("processing data")
    # Group the annotations by document once instead of filtering the whole table
    # for every document. Apart from 'file_id', columns are read by position:
    # 1 = label, 5 = char start, 6 = char end.
    annotations_by_file = {}
    rows = gold_entities.itertuples(index=False, name=None)
    for file_id, row in zip(gold_entities['file_id'], rows):
        annotations_by_file.setdefault(file_id, []).append(
            (int(row[5]), int(row[6]), row[1]))  # (start, end, label)

    input_annotations_all = []
    for doc_id, token in enumerate(documents.tokens.to_list()):
        # create one discharge summary with its NER labels
        token_str_lg = ' '.join(str(t) for t in token)
        input_annotations_all.append([token_str_lg, annotations_by_file.get(doc_id, [])])

    # Remove a model name from this tuple to skip fine-tuning it.
    for model_name in ("en_core_med7_lg", "en_core_med7_trf"):
        print("post-training " + model_name)
        med7 = spacy.load(model_name)
        customizing_pipeline_component(input_dir, med7, input_annotations_all, model_name)

if __name__ == '__main__':
    main()

In [None]:
# same code as above
# this code is to save the results of prediction in suitable format to calculate the confusion matrix of TP, FN, FP, TN
# using type match (at least part of the token text is annotated with the correct entity type)
# and using strict match (the token text and the entity type has to be matched the gold data)


import pandas as pd

def _entity_side(row, label=None):
  """Return (file_id, label, ch_start, ch_end, text) read positionally from a table row.

  Columns are taken by position: 0 = file id, 1 = label, 4 = text, 5/6 = character
  start/end. `label` overrides the row's own (lower-cased) label, e.g. with "O".
  """
  shown_label = str(row[1]).lower() if label is None else label
  return (str(row[0]), shown_label, row[5], row[6], str(row[4]))


def _eval_record(true_side, predict_side, strict_label, type_label):
  """Build one row of the evaluation table from a gold side and a predicted side."""
  record = dict(zip(
    ('file_id', 'true_label', 'true_start', 'true_end', 'true_text'), true_side))
  record.update(zip(
    ('predict_file_id', 'predict_label', 'predict_start', 'predict_end', 'predict_text'),
    predict_side))
  record['strict_label'] = strict_label
  record['type_label'] = type_label
  return record


def processing(true_label_entity, predict_label_entity, name, path,
               suffix='_fine_tuned_30iterations'):
  """Align gold and predicted entities of a fine-tuned model and save the pairing.

  Every gold entity is compared, in table order, with the not-yet-consumed
  predictions of the same file; the first prediction that matches wins:
    * same start, same end and same text -> strict and type are COR when the
      labels agree, INC otherwise; the prediction is consumed;
    * same start and gold text contained in predicted text -> strict INC, type
      COR/INC by label; the prediction is NOT consumed;
    * same end and gold text contained in predicted text -> strict INC, type
      COR/INC by label; the prediction is consumed.
  Gold entities without a match become MIS rows (predicted label "O");
  predictions never consumed become SPU rows (true label 'O').

  Args:
    true_label_entity: gold table; columns are read by position
      (0 file id, 1 label, 4 text, 5 char start, 6 char end).
    predict_label_entity: prediction table with the same positional layout.
    name: model name, used in the output file name.
    path: folder prefix (with trailing separator) for the Excel output.
    suffix: text appended to the output file name before '.xlsx'; pass another
      value when the number of fine-tuning iterations is not 30.

  Neither input table is modified. Predictions are grouped by file id, so the
  prediction table does not need to be sorted.
  """
  gold_rows = list(true_label_entity.itertuples(index=False, name=None))
  pred_rows = list(predict_label_entity.itertuples(index=False, name=None))

  # Positions of the predictions of each file, in table order.
  positions_by_file = {}
  for pos, pred in enumerate(pred_rows):
    positions_by_file.setdefault(pred[0], []).append(pos)
  consumed = set()

  Eval = []
  for n, gold in enumerate(gold_rows):
    if n % 500 == 0:
      print("batch: ", n)
    gold_side = _entity_side(gold)
    gold_text = gold_side[4]
    matched = False
    for pos in positions_by_file.get(gold[0], ()):
      if pos in consumed:
        continue
      pred = pred_rows[pos]
      pred_side = _entity_side(pred)
      pred_text = pred_side[4]
      same_start = gold[5] == pred[5]
      same_end = gold[6] == pred[6]
      if same_start and same_end and gold_text == pred_text:
        exact, consume = True, True
      elif same_start and gold_text in pred_text:
        # NOTE(review): this prediction stays available, so it can pair again or
        # end up as SPU; kept as in the original evaluation - confirm intent.
        exact, consume = False, False
      elif same_end and gold_text in pred_text:
        exact, consume = False, True
      else:
        continue

      type_label = 'COR' if gold_side[1] == pred_side[1] else 'INC'
      strict_label = type_label if exact else 'INC'
      Eval.append(_eval_record(gold_side, pred_side, strict_label, type_label))
      if consume:
        consumed.add(pos)
      matched = True
      # One pairing per gold entity: a second match used to raise KeyError.
      break

    if not matched:
      # The predicted side mirrors the gold entity (including its own file id).
      Eval.append(_eval_record(gold_side, _entity_side(gold, label="O"), 'MIS', 'MIS'))

  for pos, pred in enumerate(pred_rows):
    if pos not in consumed:
      Eval.append(_eval_record(_entity_side(pred, label='O'), _entity_side(pred), 'SPU', 'SPU'))

  true_predict_eval = pd.DataFrame.from_records(Eval)
  print(len(true_predict_eval))
  # the default suffix encodes 30 fine-tuning iterations; pass `suffix` if that number changed
  true_predict_eval.to_excel(path+'true_predict_label_entity_76testDataset_'+name+suffix+'.xlsx', index=False)


# Pair the gold entities of the test split with each fine-tuned model's predictions.
path = "/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/Train-validation-test/August2023/"

gold_file = path+'gold_data_entity_CHoffsetEntitiesOnly_test_76.xlsx'
true_label_entity = pd.read_excel(gold_file)
print(true_label_entity)

model = ["en_core_med7_lg", "en_core_med7_trf"]
for name in model:
  print("processing the output of ", name, " predictions over the testing set", sep="")

  # The file name encodes the number of fine-tuning iterations:
  # adjust it here if the model was trained with a number other than 30.
  prediction_file = path + 'test_76_'+name+'_fine_tuned_30iterations.xlsx'
  predict_label_entity = pd.read_excel(prediction_file)
  print(predict_label_entity)
  processing(true_label_entity, predict_label_entity, name, path)





In [None]:
# same code as above
# This code is to calculate P, R, F1 scores based on matching the entity type between reference and candidate.
# support in the output is the number of reference per entity type.

# DO NOT FORGET TO CHANGE THE PATHS AND THE FILE NAMES

import pandas as pd
from sklearn.metrics import classification_report


def class_report(y_true, y_pred, path, name):
  """Print and save two type-matching classification reports for one model.

  The first report covers all nine entity labels; the second leaves out
  'reason' and 'ade'. Each report is printed and then written, one after the
  other, to ``path + "prediction_results_after_fine-tuning_" + name + ".txt"``.

  Args:
    y_true: sequence of reference labels.
    y_pred: sequence of predicted labels, aligned with ``y_true``.
    path: output folder prefix; it is concatenated as-is, so it must end
      with a path separator.
    name: model name, used as part of the output file name.

  Returns:
    None.
  """
  labels = ['reason', 'ade', 'form', 'strength', 'dosage', 'drug', 'route', 'frequency', 'duration']
  # Label set for the second report: everything except 'reason' and 'ade'.
  labels_without_reason_ade = [label for label in labels if label not in ('reason', 'ade')]

  # The context manager closes the file even when computing a report raises.
  with open(path+"prediction_results_after_fine-tuning_"+name+".txt", 'w') as report_file:
    for report_labels in (labels, labels_without_reason_ade):
      print(type(report_labels), report_labels)
      report = classification_report(y_true, y_pred, labels = report_labels)
      print(report)
      # The report is a single string, so write() is the matching call.
      report_file.write(report)


# Folder that holds the per-model evaluation spreadsheets; the text reports
# produced by class_report() are written to the same folder.
path = "/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/Train-validation-test/August2023/"
model = ["en_core_med7_lg", "en_core_med7_trf"]
for name in model:
  print(f"caluculating P, R, F1 scores of {name} predictions over the testing set AFTER fine-tuning on 429 n2c2-2018 training set with 30 iterations")
  # The file name encodes the number of fine-tuning iterations (30 here).
  evaluation_file = f"{path}true_predict_label_entity_76testDataset_{name}_fine_tuned_30iterations.xlsx"
  eval_report_all = pd.read_excel(evaluation_file)
  print(f"76 Test dataset {name}")
  # Labels are taken from a copy of the frame that was read from disk.
  eval_report_copy_all = eval_report_all.copy()
  y_true, y_pred = (
    eval_report_copy_all[column].tolist()
    for column in ('true_label', 'predict_label')
  )
  print('type matching')
  class_report(y_true, y_pred, path, name)



In [None]:
# All code below is for pre-processing Brat annotations (NER).
# Skip it if you don't need it.

In [None]:
# Run format_convertor2.py from the brat2CoNLL project once per data split
# (train, valid, test). Each call passes the split folder as --input_dir and
# asks for the result at output/output.txt inside that same folder.
# NOTE(review): presumably converts Brat annotations to CoNLL format, judging
# by the project name — confirm against format_convertor2.py.
# NOTE(review): the following cells read output.xlsx, not output.txt, so the
# text output presumably has to be converted to Excel first — TODO confirm.
!python /content/drive/MyDrive/Reasearch_Assistantship/HILO_project/brat2CoNLL-main/brat2CoNLL/format_convertor2.py --input_dir=/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/train --output_file=/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/train/output/output.txt
!python /content/drive/MyDrive/Reasearch_Assistantship/HILO_project/brat2CoNLL-main/brat2CoNLL/format_convertor2.py --input_dir=/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/valid --output_file=/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/valid/output/output.txt
!python /content/drive/MyDrive/Reasearch_Assistantship/HILO_project/brat2CoNLL-main/brat2CoNLL/format_convertor2.py --input_dir=/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/test --output_file=/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/test/output/output.txt

In [None]:
# Edit pre-process
import pandas as pd

# Rows whose 'Unnamed: 2' column holds "O" are treated as carrying two tokens
# (first and second column) that share the label found in the third column.
# Each such row is replaced by two single-token rows, and the result is saved
# to output2.xlsx.
df = pd.read_excel('/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/train/output/output.xlsx')
print(df.columns)

df_copy = df.copy()
line = df_copy.index[df_copy['Unnamed: 2'] == "O"].tolist()
print(len(df_copy))
print(len(line))

# Collect the replacement rows first and build the frame once; concatenating
# inside the loop re-copies the growing frame on every iteration.
split_rows = []
split_index = []
for row_id in line:
  # `line` holds index labels, so look the row up by label.
  tmp = df_copy.loc[row_id]
  # Explicit positional access: integer keys on a label-indexed Series are
  # deprecated as positional lookups.
  first_token, second_token, shared_label = tmp.iloc[0], tmp.iloc[1], tmp.iloc[2]
  for offset, token in enumerate((first_token, second_token)):
    split_rows.append({'Token' : token, 'Label' : shared_label, 'Unnamed: 2' : pd.NA})
    # Same index labels as before: row_id for the first token, row_id+1 for the second.
    split_index.append(row_id + offset)
df2 = pd.DataFrame(split_rows, columns = ['Token', 'Label', 'Unnamed: 2'], index=split_index)
print(len(df2))

# Debug peek at one specific row; skipped when the frame has no such label
# instead of raising KeyError.
debug_row_label = 29490
if debug_row_label in df_copy.index:
  print(df_copy.loc[[debug_row_label]])

df_copy = df_copy.drop(line)
print(len(df_copy))

# Show the first two generated rows as a sanity check (fewer if none exist).
for preview_position in range(min(2, len(df2))):
  print(df2.iloc[preview_position])

# https://huggingface.co/course/chapter7/2
# NOTE(review): the split rows are placed before all untouched rows rather
# than at their original positions — confirm this ordering is intended.
df3 = pd.concat([df2, df_copy], axis=0)
print(df3)
df3.to_excel('/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/train/output/output2.xlsx', index=False)


In [None]:
# Split all discharge summaries into separate files, one file per discharge summary.

# A discharge summary starts at a row whose second column contains ".txt";
# every row up to the next such row is written to that summary's own Excel
# file. Rows before the first ".txt" row are not written anywhere.
# read_excel() has no `index` argument (that is a to_excel() option), so it
# is not passed here.
df = pd.read_excel('/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/valid/output/output.xlsx')
print(len(df))
print(df.columns)

df_copy = df.copy()

# Row positions where a new summary starts. astype(str) keeps empty (NaN)
# cells from breaking the substring test; regex=False matches ".txt" literally.
is_summary_start = df_copy.iloc[:, 1].astype(str).str.contains(".txt", regex=False)
line = [position for position, flag in enumerate(is_summary_start) if flag]
for position in line:
  print(df_copy.index[position], df_copy.iloc[position, 0], df_copy.iloc[position, 1])
print(len(line))
print(line)

# Pair every start with the next one; the last summary runs to the end of the frame.
output_dir = '/content/drive/MyDrive/Reasearch_Assistantship/HILO_project/n2c2_2018/valid/output/'
boundaries = line + [len(df_copy)]
for start, end in zip(boundaries, boundaries[1:]):
  df_slice = df_copy.iloc[start:end, :]
  # The "Label" value of the slice's first row names the output file.
  txt_ind = df_slice["Label"].tolist()
  print(txt_ind[0])
  df_slice.to_excel(output_dir+txt_ind[0]+'.xlsx', index=False)