In [1]:
import os
from tqdm.notebook import tqdm
import glob
import numpy as np
import pandas as pd
import random
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize

# Read files and annotations

In [3]:
#This class is provided by i2b2 challenge. I use its functions to get annotations and text

class RecordTrack2(object):
    """Record for Track 2 class."""

    def __init__(self, file_path):
        """Initialize."""
        self.path = os.path.abspath(file_path)
        self.basename = os.path.basename(self.path)
        self.annotations = self._get_annotations()
        # self.text = self._get_text()

    @property
    def tags(self):
        return self.annotations['tags']

    @property
    def relations(self):
        return self.annotations['relations']

    def _get_annotations(self):
        """Return a dictionary with all the annotations in the .ann file."""
        annotations = defaultdict(dict)
        with open(self.path) as annotation_file:
            lines = annotation_file.readlines()
            for line_num, line in enumerate(lines):
                if line.strip().startswith('T'):
                    try:
                        tag_id, tag_m, tag_text = line.strip().split('\t')
                    except ValueError:
                        print(self.path, line)
                    if len(tag_m.split(' ')) == 3:
                        tag_type, tag_start, tag_end = tag_m.split(' ')
                    elif len(tag_m.split(' ')) == 4:
                        tag_type, tag_start, _, tag_end = tag_m.split(' ')
                    elif len(tag_m.split(' ')) == 5:
                        tag_type, tag_start, _, _, tag_end = tag_m.split(' ')
                    else:
                        print(self.path)
                        print(line)
                    tag_start, tag_end = int(tag_start), int(tag_end)
                    annotations['tags'][tag_id] = (tag_id, tag_start ,tag_end, tag_type, tag_text)
            for line_num, line in enumerate(lines):
                if line.strip().startswith('R'):
                    rel_id, rel_m = line.strip().split('\t')
                    rel_type, rel_arg1, rel_arg2 = rel_m.split(' ')
                    rel_arg1 = rel_arg1.split(':')[1]
                    rel_arg2 = rel_arg2.split(':')[1]
                    arg1 = annotations['tags'][rel_arg1]
                    arg2 = annotations['tags'][rel_arg2]
                    annotations['relations'][rel_id] = (arg1,arg2, rel_type)
        return annotations

    def get_text(self):
        """Return the text in the corresponding txt file."""
        path = self.path.replace('.ann', '.txt')
        with open(path) as text_file:
            text = text_file.read()
        return text


In [4]:
#reading data and create a list of RecordTrack2 class for each file
folder1 = 'training_20180910/'
anno = []
files1 = set([os.path.basename(f) for f in glob.glob(
    os.path.join(folder1, '*.ann'))])
for file in files1:
    anno.append(RecordTrack2(os.path.join(folder1, file)))

# Generate entities within sentence

Find the offset of each entity within that sentence. 

The offset provided by i2b2 is aginst the whole file.

In this section, we will locate the entity, tokenise sentense, and remove extra spaces

The offset I generated in this section is word offset (created by len(text.split()), feel free to change to character offset if needed. 


In [5]:
#function to generate the first entity in a sentence
def first_entity(text, start, end, entity):
        text_before = sent_tokenize(text[:start]) #all the text befere the start of the entity
        sent_num = len(text_before)
        word_before = text_before[-1].split() #words before entity in the current sentence 
        word_start = len(word_before) #start position
        word_end = word_start+len(entity.split()) #end position
        sent_after = sent_tokenize(text[end:])[0] #the left of current sentence
        sent_piece = ' '.join(word_before + entity.split()) #current sentence until the end of first entity, extra spaces removed by .split()
        return word_start, word_end, sent_after, sent_piece

In [6]:
#function to generate non-first entity in a sentence, same logic sa first_entity function
def entity_within_sent(text, word_end_before, end_before, start, end, entity):
    word_between = text[end_before:start].split()
    word_start = word_end_before + len(word_between)
    word_end = word_start+len(entity.split())
    sent_after = sent_tokenize(text[end:])[0]
    #print(sent_after)
    sent_piece = ' '.join(word_between + entity.split())
    return word_start, word_end, sent_after, sent_piece

In [8]:
tags_list = []
sents_dict = {}
for i, an in enumerate(tqdm(anno)):
    text = an.get_text()
    tags = pd.DataFrame(an.tags.values()).sort_values(1).values
    #skip used to mark how many entities we have processed within the previous sentence
    skip = 0
    for j, (en_id, start, end, concept, entity) in enumerate(tags):
        #we skip the entities we have processed from last sentence, it will bring us to the first entity in current sentence
        if skip>0:
            skip -= 1
            continue
        #remove entities that across sentences
        if len(sent_tokenize(text[start:end]))>1:
            continue
        word_start, word_end, sent_after, sent_piece = first_entity(text, start, end, entity)
        #'-'.join([str(i),str(j)]) is an unique idenfitier for each sentence in the dataset, 
        #i identify the document and j identify the sentence within the document
        tags_list.append([i, en_id, word_start, word_end, concept, entity, '-'.join([str(i),str(j)])])
        while j+skip+1<len(tags) and end+len(sent_after)>= tags[j+skip+1][1]:
            anno_curr = tags[j+skip+1]
            word_end_before = word_end
            end_before = end
            sent_piece_before = sent_piece
            en_id = anno_curr[0]
            start = anno_curr[1]
            end = anno_curr[2]
            concept = anno_curr[3]
            entity = anno_curr[4]
            word_start, word_end, sent_after, sent_piece = entity_within_sent(text, word_end_before, end_before, start, end, entity)
            sent_piece = ' '. join([sent_piece_before, sent_piece])
            tags_list.append([i, en_id, word_start, word_end, concept, entity, '-'.join([str(i),str(j)])])
            skip += 1
        sents_dict['-'.join([str(i),str(j)])] = sent_piece

  0%|          | 0/303 [00:00<?, ?it/s]

In [9]:
tags_list

[[0, 'T29', 12, 13, 'Reason', 'headache', '0-0'],
 [0, 'T12', 18, 19, 'Drug', 'ibuprofen', '0-0'],
 [0, 'T31', 3, 4, 'Reason', 'HIV', '0-2'],
 [0, 'T30', 5, 6, 'Drug', 'HAART', '0-3'],
 [0, 'T7', 15, 16, 'Drug', 'Bactrim', '0-4'],
 [0, 'T32', 17, 20, 'Reason', 'Pneumocystis carinii prophylaxis', '0-4'],
 [0, 'T3', 4, 5, 'Drug', 'Bactrim', '0-6'],
 [0, 'T6', 6, 7, 'Drug', 'acyclovir', '0-6'],
 [0, 'T46', 5, 6, 'Drug', 'potassium', '0-8'],
 [0, 'T47', 13, 14, 'Drug', 'creatinine', '0-8'],
 [0, 'T45', 15, 16, 'Drug', 'glucose', '0-8'],
 [0, 'T33', 10, 11, 'Route', 'intravenous', '0-11'],
 [0, 'T22', 11, 12, 'Drug', 'Bactrim', '0-11'],
 [0, 'T15', 13, 14, 'Drug', 'prednisone', '0-11'],
 [0, 'T34', 16, 19, 'Reason', 'possible PCP infection', '0-11'],
 [0, 'T19', 20, 21, 'Drug', 'levofloxacin', '0-11'],
 [0, 'T35', 23, 25, 'Reason', 'community-acquired pneumonia', '0-11'],
 [0, 'T36', 10, 11, 'Drug', 'morphine', '0-17'],
 [0, 'T25', 12, 13, 'Drug', 'pantoprazole', '0-17'],
 [0, 'T5', 14, 15,

In [10]:
sents_dict

{'0-0': 'He developed throbbing frontal occipital headache eyes and dry mouth and the headache was alleviated only slightly with ibuprofen',
 '0-2': 'PAST MEDICAL HISTORY: HIV',
 '0-3': 'He was in no previous HAART',
 '0-4': 'Two weeks ago, CD4 count was 171, his viral load 200,000 and was started on Bactrim for Pneumocystis carinii prophylaxis',
 '0-6': 'MEDICATIONS PRIOR TO ADMISSION: Bactrim , acyclovir',
 '0-8': 'MCV is 85, sodium 136, potassium 3.4, chloride 102, HCO3 25, BUN 9, creatinine 0.9, glucose',
 '0-11': 'Induced sputums were performed and the patient was started on intravenous Bactrim and prednisone for a possible PCP infection and levofloxacin for any community-acquired pneumonia',
 '0-17': 'The patient was also feeling increasingly anxious and was given morphine , pantoprazole , zolpidem , and heparin',
 '0-21': 'The patient was sedated through Fentanyl , lorazepam and also received nicotine',
 '0-25': 'As Bactrim has been described to be associated with adult respirat

In [11]:
entity_df = pd.DataFrame(tags_list, columns=['text_id','entity_id','start','end','concept','entity','sent_num']).set_index('entity_id')

In [12]:
entity_df

Unnamed: 0_level_0,text_id,start,end,concept,entity,sent_num
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T29,0,12,13,Reason,headache,0-0
T12,0,18,19,Drug,ibuprofen,0-0
T31,0,3,4,Reason,HIV,0-2
T30,0,5,6,Drug,HAART,0-3
T7,0,15,16,Drug,Bactrim,0-4
...,...,...,...,...,...,...
T133,302,9,11,Dosage,One (1),302-129
T49,302,11,12,Form,Tablet,302-129
T51,302,12,13,Route,PO,302-129
T50,302,13,16,Frequency,once a day,302-129


In [13]:
'''
This is to generate relations to a specific format for one of my model, a relation triplet is formated as 
"start_position_1st_entity, end_position_1st_entity, start_position_2nd_entity, end_position_2nd_entity,
relation_type, 1st_entity_type, 2nd_entity_type" Each relation is concatenated by all the relation triplets
within the sentence, seperated by '|'
'''

relations = []
sents = []
for i, an in enumerate(tqdm(anno)):
    text = an.get_text()
    relation = ''
    entity_curr = entity_df[entity_df['text_id']==i]#['sent_num']
    curr_sent = 0
    for k in an.relations.values():
        if k[0][0] not in entity_curr.index or k[1][0] not in entity_curr.index:
            continue
        e1, e2, rel = entity_curr.loc[k[0][0]], entity_curr.loc[k[1][0]], k[2]

        if e1['sent_num'] != e2['sent_num']:
            continue
        if curr_sent == 0:
            relation = ' '.join([str(e1['start']), str(e1['end']-1), str(e2['start']), str(e2['end']-1),
                                   '/'+rel+'/'+k[0][3]+'/'+k[1][3], '|'])
            curr_sent = e1['sent_num']
        elif e1['sent_num'] == curr_sent:           
            relation = ' '.join([relation, str(e1['start']), str(e1['end']-1), str(e2['start']), str(e2['end']-1),
                                   '/'+rel+'/'+k[0][3]+'/'+k[1][3], '|'])
        else:
            relations.append(relation[:-2])  
            sents.append(sents_dict[curr_sent])
            relation = ' '.join([str(e1['start']), str(e1['end']-1), str(e2['start']), str(e2['end']-1),
                                   '/'+rel+'/'+k[0][3]+'/'+k[1][3],'|'])
            curr_sent = e1['sent_num']
        if k[0][3] == 'Reason' and k[1][3]=='Drug' and rel == 'ADE-Drug':
            print(k)   



  0%|          | 0/303 [00:00<?, ?it/s]

(('T243', 34234, 34247, 'Reason', 'Hyperglycemia'), ('T241', 34321, 34329, 'Drug', 'steroids'), 'ADE-Drug')
(('T111', 1378, 1385, 'Reason', 'itching'), ('T108', 1367, 1370, 'Drug', 'FFP'), 'ADE-Drug')
(('T119', 5163, 5171, 'Reason', 'GI bleed'), ('T7', 5257, 5265, 'Drug', 'coumadin'), 'ADE-Drug')
(('T119', 5163, 5171, 'Reason', 'GI bleed'), ('T26', 5287, 5294, 'Drug', 'aspirin'), 'ADE-Drug')
(('T179', 13587, 13603, 'Reason', 'low blood counts'), ('T177', 13552, 13562, 'Drug', 'vancomycin'), 'ADE-Drug')
(('T179', 13587, 13603, 'Reason', 'low blood counts'), ('T121', 13566, 13574, 'Drug', 'cefepime'), 'ADE-Drug')
(('T87', 8275, 8288, 'Reason', 'Hyperglycemia'), ('T85', 8314, 8322, 'Drug', 'steroids'), 'ADE-Drug')
(('T113', 6733, 6749, 'Reason', 'supratherapeutic'), ('T40', 6686, 6694, 'Drug', 'coumadin'), 'ADE-Drug')
(('T113', 9636, 9650, 'Reason', 'gastric ulcers'), ('T111', 9674, 9679, 'Drug', 'NSAID'), 'ADE-Drug')
(('T183', 17789, 17811, 'Reason', 'Junctional Bradycardia'), ('T184', 1

In [14]:
relations[154]

'7 8 5 6 /Strength-Drug/Strength/Drug | 9 11 5 6 /Form-Drug/Form/Drug | 13 14 5 6 /Dosage-Drug/Dosage/Drug | 15 17 5 6 /Form-Drug/Form/Drug | 18 18 5 6 /Route-Drug/Route/Drug | 19 21 5 6 /Frequency-Drug/Frequency/Drug | 22 24 5 6 /Duration-Drug/Duration/Drug'

In [15]:
sents[154]

'Disp:*5 Tablet(s) * Refills:*0* 15. potassium chloride 10 mEq Tablet Extended Release Sig: One (1) Tablet Extended Release PO once a day for 5 days'