In [None]:
import os
from tqdm.notebook import tqdm
import glob
import numpy as np
import pandas as pd
import random
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize

# Read files and annotations

In [2]:
#This class is provided by i2b2 challenge. I use its functions to get annotations and text

class RecordTrack2(object):
    """Record for Track 2 class."""

    def __init__(self, file_path):
        """Initialize."""
        self.path = os.path.abspath(file_path)
        self.basename = os.path.basename(self.path)
        self.annotations = self._get_annotations()
        # self.text = self._get_text()

    @property
    def tags(self):
        return self.annotations['tags']

    @property
    def relations(self):
        return self.annotations['relations']

    def _get_annotations(self):
        """Return a dictionary with all the annotations in the .ann file."""
        annotations = defaultdict(dict)
        with open(self.path) as annotation_file:
            lines = annotation_file.readlines()
            for line_num, line in enumerate(lines):
                if line.strip().startswith('T'):
                    try:
                        tag_id, tag_m, tag_text = line.strip().split('\t')
                    except ValueError:
                        print(self.path, line)
                    if len(tag_m.split(' ')) == 3:
                        tag_type, tag_start, tag_end = tag_m.split(' ')
                    elif len(tag_m.split(' ')) == 4:
                        tag_type, tag_start, _, tag_end = tag_m.split(' ')
                    elif len(tag_m.split(' ')) == 5:
                        tag_type, tag_start, _, _, tag_end = tag_m.split(' ')
                    else:
                        print(self.path)
                        print(line)
                    tag_start, tag_end = int(tag_start), int(tag_end)
                    annotations['tags'][tag_id] = (tag_id, tag_start ,tag_end, tag_type, tag_text)
            for line_num, line in enumerate(lines):
                if line.strip().startswith('R'):
                    rel_id, rel_m = line.strip().split('\t')
                    rel_type, rel_arg1, rel_arg2 = rel_m.split(' ')
                    rel_arg1 = rel_arg1.split(':')[1]
                    rel_arg2 = rel_arg2.split(':')[1]
                    arg1 = annotations['tags'][rel_arg1]
                    arg2 = annotations['tags'][rel_arg2]
                    annotations['relations'][rel_id] = (arg1,arg2, rel_type)
        return annotations

    def get_text(self):
        """Return the text in the corresponding txt file."""
        path = self.path.replace('.ann', '.txt')
        with open(path) as text_file:
            text = text_file.read()
        return text


In [3]:
#reading data and create a list of RecordTrack2 class for each file
folder1 = 'training_20180910/'
anno = []
files1 = set([os.path.basename(f) for f in glob.glob(
    os.path.join(folder1, '*.ann'))])
for file in files1:
    anno.append(RecordTrack2(os.path.join(folder1, file)))

# Generate entities within sentence

Find the offset of each entity within that sentence. 

The offset provided by i2b2 is aginst the whole file.

In this section, we will locate the entity, tokenise sentense, and remove extra spaces

The offset I generated in this section is word offset (created by len(text.split()), feel free to change to character offset if needed. 


In [5]:
#function to generate the first entity in a sentence
def first_entity(text, start, end, entity):
        text_before = sent_tokenize(text[:start]) #all the text befere the start of the entity
        sent_num = len(text_before)
        word_before = text_before[-1].split() #words before entity in the current sentence 
        word_start = len(word_before) #start position
        word_end = word_start+len(entity.split()) #end position
        sent_after = sent_tokenize(text[end:])[0] #the left of current sentence
        sent_piece = ' '.join(word_before + entity.split()) #current sentence until the end of first entity, extra spaces removed by .split()
        return word_start, word_end, sent_after, sent_piece

In [6]:
#function to generate non-first entity in a sentence, same logic sa first_entity function
def entity_within_sent(text, word_end_before, end_before, start, end, entity):
    word_between = text[end_before:start].split()
    word_start = word_end_before + len(word_between)
    word_end = word_start+len(entity.split())
    sent_after = sent_tokenize(text[end:])[0]
    #print(sent_after)
    sent_piece = ' '.join(word_between + entity.split())
    return word_start, word_end, sent_after, sent_piece

In [7]:
tags_list = []
sents_dict = {}
for i, an in enumerate(tqdm(anno)):
    text = an.get_text()
    tags = pd.DataFrame(an.tags.values()).sort_values(1).values
    #skip used to mark how many entities we have processed within the previous sentence
    skip = 0
    for j, (en_id, start, end, concept, entity) in enumerate(tags):
        #we skip the entities we have processed from last sentence, it will bring us to the first entity in current sentence
        if skip>0:
            skip -= 1
            continue
        #remove entities that across sentences
        if len(sent_tokenize(text[start:end]))>1:
            continue
        word_start, word_end, sent_after, sent_piece = first_entity(text, start, end, entity)
        #'-'.join([str(i),str(j)]) is an unique idenfitier for each sentence in the dataset, 
        #i identify the document and j identify the sentence within the document
        tags_list.append([i, en_id, word_start, word_end, concept, entity, '-'.join([str(i),str(j)])])
        while j+skip+1<len(tags) and end+len(sent_after)>= tags[j+skip+1][1]:
            anno_curr = tags[j+skip+1]
            word_end_before = word_end
            end_before = end
            sent_piece_before = sent_piece
            en_id = anno_curr[0]
            start = anno_curr[1]
            end = anno_curr[2]
            concept = anno_curr[3]
            entity = anno_curr[4]
            word_start, word_end, sent_after, sent_piece = entity_within_sent(text, word_end_before, end_before, start, end, entity)
            sent_piece = ' '. join([sent_piece_before, sent_piece])
            tags_list.append([i, en_id, word_start, word_end, concept, entity, '-'.join([str(i),str(j)])])
            skip += 1
        sents_dict['-'.join([str(i),str(j)])] = sent_piece

  0%|          | 0/303 [00:00<?, ?it/s]

In [8]:
tags_list

[[0, 'T107', 4, 5, 'Drug', 'Ca-gluconate', '0-0'],
 [0, 'T108', 6, 8, 'Strength', '10 units', '0-0'],
 [0, 'T110', 9, 10, 'Drug', 'insulin', '0-0'],
 [0, 'T111', 15, 17, 'Strength', '15 mg', '0-0'],
 [0, 'T112', 17, 18, 'Route', 'PO', '0-0'],
 [0, 'T113', 7, 8, 'Drug', 'Potassium', '0-5'],
 [0, 'T114', 10, 11, 'Drug', 'acetaminophen', '0-6'],
 [0, 'T115', 17, 20, 'Drug', 'ibuprofen-containing sleep aid', '0-6'],
 [0, 'T116', 21, 22, 'Reason', 'insomnia', '0-6'],
 [0, 'T129', 14, 15, 'Drug', 'lisinopril', '0-9'],
 [0, 'T40', 16, 17, 'Drug', 'NSAIDs', '0-9'],
 [0, 'T130', 19, 20, 'Drug', 'potassium', '0-9'],
 [0, 'T131', 20, 21, 'Drug', 'citrate', '0-9'],
 [0, 'T132', 5, 6, 'Drug', 'Tamsulosin', '0-13'],
 [0, 'T133', 7, 9, 'Reason', 'his BPH', '0-13'],
 [0, 'T134', 15, 16, 'Drug', 'Potassium', '0-15'],
 [0, 'T135', 21, 22, 'Drug', 'kayexalate', '0-15'],
 [0, 'T136', 23, 24, 'Drug', 'insulin', '0-15'],
 [0, 'T1', 8, 9, 'Drug', 'insulin', '0-18'],
 [0, 'T2', 3, 4, 'Drug', 'Humalog', '0-19'

In [9]:
sents_dict

{'0-0': 'He received 2 grams Ca-gluconate , 10 units of insulin and D50, sodium bicarbonate, and 15 mg PO',
 '0-5': 'He proceeded to have brisk urine output. Potassium',
 '0-6': 'He has had chronic knee pain for which he takes acetaminophen ; he also takes a nightly ibuprofen-containing sleep aid for insomnia',
 '0-9': 'Additionally, he was also on a number of potentially nephrotoxigenic medications causing ATN, including lisinopril , NSAIDs , and potassium citrate',
 '0-13': 'Furthermore, he was started on Tamsulosin for his BPH',
 '0-15': '#Hyperkalemia: He was found to have an elevated K 8 with associated peaked T waves. Potassium levels dropped with administration of kayexalate , insulin',
 '0-18': 'His blood sugars were managed on sliding scale insulin',
 '0-19': 'Since he required Humalog 5-7 units for the last 3 days of hospitalization, he was sent home on Lantus 2units',
 '0-23': 'He was also provided with an educational counseling session regarding home insulin',
 '0-24': 'His

In [10]:
entity_df = pd.DataFrame(tags_list, columns=['text_id','entity_id','start','end','concept','entity','sent_num']).set_index('entity_id')

In [11]:
entity_df

Unnamed: 0_level_0,text_id,start,end,concept,entity,sent_num
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T107,0,4,5,Drug,Ca-gluconate,0-0
T108,0,6,8,Strength,10 units,0-0
T110,0,9,10,Drug,insulin,0-0
T111,0,15,17,Strength,15 mg,0-0
T112,0,17,18,Route,PO,0-0
...,...,...,...,...,...,...
T28,302,26,28,Strength,325 mg,302-14
T29,302,28,30,Frequency,once daily,302-14
T30,302,30,31,Drug,Darvocet,302-14
T31,302,31,32,Frequency,p.r.n,302-14


In [12]:
'''
This is to generate relations to a specific format for one of my model, a relation triplet is formated as 
"start_position_1st_entity, end_position_1st_entity, start_position_2nd_entity, end_position_2nd_entity,
relation_type, 1st_entity_type, 2nd_entity_type" Each relation is concatenated by all the relation triplets
within the sentence, seperated by '|'
'''

relations = []
sents = []
for i, an in enumerate(tqdm(anno)):
    text = an.get_text()
    relation = ''
    entity_curr = entity_df[entity_df['text_id']==i]#['sent_num']
    curr_sent = 0
    for k in an.relations.values():
        if k[0][0] not in entity_curr.index or k[1][0] not in entity_curr.index:
            continue
        e1, e2, rel = entity_curr.loc[k[0][0]], entity_curr.loc[k[1][0]], k[2]

        if e1['sent_num'] != e2['sent_num']:
            continue
        if curr_sent == 0:
            relation = ' '.join([str(e1['start']), str(e1['end']-1), str(e2['start']), str(e2['end']-1),
                                   '/'+rel+'/'+k[0][3]+'/'+k[1][3], '|'])
            curr_sent = e1['sent_num']
        elif e1['sent_num'] == curr_sent:           
            relation = ' '.join([relation, str(e1['start']), str(e1['end']-1), str(e2['start']), str(e2['end']-1),
                                   '/'+rel+'/'+k[0][3]+'/'+k[1][3], '|'])
        else:
            relations.append(relation[:-2])  
            sents.append(sents_dict[curr_sent])
            relation = ' '.join([str(e1['start']), str(e1['end']-1), str(e2['start']), str(e2['end']-1),
                                   '/'+rel+'/'+k[0][3]+'/'+k[1][3],'|'])
            curr_sent = e1['sent_num']
        if k[0][3] == 'Reason' and k[1][3]=='Drug' and rel == 'ADE-Drug':
            print(k)   



  0%|          | 0/303 [00:00<?, ?it/s]

(('T113', 9636, 9650, 'Reason', 'gastric ulcers'), ('T111', 9674, 9679, 'Drug', 'NSAID'), 'ADE-Drug')
(('T179', 13587, 13603, 'Reason', 'low blood counts'), ('T177', 13552, 13562, 'Drug', 'vancomycin'), 'ADE-Drug')
(('T179', 13587, 13603, 'Reason', 'low blood counts'), ('T121', 13566, 13574, 'Drug', 'cefepime'), 'ADE-Drug')
(('T243', 34234, 34247, 'Reason', 'Hyperglycemia'), ('T241', 34321, 34329, 'Drug', 'steroids'), 'ADE-Drug')
(('T118', 13358, 13383, 'Reason', 'red rash on arms and face'), ('T47', 13387, 13397, 'Drug', 'vancomycin'), 'ADE-Drug')
(('T113', 6733, 6749, 'Reason', 'supratherapeutic'), ('T40', 6686, 6694, 'Drug', 'coumadin'), 'ADE-Drug')
(('T59', 5277, 5283, 'Reason', 'rigors'), ('T57', 5263, 5267, 'Drug', 'IL-2'), 'ADE-Drug')
(('T62', 5320, 5331, 'Reason', 'hypotension'), ('T57', 5263, 5267, 'Drug', 'IL-2'), 'ADE-Drug')
(('T87', 8275, 8288, 'Reason', 'Hyperglycemia'), ('T85', 8314, 8322, 'Drug', 'steroids'), 'ADE-Drug')
(('T111', 1378, 1385, 'Reason', 'itching'), ('T108

In [13]:
relations[154]

'6 7 4 5 /Strength-Drug/Strength/Drug | 14 14 4 5 /Form-Drug/Form/Drug | 15 15 4 5 /Route-Drug/Route/Drug'

In [14]:
sents[154]

'Disp:*10 Capsule(s)* Refills:*0* 8. famotidin e 20 mg Tablet [**Month/Year (2) **]: One (1) Tablet PO BID (2 times a day) for 1 days'