In [1]:
import os
from tqdm.notebook import tqdm
import glob
import numpy as np
import pandas as pd
import random
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize

# Read files and annotations

In [2]:
# This class is provided by the i2b2 challenge. Its methods are used below to read the annotations and the text.

class RecordTrack2(object):
    """Annotations of one Track 2 document, read from a ``.ann`` file.

    The file is parsed once, at construction time. Parsed data is exposed
    through :attr:`tags` and :attr:`relations`; the raw document text is read
    on demand with :meth:`get_text`.
    """

    def __init__(self, file_path):
        """Parse the annotation file at ``file_path``.

        Args:
            file_path: path (relative or absolute) to the ``.ann`` file.
        """
        self.path = os.path.abspath(file_path)
        self.basename = os.path.basename(self.path)
        self.annotations = self._get_annotations()
        # self.text = self._get_text()

    @property
    def tags(self):
        """Dict ``tag_id -> (tag_id, start, end, tag_type, tag_text)``."""
        return self.annotations['tags']

    @property
    def relations(self):
        """Dict ``rel_id -> (arg1_tag, arg2_tag, rel_type)``.

        ``arg1_tag`` and ``arg2_tag`` are the tuples stored in :attr:`tags`.
        """
        return self.annotations['relations']

    def _get_annotations(self):
        """Return a dictionary with all the annotations in the .ann file.

        Lines starting with ``T`` are read as tags and lines starting with
        ``R`` as relations. A line that cannot be parsed (wrong number of
        tab/space separated fields, non-integer offsets, or a relation whose
        argument is not a known tag) is printed together with the file path
        and skipped, so one bad line never stores stale or undefined values.
        """
        annotations = defaultdict(dict)
        with open(self.path) as annotation_file:
            lines = annotation_file.readlines()

        # First pass: tags. All tags must be known before relations are
        # resolved, because a relation may precede its arguments in the file.
        for line in lines:
            stripped = line.strip()
            if not stripped.startswith('T'):
                continue
            try:
                tag_id, tag_m, tag_text = stripped.split('\t')
                tag_fields = tag_m.split(' ')
                if len(tag_fields) < 3:
                    raise ValueError('expected "<type> <start> ... <end>"')
                # A span may be given in several fragments
                # ("type s1 e1;s2 e2 ..."); keep the first start and the
                # last end, whatever the number of fragments.
                tag_type = tag_fields[0]
                tag_start, tag_end = int(tag_fields[1]), int(tag_fields[-1])
            except ValueError:
                print(self.path, line)
                continue
            annotations['tags'][tag_id] = (tag_id, tag_start, tag_end, tag_type, tag_text)

        # Second pass: relations, resolved against the tags collected above.
        for line in lines:
            stripped = line.strip()
            if not stripped.startswith('R'):
                continue
            try:
                rel_id, rel_m = stripped.split('\t')
                rel_type, rel_arg1, rel_arg2 = rel_m.split(' ')
                # Arguments look like "Arg1:T12"; keep the tag id only.
                arg1 = annotations['tags'][rel_arg1.split(':')[1]]
                arg2 = annotations['tags'][rel_arg2.split(':')[1]]
            except (ValueError, IndexError, KeyError):
                print(self.path, line)
                continue
            annotations['relations'][rel_id] = (arg1, arg2, rel_type)
        return annotations

    def get_text(self):
        """Return the text in the corresponding txt file.

        The text file is expected next to the annotation file, with the same
        name and a ``.txt`` extension. Only the extension is swapped, so a
        directory name that happens to contain ``.ann`` is left untouched.
        """
        path = os.path.splitext(self.path)[0] + '.txt'
        with open(path) as text_file:
            text = text_file.read()
        return text


In [3]:
# Read the data: build one RecordTrack2 per .ann file of the training folder.
# The file names are sorted so that the position of a document in `anno`
# (used downstream as the document index) is the same on every run; iterating
# over a set of strings gives no such guarantee between interpreter sessions.
folder1 = 'training_20180910/'
files1 = sorted(os.path.basename(f) for f in glob.glob(
    os.path.join(folder1, '*.ann')))
anno = [RecordTrack2(os.path.join(folder1, file)) for file in files1]

# Generate entities within sentence

Find the offset of each entity within that sentence. 

The offsets provided by i2b2 are relative to the whole file.

In this section, we locate each entity, tokenise the text into sentences, and remove extra spaces.

The offsets generated in this section are word offsets (computed with `len(text.split())`); feel free to change them to character offsets if needed.


In [4]:
def first_entity(text, start, end, entity):
    """Locate the first entity of a sentence in word offsets.

    Args:
        text: full document text.
        start: character offset in ``text`` where the entity starts.
        end: character offset in ``text`` where the entity ends.
        entity: entity text.

    Returns:
        A tuple ``(word_start, word_end, sent_after, sent_piece)``:
        ``word_start`` is the number of whitespace-separated words that
        precede the entity in its sentence, ``word_end`` is ``word_start``
        plus the number of words of the entity, ``sent_after`` is the first
        sentence found in the text that follows the entity (``''`` when there
        is none), and ``sent_piece`` is the sentence up to and including the
        entity, with words joined by single spaces.
    """
    # Sentences found in everything before the entity; the last one is the
    # beginning of the sentence the entity belongs to. The tokenizer can
    # return an empty list (e.g. when nothing precedes the entity), in which
    # case the entity is the first word of its sentence.
    text_before = sent_tokenize(text[:start])
    word_before = text_before[-1].split() if text_before else []
    entity_words = entity.split()
    word_start = len(word_before)
    word_end = word_start + len(entity_words)
    # Remainder of the current sentence; empty when the entity closes the text.
    sents_after = sent_tokenize(text[end:])
    sent_after = sents_after[0] if sents_after else ''
    # .split() followed by ' '.join() collapses any run of whitespace.
    sent_piece = ' '.join(word_before + entity_words)
    return word_start, word_end, sent_after, sent_piece

In [5]:
def entity_within_sent(text, word_end_before, end_before, start, end, entity):
    """Locate a non-first entity of a sentence in word offsets.

    Same logic as ``first_entity``, except that the position is computed
    relative to the previous entity of the sentence instead of the sentence
    start.

    Args:
        text: full document text.
        word_end_before: word offset at which the previous entity ends.
        end_before: character offset in ``text`` where the previous entity ends.
        start: character offset in ``text`` where this entity starts.
        end: character offset in ``text`` where this entity ends.
        entity: entity text.

    Returns:
        A tuple ``(word_start, word_end, sent_after, sent_piece)``:
        the word offsets of the entity, the first sentence found in the text
        that follows it (``''`` when there is none), and the words between the
        previous entity and the end of this one, joined by single spaces.
    """
    word_between = text[end_before:start].split()
    entity_words = entity.split()
    word_start = word_end_before + len(word_between)
    word_end = word_start + len(entity_words)
    # The tokenizer can return an empty list when the entity closes the text.
    sents_after = sent_tokenize(text[end:])
    sent_after = sents_after[0] if sents_after else ''
    sent_piece = ' '.join(word_between + entity_words)
    return word_start, word_end, sent_after, sent_piece

In [6]:
# Build two structures from every document in `anno`:
#   tags_list  - one row per entity:
#                [doc index, entity id, word start, word end, concept, entity text, sentence key]
#   sents_dict - sentence key -> sentence text, from the sentence start up to
#                the last entity collected for that sentence
# The sentence key is "<doc index>-<position of the sentence's first entity in
# the sorted tag array>".
tags_list = []
sents_dict = {}
for i, an in enumerate(tqdm(anno)):
    text = an.get_text()
    # Tag tuples are (tag_id, start, end, type, text); column 1 is the start
    # offset, so the entities are visited in order of appearance.
    tags = pd.DataFrame(an.tags.values()).sort_values(1).values
    # skip counts how many of the upcoming entities were already handled by the
    # while-loop below as part of the previous sentence
    skip = 0
    for j, (en_id, start, end, concept, entity) in enumerate(tags):
        # skip the entities already processed with the previous sentence; this
        # brings us to the first entity of the current sentence
        if skip>0:
            skip -= 1
            continue
        # drop an entity whose own text spans several sentences
        # (this check is applied to the first entity of a sentence only)
        if len(sent_tokenize(text[start:end]))>1:
            continue
        word_start, word_end, sent_after, sent_piece = first_entity(text, start, end, entity)
        # '-'.join([str(i),str(j)]) is a unique identifier for each sentence in the dataset:
        # i identifies the document and j identifies the sentence within the document
        tags_list.append([i, en_id, word_start, word_end, concept, entity, '-'.join([str(i),str(j)])])
        # Keep absorbing the following entities while the next one starts no
        # later than the end of the current entity plus the length of the
        # remaining sentence text, i.e. while it is taken to lie in this sentence.
        while j+skip+1<len(tags) and end+len(sent_after)>= tags[j+skip+1][1]:
            anno_curr = tags[j+skip+1]
            # remember where the previous entity ended (word and character
            # offsets) and the sentence text accumulated so far
            word_end_before = word_end
            end_before = end
            sent_piece_before = sent_piece
            en_id = anno_curr[0]
            start = anno_curr[1]
            end = anno_curr[2]
            concept = anno_curr[3]
            entity = anno_curr[4]
            word_start, word_end, sent_after, sent_piece = entity_within_sent(text, word_end_before, end_before, start, end, entity)
            # extend the accumulated sentence with the words up to this entity
            sent_piece = ' '. join([sent_piece_before, sent_piece])
            # j is still the index of the sentence's first entity, so every
            # entity of the sentence shares the same sentence key
            tags_list.append([i, en_id, word_start, word_end, concept, entity, '-'.join([str(i),str(j)])])
            skip += 1
        sents_dict['-'.join([str(i),str(j)])] = sent_piece

  0%|          | 0/303 [00:00<?, ?it/s]

In [7]:
# Preview the first rows only; the full list holds one row per entity of the
# whole corpus and is too long to display.
tags_list[:20]

[[0, 'T63', 4, 5, 'Drug', 'ampicillin', '0-0'],
 [0, 'T64', 5, 6, 'Strength', '2gm', '0-0'],
 [0, 'T65', 6, 7, 'Route', 'IV', '0-0'],
 [0, 'T66', 9, 10, 'Drug', 'gentamicin', '0-0'],
 [0, 'T67', 10, 11, 'Strength', '80mg', '0-0'],
 [0, 'T68', 11, 12, 'Route', 'IV', '0-0'],
 [0, 'T104', 6, 7, 'Drug', 'anti-biotic', '0-6'],
 [0, 'T105', 4, 9, 'Reason', 'fevers at home of 103.8', '0-7'],
 [0, 'T34', 11, 12, 'Drug', 'tylenol', '0-7'],
 [0, 'T69', 9, 10, 'Drug', 'levo', '0-9'],
 [0, 'T29', 11, 12, 'Drug', 'flagyl', '0-9'],
 [0, 'T70', 5, 6, 'Route', 'iv', '0-11'],
 [0, 'T101', 6, 7, 'Drug', 'rehydration', '0-11'],
 [0, 'T102', 8, 9, 'Route', 'iv', '0-11'],
 [0, 'T71', 9, 10, 'Drug', 'abx', '0-11'],
 [0, 'T72', 3, 4, 'Drug', 'IVF', '0-15'],
 [0, 'T73', 5, 6, 'Drug', 'levo', '0-15'],
 [0, 'T74', 7, 8, 'Drug', 'flagyl', '0-15'],
 [0, 'T121', 8, 10, 'Drug', 'O2 sat', '0-18'],
 [0, 'T106', 13, 16, 'Drug', 'third generation cephalosporins', '0-19'],
 [0, 'T20', 18, 20, 'Drug', 'Trimethoprim /Sulf

In [8]:
# Preview the first sentences only instead of displaying the whole dictionary.
dict(list(sents_dict.items())[:10])

{'0-0': 'He was pretreated with ampicillin 2gm IV , and gentamicin 80mg IV',
 '0-6': 'He was NOT discharged on any anti-biotic',
 '0-7': 'c/o fatigue, with reported fevers at home of 103.8 --pt took tylenol',
 '0-9': 'wcc is 20. pt was pancultured and started on levo and flagyl',
 '0-11': 'pt was therefore admitted for iv rehydration , iv abx',
 '0-15': 'Blood cx sent. IVF given, levo , flagyl',
 '0-18': 'T 102.0(R); hr 57; BP 104/45; rr 21 O2 sat',
 '0-19': 'For serious infections, repeat culture and sensitivity testing may therefore be warranted if third generation cephalosporins',
 '0-20': 'For serious infections, repeat culture and sensitivity testing may therefore be warranted if third generation cephalosporins were used. Trimethoprim /Sulfa',
 '0-21': 'SENSITIVITIES: MIC expressed in MCG/ML _________________________________________________________ ENTEROBACTER CLOACAE | CEFEPIME -------------- <=1 S CEFTAZIDIME ----------- <=1 S CEFTRIAXONE ----------- <=1 S CIPROFLOXACIN -------

In [81]:
import json
# Save the sentence lookup (sentence key -> sentence text) as JSON.
# The `with` line has to stay active together with its indented body:
# commenting out only the `with` leaves an orphan indented statement, which is
# an IndentationError.
with open("sents_dict.txt", "w") as fp:
    json.dump(sents_dict, fp)  # encode dict into JSON

In [9]:
# One row per entity, indexed by entity id. Entity ids come from the individual
# annotation files, so rows are filtered on text_id before any lookup by id.
entity_columns = ['text_id', 'entity_id', 'start', 'end', 'concept', 'entity', 'sent_num']
entity_df = pd.DataFrame(tags_list, columns=entity_columns)
entity_df = entity_df.set_index('entity_id')

In [10]:
# List the distinct concept labels found among the extracted entities.
entity_df['concept'].unique()

array(['Drug', 'Strength', 'Route', 'Reason', 'Duration', 'ADE', 'Dosage',
       'Form', 'Frequency'], dtype=object)

In [85]:
#entity_df.to_csv('entity.csv', sep=',', encoding='utf-8')

In [61]:
# Encode the relations of every sentence in the format expected by one of my
# models. A relation triplet is written as
#   "start_1st_entity end_1st_entity start_2nd_entity end_2nd_entity /relation_type/1st_entity_type/2nd_entity_type"
# (positions are word offsets within the sentence) and all the triplets of one
# sentence are concatenated, separated by ' | '.
# `relations[n]` is the encoded string of the sentence whose text is `sents[n]`.

relations = []
sents = []
for i, an in enumerate(tqdm(anno)):
    # A document without relations has nothing to encode; its relation table
    # would be empty and have no column 1 to sort on.
    if not an.relations:
        continue
    entity_curr = entity_df[entity_df['text_id'] == i]

    # One row per relation: column 0 = first argument tag, column 1 = second
    # argument tag, column 2 = relation type. Rows are ordered by the start
    # offset of the second argument so relations of a sentence are contiguous.
    relation_df = pd.DataFrame.from_dict(an.relations, orient='index')
    relation_df['head_entity'] = relation_df[1].apply(lambda x: int(x[1]))
    relation_df = relation_df.sort_values(['head_entity']).drop('head_entity', axis=1)

    curr_sent = None  # key of the sentence whose triplets are being collected
    triplets = []     # encoded triplets of curr_sent
    for k in relation_df.values:
        # Ignore relations involving an entity that was dropped upstream.
        if k[0][0] not in entity_curr.index or k[1][0] not in entity_curr.index:
            continue
        e1, e2, rel = entity_curr.loc[k[0][0]], entity_curr.loc[k[1][0]], k[2]

        # Only relations inside a single sentence can be encoded.
        if e1['sent_num'] != e2['sent_num']:
            continue

        # Entering a new sentence: flush what was collected for the previous one.
        if curr_sent is not None and e1['sent_num'] != curr_sent:
            relations.append(' | '.join(triplets))
            sents.append(sents_dict[curr_sent])
            triplets = []
        curr_sent = e1['sent_num']
        triplets.append(' '.join([str(e1['start']), str(e1['end']), str(e2['start']), str(e2['end']),
                                  '/'+rel+'/'+k[0][3]+'/'+k[1][3]]))

    # Flush the last sentence of the document, if any relation was kept.
    if triplets:
        relations.append(' | '.join(triplets))
        sents.append(sents_dict[curr_sent])



  0%|          | 0/303 [00:00<?, ?it/s]

In [62]:
# Look at the encoded relation string of one example sentence.
relations[6]

'1 3 8 9 /Reason-Drug/Reason/Drug | 1 3 11 13 /Reason-Drug/Reason/Drug'

In [64]:
import re
# Raw string: the backslash of \d must reach the regex engine unchanged;
# in a normal string literal '\d' is an invalid escape sequence.
print(re.findall(r'\d+', relations[6]))

['1', '3', '8', '9', '1', '3', '11', '13']


In [65]:
# Decode the example relation string back into a list of
# [(start_1, end_1), (start_2, end_2), relation_type] entries.
i_relation = relations[6].split("| ")
sample = []
for triplet in i_relation:
    fields = triplet.split(" ")
    first_span = (int(fields[0]), int(fields[1]))
    second_span = (int(fields[2]), int(fields[3]))
    # fields[4] looks like "/relation_type/type_1/type_2"; keep the relation type.
    relation_type = fields[4].split("/")[1]
    sample.append([first_span, second_span, relation_type])
print(sample)

[[(1, 3), (8, 9), 'Reason-Drug'], [(1, 3), (11, 13), 'Reason-Drug']]


In [66]:
# Words of the sentence at the same index as the relation string above;
# the relation positions are indices into this word list.
sents[6].split()

['5)',
 'h/o',
 'CAD',
 ':',
 'Patient',
 'is',
 'on',
 'an',
 'aspirin',
 'and',
 'a',
 'beta',
 'blocker']

In [67]:
def _parse_relation_string(relation_string):
    """Decode an encoded relation string into a list of relation entries.

    Args:
        relation_string: triplets separated by "| ", each of the form
            "start_1 end_1 start_2 end_2 /relation_type/type_1/type_2".

    Returns:
        A list of ``[(start_1, end_1), (start_2, end_2), relation_type]``.
    """
    parsed = []
    for triplet in relation_string.split("| "):
        fields = triplet.split(" ")
        parsed.append([
            (int(fields[0]), int(fields[1])),
            (int(fields[2]), int(fields[3])),
            fields[4].split("/")[1],
        ])
    return parsed


# relations[n] describes sents[n]; the two lists must stay aligned.
assert len(sents) == len(relations), "sents and relations are out of sync"

# One sample per sentence: [list of words, list of decoded relations].
relation_data = [
    [sentence.split(), _parse_relation_string(relation_string)]
    for sentence, relation_string in zip(sents, relations)
]

In [68]:
# Check one assembled sample: [words of the sentence, decoded relations].
relation_data[6]

[['5)',
  'h/o',
  'CAD',
  ':',
  'Patient',
  'is',
  'on',
  'an',
  'aspirin',
  'and',
  'a',
  'beta',
  'blocker'],
 [[(1, 3), (8, 9), 'Reason-Drug'], [(1, 3), (11, 13), 'Reason-Drug']]]

In [69]:
import json

# Write the assembled samples to disk as JSON (tuples are stored as JSON arrays).
with open("relation.txt", "w") as relation_file:
    json.dump(relation_data, relation_file)