In [1]:
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
import json
import itertools

In [2]:
base_path = "../../data/test/"
file_name = "164726"

In [3]:
ann_str = None
with open(base_path + file_name + ".ann", "r") as file:
    ann_str = file.read()

In [41]:
def parse_entities_relationships(file_content):
    """
    parses out entities and relationships from the entities file
    """
    entities = {}
    relationships = {}

    for line in file_content.strip().split('\n'):
        cols = line.split('\t')
        identifier = cols[0]

        if identifier.startswith('T'):
            entity_type, start, end = cols[1].split()
            value = cols[2]
            entities[identifier] = {
                'type': entity_type,
                'start': int(start),
                'end': int(end),
                'value': value
            }

        elif identifier.startswith('R'):
            relationship_type, arg1, arg2 = cols[1].split()
            relationships[identifier] = {
                'type': relationship_type,
                'arg1': arg1.split(':')[1],
                'arg2': arg2.split(':')[1]
            }

    return entities, relationships


def sentence_start_end(sentences, txt_str):
    """
    get starting and ending positions of sentences
    """
    sent_positions = []
    for sent in sentences:
        start = txt_str.find(sent)
        end = start + len(sent)
        sent_positions.append((start, end))
    return sent_positions


def update_entity_indices(entities, sent_positions):
    """
    for each entity update its position relative to this sentence
    """
    for entity_key in entities:
        entity = entities[entity_key]
        start = entity['start']
        end = entity['end']
        for i, (sent_start, sent_end) in enumerate(sent_positions):
            if start >= sent_start and end <= sent_end:
                entity['sentence'] = i
                entity['sent_start'] = start - sent_start
                entity['sent_end'] = end - sent_start
                break
    return entities


def get_relationship_spans(entities, relationships, txt_str):
    """
    get relationship spans
    """
    sentences = sent_tokenize(txt_str)
    entities = update_entity_indices(entities, sentence_start_end(sentences, txt_str))

    relationship_spans = {}

    for rel_id, relationship in relationships.items():
        arg1 = entities[relationship['arg1']]
        arg2 = entities[relationship['arg2']]
        if arg1['sentence'] <= arg2['sentence']:
            sent_start = arg1['sentence']
            sent_end = arg2['sentence']
            arg1_start = arg1['sent_start']
            arg1_end = arg1['sent_end']
            offset = len(' '.join(sentences[sent_start: sent_end]))
            arg2_start = offset + arg2['sent_start']
            arg2_end = offset + arg2['sent_end']
        else:
            sent_start = arg2['sentence']
            sent_end = arg1['sentence']
            arg2_start = arg2['sent_start']
            arg2_end = arg2['sent_end']
            offset = len(' '.join(sentences[sent_start: sent_end]))
            arg1_start = offset + arg1['sent_start']
            arg1_end = offset + arg1['sent_end']
        
        span = ' '.join(sentences[sent_start:sent_end+1])

        assert (
            (span[arg1_start:arg1_end] == arg1['value']) and 
            (span[arg2_start:arg2_end] == arg2['value'])
        )

        if span not in relationship_spans:
            relationship_spans[span] = {
                'relationships': {},
                'all_entities': {}
            }

        relationship_spans[span]['relationships'][rel_id] = {
            'type': relationship['type'],
            'arg1': relationship['arg1'],
            'arg1_start': arg1_start,
            'arg1_end': arg1_end,
            'arg2': relationship['arg2'],
            'arg2_start': arg2_start,
            'arg2_end': arg2_end
        }
        if relationship['arg1'] not in relationship_spans[span]['all_entities']:
            ent_data = {
                'start_pos': arg1_start,
                'end_pos': arg1_end,
                'text': arg1['value'],
                'type': arg1['type']
            }
            relationship_spans[span]['all_entities'][relationship['arg1']] = ent_data
        
        if relationship['arg2'] not in relationship_spans[span]['all_entities']:
            ent_data = {
                'start_pos': arg2_start,
                'end_pos': arg2_end,
                'text': arg2['value'],
                'type': arg2['type']
            }
            relationship_spans[span]['all_entities'][relationship['arg2']] = ent_data
    return relationship_spans


def get_uncovered_entity_spans(entities_map, relations, txt_str):
    sentences = sent_tokenize(txt_str)
    updated_entities = update_entity_indices(
        entities_map, sentence_start_end(sentences, txt_str)
    )
    ents_covered = set()
    for v in relations.values():
        ents_covered = ents_covered.union({v['arg1']}).union({v['arg2']})
        
    uncovered_dict = {k:v for k, v in updated_entities.items() if k not in ents_covered}

    spans = {}
    for ent, loc in uncovered_dict.items():
        sent = sentences[loc['sentence']]
        if sent not in spans:
            spans[sent] = {}
            
        ent_data = {
            'start_pos': loc['sent_start'],
            'end_pos': loc['sent_end'],
            'text': loc['value'],
            'type': loc['type']
        }
        spans[sent][ent] = ent_data
    return spans


def get_ent_tags(text, ent_type, ent_text, ent_loc):
    index_tag_map = {}
    tokens = np.array(nltk.word_tokenize(text))
    ent_tokens = nltk.word_tokenize(ent_text)
    possible_loc = np.where(tokens==ent_tokens[0])[0]
    ent_start_loc = possible_loc[np.argmin(
        [abs(ent_loc - len(' '.join(tokens[0:x]))) for x in possible_loc]
    )]
    ent_end_loc = ent_start_loc + len(ent_tokens) - 1
    if len(ent_tokens) == 1:
        index_tag_map[ent_start_loc] = f"S-{ent_type}"
    else:
        for i in range(ent_start_loc, ent_end_loc):
            if i == ent_start_loc:
                index_tag_map[i] = f"B-{ent_type}"
            elif i == ent_end_loc - 1:
                index_tag_map[i] = f"E-{ent_type}"
            else:
                index_tag_map[i] = f"I-{ent_type}"
    return index_tag_map, ent_start_loc, ent_end_loc


def get_token_tags(text, all_ents, all_rels=None):
    tokens = nltk.word_tokenize(text)
    tags = ['O'] * len(tokens)
    tag_dict = {}
    ent_dict = {}
    for eid, val in all_ents.items():
        tmp, s_idx, e_idx = get_ent_tags(
            text, val['type'], val['text'], val['start_pos']
        )
        tag_dict = tag_dict | tmp
        ent_dict[eid] = {
            'start_idx': s_idx,
            'end_idx': e_idx
        }
    for i, t in tag_dict.items():
        tags[i] = t
        
    if all_rels:
        rels = all_rels.copy()
        for _, rdata in rels.items():
            rdata['arg1_start'] = ent_dict[rdata['arg1']]['start_idx']
            rdata['arg1_end'] = ent_dict[rdata['arg1']]['end_idx']
            rdata['arg2_start'] = ent_dict[rdata['arg2']]['start_idx']
            rdata['arg2_end'] = ent_dict[rdata['arg2']]['end_idx']
        return list(zip(tokens, tags)), rels
    else:
        return list(zip(tokens, tags))

In [5]:
txt_str = None
with open(base_path + file_name + ".txt", "r") as file:
    txt_str = file.read()

In [6]:
ent, rel = parse_entities_relationships(ann_str)

In [7]:
rel_spans = get_relationship_spans(ent, rel, txt_str)

In [46]:
rel_spans

{'Aggressive diuresis\nwith IV lasix and diuril were unsuccessful so CVVH was\ninitiated.': {'relationships': {'R2': {'type': 'Reason-Drug',
    'arg1': 'T6',
    'arg1_start': 11,
    'arg1_end': 19,
    'arg2': 'T8',
    'arg2_start': 28,
    'arg2_end': 33},
   'R3': {'type': 'Reason-Drug',
    'arg1': 'T6',
    'arg1_start': 11,
    'arg1_end': 19,
    'arg2': 'T9',
    'arg2_start': 38,
    'arg2_end': 44},
   'R4': {'type': 'Route-Drug',
    'arg1': 'T7',
    'arg1_start': 25,
    'arg1_end': 27,
    'arg2': 'T8',
    'arg2_start': 28,
    'arg2_end': 33}},
  'all_entities': {'T6': {'start_pos': 11,
    'end_pos': 19,
    'text': 'diuresis',
    'type': 'Reason'},
   'T8': {'start_pos': 28, 'end_pos': 33, 'text': 'lasix', 'type': 'Drug'},
   'T9': {'start_pos': 38, 'end_pos': 44, 'text': 'diuril', 'type': 'Drug'},
   'T7': {'start_pos': 25, 'end_pos': 27, 'text': 'IV', 'type': 'Route'}}},
 'At this point she had become hemodynamically unstable\nrequiring pressor support with dopa

In [42]:
uncovered_ent_spans = get_uncovered_entity_spans(ent, rel, txt_str)

In [43]:
uncovered_ent_spans

{'Family\nrefused TPA due to hx of thrombocytopenia.': {'T1': {'start_pos': 15,
   'end_pos': 18,
   'text': 'TPA',
   'type': 'Drug'}},
 'IV nitro inititated,\nheparin, aspirin initiated and now painfree.': {'T4': {'start_pos': 21,
   'end_pos': 28,
   'text': 'heparin',
   'type': 'Drug'},
  'T5': {'start_pos': 30, 'end_pos': 37, 'text': 'aspirin', 'type': 'Drug'}},
 'Her course was\ncomplicated by acute renal failure, acute heart failure which\nultimately led to pressor support.': {'T50': {'start_pos': 95,
   'end_pos': 102,
   'text': 'pressor',
   'type': 'Drug'}}}

In [8]:
for k, v in rel_spans.items():
    print(f"{k}\n")
    print(v)
    break

Aggressive diuresis
with IV lasix and diuril were unsuccessful so CVVH was
initiated.

{'relationships': {'R2': {'type': 'Reason-Drug', 'arg1': 'T6', 'arg1_start': 11, 'arg1_end': 19, 'arg2': 'T8', 'arg2_start': 28, 'arg2_end': 33}, 'R3': {'type': 'Reason-Drug', 'arg1': 'T6', 'arg1_start': 11, 'arg1_end': 19, 'arg2': 'T9', 'arg2_start': 38, 'arg2_end': 44}, 'R4': {'type': 'Route-Drug', 'arg1': 'T7', 'arg1_start': 25, 'arg1_end': 27, 'arg2': 'T8', 'arg2_start': 28, 'arg2_end': 33}}, 'all_entities': {'T6': {'start_pos': 11, 'end_pos': 19, 'text': 'diuresis', 'type': 'Reason'}, 'T8': {'start_pos': 28, 'end_pos': 33, 'text': 'lasix', 'type': 'Drug'}, 'T9': {'start_pos': 38, 'end_pos': 44, 'text': 'diuril', 'type': 'Drug'}, 'T7': {'start_pos': 25, 'end_pos': 27, 'text': 'IV', 'type': 'Route'}}}


In [121]:
v

{'relationships': {'R2': {'type': 'Reason-Drug',
   'arg1': 'T6',
   'arg1_start': 11,
   'arg1_end': 19,
   'arg2': 'T8',
   'arg2_start': 28,
   'arg2_end': 33},
  'R3': {'type': 'Reason-Drug',
   'arg1': 'T6',
   'arg1_start': 11,
   'arg1_end': 19,
   'arg2': 'T9',
   'arg2_start': 38,
   'arg2_end': 44},
  'R4': {'type': 'Route-Drug',
   'arg1': 'T7',
   'arg1_start': 25,
   'arg1_end': 27,
   'arg2': 'T8',
   'arg2_start': 28,
   'arg2_end': 33}},
 'all_entities': {'T6': {'start_pos': 11,
   'end_pos': 19,
   'text': 'diuresis',
   'type': 'Reason'},
  'T8': {'start_pos': 28, 'end_pos': 33, 'text': 'lasix', 'type': 'Drug'},
  'T9': {'start_pos': 38, 'end_pos': 44, 'text': 'diuril', 'type': 'Drug'},
  'T7': {'start_pos': 25, 'end_pos': 27, 'text': 'IV', 'type': 'Route'}}}

In [126]:
for s
spans['all_entity'].keys()

KeyError: 'all_entity'

In [112]:
t = 'Aggressive diuresis\nwith IV lasix and diuril were unsuccessful so CVVH was\ninitiated.'
get_token_tags(t, spans[t]['all_entities'], spans[t]['relationships'])

([('Aggressive', 'O'),
  ('diuresis', 'S-Reason'),
  ('with', 'O'),
  ('IV', 'S-Route'),
  ('lasix', 'S-Drug'),
  ('and', 'O'),
  ('diuril', 'S-Drug'),
  ('were', 'O'),
  ('unsuccessful', 'O'),
  ('so', 'O'),
  ('CVVH', 'O'),
  ('was', 'O'),
  ('initiated', 'O'),
  ('.', 'O')],
 {'R2': {'type': 'Reason-Drug',
   'arg1': 'T6',
   'arg1_start': 1,
   'arg1_end': 1,
   'arg2': 'T8',
   'arg2_start': 4,
   'arg2_end': 4},
  'R3': {'type': 'Reason-Drug',
   'arg1': 'T6',
   'arg1_start': 1,
   'arg1_end': 1,
   'arg2': 'T9',
   'arg2_start': 6,
   'arg2_end': 6},
  'R4': {'type': 'Route-Drug',
   'arg1': 'T7',
   'arg1_start': 3,
   'arg1_end': 3,
   'arg2': 'T8',
   'arg2_start': 4,
   'arg2_end': 4}})

In [65]:
def process_sentences_without_relationships(sentences, sent_positions, entities, relationships, relationship_spans):
    processed_sentences = {entities[rel['arg1']]['sentence'] for rel in relationships.values()}
    processed_sentences.update({entities[rel['arg2']]['sentence'] for rel in relationships.values()})
    for i, (sent_start, sent_end) in enumerate(sent_positions):
        if i not in processed_sentences:
            entities_in_sent = [e for e in entities.values() if e['sentence'] == i]
            if len(entities_in_sent) > 1:
                span = sentences[i]

                if span not in relationship_spans:
                    relationship_spans[span] = {
                        'relationships': [],
                        'entities': {}
                    }

                for j, (e1, e2) in enumerate(zip(entities_in_sent[:-1], entities_in_sent[1:])):
                    print(e1['id'], e2['id'])
                    relationship_spans[span]['relationships'].append({
                        'id': f'NOREL{i}_{j}',
                        'type': 'NOREL',
                        'arg1': e1['id'],
                        'arg2': e2['id']
                    })

                for e in entities_in_sent:
                    relationship_spans[span]['entities'][e['id']] = e
    return relationship_spans

In [66]:
norels = process_sentences_without_relationships(sentences, sent_pos, ent, rel, spans)

In [67]:
norels

{'IV nitro inititated,\nheparin, aspirin initiated and now painfree.': {'relationships': [{'id': 'R1',
    'type': 'Route-Drug',
    'arg1': 'T2',
    'arg2': 'T3'}],
  'entities': {'T2': {'type': 'Route',
    'start': 1376,
    'end': 1378,
    'value': 'IV',
    'sentence': 5,
    'sent_start': 0,
    'sent_end': 2},
   'T3': {'type': 'Drug',
    'start': 1379,
    'end': 1384,
    'value': 'nitro',
    'sentence': 5,
    'sent_start': 3,
    'sent_end': 8}}},
 'Aggressive diuresis\nwith IV lasix and diuril were unsuccessful so CVVH was\ninitiated.': {'relationships': [{'id': 'R2',
    'type': 'Reason-Drug',
    'arg1': 'T6',
    'arg2': 'T8'},
   {'id': 'R3', 'type': 'Reason-Drug', 'arg1': 'T6', 'arg2': 'T9'},
   {'id': 'R4', 'type': 'Route-Drug', 'arg1': 'T7', 'arg2': 'T8'},
   {'id': 'R5', 'type': 'Route-Drug', 'arg1': 'T7', 'arg2': 'T9'}],
  'entities': {'T6': {'type': 'Reason',
    'start': 5420,
    'end': 5428,
    'value': 'diuresis',
    'sentence': 50,
    'sent_start': 11,