In [1]:
import json
import warnings

import nltk
# nltk.download('punkt')
from nltk.tokenize import sent_tokenize

In [2]:
# Directory containing the sample files (relative path, resolved against the kernel's working directory).
base_path = "../data/data_sample/"
# Shared file stem of the document; the ".ann" and ".txt" suffixes are appended when the files are opened.
file_name = "164726"

In [3]:
# Load the raw annotation (.ann) file for the selected document.
# The encoding is given explicitly so decoding does not depend on the platform's locale default.
with open(base_path + file_name + ".ann", "r", encoding="utf-8") as file:
    ann_str = file.read()

In [5]:
def parse_entities_relationships(file_content):
    """
    parses out entities and relationships from the annotation file content

    Each line is tab-separated. Lines whose ID starts with 'T' are treated as
    entities ("<type> <start> <end>" followed by the covered text) and lines
    whose ID starts with 'R' as relationships ("<type> <role>:<id> <role>:<id>").
    Every other line (blank lines, notes, ...) is ignored.

    An entity may carry several ';'-separated offset fragments
    (e.g. "Drug 10 15;20 25"). Such an entity is reported with the bounding
    span of all its fragments, so 'start'/'end' stay plain ints.

    Args:
        file_content: full text of the annotation file.

    Returns:
        (entities, relationships):
          entities      - {id: {'type', 'start', 'end', 'value'}}
          relationships - {id: {'type', 'arg1', 'arg2'}} where arg1/arg2 are entity IDs.

    Raises:
        ValueError: if an offset is not an integer or a relationship line does
            not have exactly a type and two arguments.
    """
    entities = {}
    relationships = {}

    for line in file_content.strip().split('\n'):
        # Drop a trailing carriage return so CRLF input does not leak into values.
        cols = line.rstrip('\r').split('\t')
        identifier = cols[0]

        if identifier.startswith('T'):
            entity_type, offsets = cols[1].split(None, 1)
            # One "start end" pair per fragment; a contiguous entity has exactly one.
            fragments = [fragment.split() for fragment in offsets.split(';')]
            start = min(int(fragment[0]) for fragment in fragments)
            end = max(int(fragment[1]) for fragment in fragments)
            # Tolerate a missing text column instead of failing on the whole file.
            value = cols[2] if len(cols) > 2 else ''
            entities[identifier] = {'type': entity_type, 'start': start, 'end': end, 'value': value}

        elif identifier.startswith('R'):
            relationship_type, arg1, arg2 = cols[1].split()
            # Arguments look like "<role>:<entity id>"; keep only the entity id.
            relationships[identifier] = {
                'type': relationship_type,
                'arg1': arg1.split(':', 1)[1],
                'arg2': arg2.split(':', 1)[1],
            }

    return entities, relationships

In [6]:
# Parse the raw annotation text into two dicts keyed by annotation ID: entities and relationships.
ent, rel = parse_entities_relationships(ann_str)

In [9]:
# Load the document text that the annotation offsets point into.
# The encoding is given explicitly so decoding does not depend on the platform's locale default.
with open(base_path + file_name + ".txt", "r", encoding="utf-8") as file:
    txt_str = file.read()

In [12]:
# sentences = sent_tokenize(txt_str)
# print(sentences)

In [22]:
def sentence_start_end(sentences, txt_str):
    """
    get starting and ending positions of sentences

    Sentences are expected in document order. Each one is searched for from the
    end of the previously located sentence, so a sentence whose text occurs
    more than once is mapped to its own occurrence rather than to the first one.

    Args:
        sentences: sentence strings taken from txt_str.
        txt_str: the full document text.

    Returns:
        A list with one (start, end) tuple per sentence, in the same order;
        end is exclusive. A sentence that cannot be found anywhere in txt_str
        gets (-1, -1), which no non-negative offset range falls inside.
    """
    sent_positions = []
    cursor = 0  # where the search for the next sentence begins
    for sent in sentences:
        start = txt_str.find(sent, cursor)
        if start == -1:
            # Not found after the cursor (e.g. sentences passed out of order):
            # fall back to a whole-text search and leave the cursor untouched.
            start = txt_str.find(sent)
            if start == -1:
                sent_positions.append((-1, -1))
                continue
            sent_positions.append((start, start + len(sent)))
            continue
        end = start + len(sent)
        sent_positions.append((start, end))
        cursor = end
    return sent_positions

In [24]:
# Split the document into sentences, then compute a (start, end) offset pair in txt_str for each one.
sentences = sent_tokenize(txt_str)
sent_pos = sentence_start_end(sentences, txt_str)

In [30]:
def update_entity_indices(entities, sent_positions):
    """
    attach sentence-relative positions to every entity

    For each entity, the first sentence whose (start, end) range fully contains
    the entity's 'start'..'end' range is selected. The entity dict then gains
    'sentence' (index of that sentence) plus 'sent_start' / 'sent_end' (its
    offsets measured from the sentence start). Entities that fit inside no
    sentence are left unchanged. The dicts are modified in place and the same
    mapping is returned.
    """
    for info in entities.values():
        ent_start, ent_end = info['start'], info['end']
        # First sentence that fully contains the entity, or None.
        hit = next(
            (
                (idx, lo)
                for idx, (lo, hi) in enumerate(sent_positions)
                if lo <= ent_start and ent_end <= hi
            ),
            None,
        )
        if hit is None:
            continue
        idx, lo = hit
        info['sentence'] = idx
        info['sent_start'] = ent_start - lo
        info['sent_end'] = ent_end - lo
    return entities

In [33]:
# Tag each entity with its sentence index and sentence-relative offsets.
# update_entity_indices modifies the entity dicts in place and returns the same mapping.
ent = update_entity_indices(ent, sent_pos)

In [43]:
def get_relationship_spans(entities, relationships, txt_str):
    """
    get relationship spans

    The text is split into sentences and every entity is annotated with its
    sentence (via update_entity_indices, which modifies the entity dicts in
    place). Each relationship is then attached to the text span running from
    the sentence of its earlier argument to the sentence of its later argument,
    with the sentences joined by single spaces.

    A relationship is skipped with a warning, instead of raising KeyError,
    when one of its arguments is not a known entity or was not placed in any
    sentence.

    Args:
        entities: {id: entity dict} with at least 'start' and 'end'.
        relationships: {id: {'type', 'arg1', 'arg2'}} where arg1/arg2 are entity IDs.
        txt_str: the full document text.

    Returns:
        {span text: {'relationships': [{'id', 'type', 'arg1', 'arg2'}, ...],
                     'entities': {entity id: entity dict}}}
        NOTE(review): 'sent_start'/'sent_end' on an entity are relative to the
        entity's own sentence, not to a multi-sentence span.
    """
    sentences = sent_tokenize(txt_str)
    entities = update_entity_indices(entities, sentence_start_end(sentences, txt_str))
    relationship_spans = {}

    for rel_id, relationship in relationships.items():
        arg1 = entities.get(relationship['arg1'])
        arg2 = entities.get(relationship['arg2'])
        if arg1 is None or arg2 is None or 'sentence' not in arg1 or 'sentence' not in arg2:
            warnings.warn(
                f"Skipping relationship {rel_id}: an argument entity is missing "
                f"or could not be located in a sentence"
            )
            continue

        sent_start = min(arg1['sentence'], arg2['sentence'])
        sent_end = max(arg1['sentence'], arg2['sentence'])

        span = ' '.join(sentences[sent_start:sent_end+1])

        # Relationships that share the same span text are grouped together.
        if span not in relationship_spans:
            relationship_spans[span] = {
                'relationships': [],
                'entities': {}
            }

        relationship_spans[span]['relationships'].append({
            'id': rel_id,
            'type': relationship['type'],
            'arg1': relationship['arg1'],
            'arg2': relationship['arg2']
        })

        relationship_spans[span]['entities'][relationship['arg1']] = arg1
        relationship_spans[span]['entities'][relationship['arg2']] = arg2

    return relationship_spans

In [44]:
# Group the relationships, and the entities they reference, by the sentence span covering both arguments.
spans = get_relationship_spans(ent, rel, txt_str)

In [45]:
# Dump the whole span mapping for manual inspection.
print(spans)

{'IV nitro inititated,\nheparin, aspirin initiated and now painfree.': {'relationships': [{'id': 'R1', 'type': 'Route-Drug', 'arg1': 'T2', 'arg2': 'T3'}], 'entities': {'T2': {'type': 'Route', 'start': 1376, 'end': 1378, 'value': 'IV', 'sentence': 5, 'sent_start': 0, 'sent_end': 2}, 'T3': {'type': 'Drug', 'start': 1379, 'end': 1384, 'value': 'nitro', 'sentence': 5, 'sent_start': 3, 'sent_end': 8}}}, 'Aggressive diuresis\nwith IV lasix and diuril were unsuccessful so CVVH was\ninitiated.': {'relationships': [{'id': 'R2', 'type': 'Reason-Drug', 'arg1': 'T6', 'arg2': 'T8'}, {'id': 'R3', 'type': 'Reason-Drug', 'arg1': 'T6', 'arg2': 'T9'}, {'id': 'R4', 'type': 'Route-Drug', 'arg1': 'T7', 'arg2': 'T8'}, {'id': 'R5', 'type': 'Route-Drug', 'arg1': 'T7', 'arg2': 'T9'}], 'entities': {'T6': {'type': 'Reason', 'start': 5420, 'end': 5428, 'value': 'diuresis', 'sentence': 50, 'sent_start': 11, 'sent_end': 19}, 'T8': {'type': 'Drug', 'start': 5437, 'end': 5442, 'value': 'lasix', 'sentence': 50, 'sent_