In [43]:
import nltk
# nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import json

In [2]:
base_path = "data/data_sample/"
file_name = "164726"

In [23]:
ann_str = None
with open(base_path + file_name + ".ann", "r") as file:
    ann_str = file.read()

In [59]:
def parse_entities_relationships(file_content):
    """
    parses out entities and relationships from the entities file
    """
    entities = {}
    relationships = {}

    for line in file_content.strip().split('\n'):
        cols = line.split('\t')
        identifier = cols[0]

        if identifier.startswith('T'):
            entity_type, start, end = cols[1].split()
            value = cols[2]
            entities[identifier] = {'type': entity_type, 'start': int(start), 'end': int(end), 'value': value}

        elif identifier.startswith('R'):
            relationship_type, arg1, arg2 = cols[1].split()
            relationships[identifier] = {'type': relationship_type, 'arg1': arg1.split(':')[1], 'arg2': arg2.split(':')[1]}

    return entities, relationships

In [60]:
ent, rel = parse_entities_relationships(ann_str)

In [61]:
txt_str = None
with open(base_path + file_name + ".txt", "r") as file:
    txt_str = file.read()

In [87]:
# sentences = sent_tokenize(txt_str)
# print(sentences)

In [84]:
def sentence_start_end(sentences):
    """
    get starting and ending positions of sentences
    """
    sent_positions = []
    position = 0
    for sent in sentences:
        start = position
        end = position + len(sent)
        sent_positions.append((start, end))
        position = end + 1
    return sent_positions

def update_entity_indices(entities, sent_positions):
    """
    for each entity update its position relative to this sentence
    """
    for entity_key in entities:
        entity = entities[entity_key]
        start = entity['start']
        end = entity['end']
        for i, (sent_start, sent_end) in enumerate(sent_positions):
            if(entity_key == "T11"):
                print(end, sent_end)
            if start >= sent_start and end <= sent_end:
                entity['sentence'] = i
                entity['sent_start'] = start - sent_start
                entity['sent_end'] = end - sent_start
                break
    return entities
                
def get_relationship_spans(entities, relationships, txt_str):
    """
    get relationship spans
    """
    sentences = sent_tokenize(txt_str)
    entities = update_entity_indices(entities, sentence_start_end(sentences))
#     print(entities)
    relationship_spans = {}
    processed_sentences = set()

    for rel_id, relationship in relationships.items():
        arg1 = entities[relationship['arg1']]
        arg2 = entities[relationship['arg2']]
        print(relationship['arg1'], arg1,  " 1")
        print(relationship['arg2'], arg2, " 2")
        sent_start = min(arg1['sentence'], arg2['sentence'])
        sent_end = max(arg1['sentence'], arg2['sentence'])

        span = ' '.join(sentences[sent_start:sent_end+1])
        processed_sentences.add(sent_start)
        processed_sentences.add(sent_end)

        if span not in relationship_spans:
            relationship_spans[span] = {
                'relationships': [],
                'entities': {}
            }

        relationship_spans[span]['relationships'].append({
            'id': rel_id,
            'type': relationship['type'],
            'arg1': relationship['arg1'],
            'arg2': relationship['arg2']
        })

        relationship_spans[span]['entities'][relationship['arg1']] = arg1
        relationship_spans[span]['entities'][relationship['arg2']] = arg2

    # Add "NOREL" relationships for sentences with two entities and no relations
    for i, sent in enumerate(sentences):
        if i not in processed_sentences:
            entities_in_sent = [e for e in entities.values() if e['sentence'] == i]
            if len(entities_in_sent) == 2:
                span = sent
                e1, e2 = entities_in_sent
                relationship_spans[span] = {
                    'relationships': [{'id': f'R{i}', 'type': 'NOREL', 'arg1': e1['id'], 'arg2': e2['id']}],
                    'entities': {e1['id']: e1, e2['id']: e2}
                }

    relationship_spans_json = json.dumps(relationship_spans, indent=2)
    print("Relationship Spans (JSON):", relationship_spans_json)

In [85]:
spans = get_relationship_spans(ent, rel, txt_str)

5588 790
5588 1006
5588 1086
5588 1325
5588 1375
5588 1441
5588 1522
5588 1563
5588 1615
5588 1668
5588 1710
5588 1764
5588 1789
5588 1822
5588 1912
5588 2012
5588 2296
5588 2453
5588 2466
5588 2492
5588 2505
5588 2523
5588 2536
5588 2601
5588 2617
5588 2643
5588 2716
5588 2773
5588 2819
5588 2858
5588 2879
5588 2901
5588 2938
5588 2960
5588 2983
5588 3002
5588 3059
5588 3088
5588 3798
5588 3845
5588 3913
5588 4016
5588 4069
5588 4096
5588 4122
5588 4818
5588 4890
5588 5108
5588 5329
5588 5405
5588 5491
5588 5586
5588 5739
5588 5865
5588 5963
5588 6042
5588 6189
5588 6240
5588 6316
5588 6967
5588 7079
5588 7173
5588 7292
5588 7359
5588 7384
5588 7441
T2 {'type': 'Route', 'start': 1376, 'end': 1378, 'value': 'IV', 'sentence': 5, 'sent_start': 0, 'sent_end': 2}  1
T3 {'type': 'Drug', 'start': 1379, 'end': 1384, 'value': 'nitro', 'sentence': 5, 'sent_start': 3, 'sent_end': 8}  2
T6 {'type': 'Reason', 'start': 5420, 'end': 5428, 'value': 'diuresis', 'sentence': 50, 'sent_start': 14, 'sent_

KeyError: 'sentence'

In [39]:
print(spans)

{'IV nitro inititated,\nheparin, aspirin initiated and now painfree.': {'type': 'Route-Drug', 'text': 'IV nitro inititated,\nheparin, aspirin initiated and now painfree.', 'arg1': {'id': 'T2', 'type': 'Route', 'value': 'IV', 'start': 0, 'end': 2}, 'arg2': {'id': 'T3', 'type': 'Drug', 'value': 'nitro', 'start': 3, 'end': 8}}}
