In [2]:
import nltk
# nltk.download('punkt')
import numpy as np
from nltk.tokenize import sent_tokenize
import json
import itertools
import re
import os
from collections import defaultdict

In [3]:
# base_path = "../../data/test/"
# file_name = "164726"
base_path = "../../../train/"

In [4]:
all_files = os.listdir(base_path)
filenames = set([name.split(".")[0] for name in all_files])

In [43]:
def parse_entities_relationships(file_content):
    """
    parses out entities and relationships from the entities file
    """
    entities = {}
    relationships = {}

    for line in file_content.strip().split('\n'):
        # madarchod line might be annotator comment
        if line.strip().startswith("#"):
            continue
        cols = line.split('\t')
        identifier = cols[0]
        if "Arg1:" in cols[1]:
            relationship_type, arg1, arg2 = cols[1].split()
            relationships[identifier] = {
                'type': relationship_type,
                'arg1': arg1.split(':')[1],
                'arg2': arg2.split(':')[1]
            }
            
        else:
            split_cols = cols[1].split()
            entity_type = split_cols[0]
            value = cols[2]
            start = split_cols[1]
            end = split_cols[-1]
            entities[identifier] = {
                'type': entity_type,
                'start': int(start),
                'end': int(end),
                'value': value
            }

    return entities, relationships

In [57]:
def get_spans(entities, relationships, txt_str, split_char="\n"):
    """
    first pass through spans
    """
    spans = {}
    for rel in relationships.keys():
        arg1 = entities[relationships[rel]['arg1']]
        arg2 = entities[relationships[rel]['arg2']]
        min_left = min(arg1['start'], arg2['start'])
        max_right = max(arg1['end'], arg2['end'])
        # find dots
        prev_char = txt_str[:min_left].rfind(split_char)+1
        next_char = txt_str[max_right:].find(split_char)+max_right
        # see if no exist
        assert(prev_char != -1)
        assert(next_char != -1)
        # see that span is not insanely large
        if max_right-min_left > 10:
            assert((next_char-prev_char) <= 20*(max_right-min_left))
        # return spans
        spans[rel] = {
            'start':prev_char,
            'end':next_char
        }
    return spans

In [58]:
def get_ents_in_spans(spans, ents):
    span_ent = defaultdict(list)
    for span in spans.keys():
        s = spans[span]
        for ent in ents.keys():
            e = ents[ent]
            if s['start'] <= e['start'] and s['end'] >= e['end']:
                span_ent[span].append(ent)
    return span_ent

In [59]:
ent

{'T1': {'type': 'Drug', 'start': 171, 'end': 179, 'value': 'fentanyl'},
 'T2': {'type': 'Drug', 'start': 182, 'end': 191, 'value': 'midazolam'},
 'T3': {'type': 'Dosage', 'start': 4852, 'end': 4853, 'value': '3'},
 'T4': {'type': 'Drug', 'start': 4854, 'end': 4858, 'value': 'PRBC'},
 'T5': {'type': 'Form', 'start': 4859, 'end': 4871, 'value': 'transfusions'},
 'T6': {'type': 'Form', 'start': 5365, 'end': 5369, 'value': 'unit'},
 'T7': {'type': 'Drug', 'start': 5373, 'end': 5378, 'value': 'pRBCs'},
 'T8': {'type': 'Drug', 'start': 6077, 'end': 6086, 'value': 'lorazepam'},
 'T9': {'type': 'Drug', 'start': 6838, 'end': 6850, 'value': 'Pantoprazole'},
 'T10': {'type': 'Strength', 'start': 6851, 'end': 6856, 'value': '40 mg'},
 'T11': {'type': 'Route', 'start': 6857, 'end': 6859, 'value': 'PO'},
 'T12': {'type': 'Frequency', 'start': 6860, 'end': 6864, 'value': 'Q24H'},
 'T13': {'type': 'Drug', 'start': 6869, 'end': 6881, 'value': 'pantoprazole'},
 'T14': {'type': 'Strength', 'start': 6882,

In [60]:
final_dict = {}
# CHECKING FOR ALL FILES MA CHUDAO
for file_name in filenames:
    print(file_name)
    # read annotations
    ann_str = None
    with open(base_path + file_name + ".ann", "r") as file:
        ann_str = file.read()
    # parse entities and relations
    ent, rel = parse_entities_relationships(ann_str)
    # parse text string
    txt_str = None
    with open(base_path + file_name + ".txt", "r") as file:
        txt_str = file.read()
    # first pass through spans
    spans = get_spans(ent, rel, txt_str, "\n")
    # second pass through spans
    span_ents = get_ents_in_spans(spans, ent)
    # make final relationship dict
    for rel_key in spans.keys():
        span = spans[rel_key]
        span_text = txt_str[span['start']:span['end']]
        # make relationship dict for this span
        rel_dict = {}
        rel_dict["type"] = rel[rel_key]["type"]
        rel_dict["arg1"] = rel[rel_key]["arg1"]
        rel_dict["arg1_start"] = ent[rel[rel_key]["arg1"]]["start"]-span["start"]+1
        rel_dict["arg1_end"] = ent[rel[rel_key]["arg1"]]["end"]-span["start"]+1
        rel_dict["arg2"] = rel[rel_key]["arg2"]
        rel_dict["arg2_start"] = ent[rel[rel_key]["arg2"]]["start"]-span["start"]+1
        rel_dict["arg2_end"] = ent[rel[rel_key]["arg2"]]["end"]-span["start"]+1
        # make ent dict
        ent_dict = {}
        for e in span_ents[rel_key]:
            ent_dict[e] = {}
            ent_dict[e]["start_pos"] = ent[e]["start"]-span["start"]+1
            ent_dict[e]["end_pos"] = ent[e]["end"]-span["start"]+1
            ent_dict[e]["text"] = ent[e]["value"]
            ent_dict[e]["type"] = ent[e]["type"]
        if span_text not in final_dict:
            final_dict[span_text] = {"relationships":{}}
        span_rels = final_dict[span_text]["relationships"]
        
        span_rels[rel_key] = rel_dict
        final_dict[span_text]["all_entities"] = ent_dict

109566
102045
123589
118694
104095
102629
186134
188551
115347
118418
112014
101276
115157
130440
109770
121514
130076
105106
182160
125206
112615
103677
118564
168915
102019
112628
102557
124307
120605
117872
100883
117871
197869
105050
106415
106629
130153
114004
115021
157284
110037
152612
111923
107872
113705
174150
109617
104107
196798
150068
103317
174143
105614
192798
108932
114452
109191
195689
114144
195784
107047
101813
122733
117907
114976
102053
145012
107251
114513
102283
100039
115267
110068
115191
115244
106038
103142
115464
131903
187852
118510
106384
105254
190056
128652
112747
103926
109527
122468
104850
187736
178331
182003
176106
110342
102027
104979
115143
115667
111323
108809
147292
104682
102937
114923
123475
105014
115232
101410
102136
113824
101779
105954
109633
100187
117609
104653
112832
109170
101215
127356
102087
190444
101136
108889
166834
107255
102365
108032
117156
149614
127944
153141
143451
102173
102698
106939
116901
106064
161384
172474
110521
104710

In [61]:
final_dict

{'4. clopidogrel 75 mg Tablet Sig: One (1) Tablet PO once a day.': {'relationships': {'R1': {'type': 'Dosage-Drug',
    'arg1': 'T1',
    'arg1_start': 34,
    'arg1_end': 41,
    'arg2': 'T228',
    'arg2_start': 4,
    'arg2_end': 15},
   'R2': {'type': 'Form-Drug',
    'arg1': 'T2',
    'arg1_start': 42,
    'arg1_end': 48,
    'arg2': 'T228',
    'arg2_start': 4,
    'arg2_end': 15},
   'R3': {'type': 'Route-Drug',
    'arg1': 'T3',
    'arg1_start': 49,
    'arg1_end': 51,
    'arg2': 'T228',
    'arg2_start': 4,
    'arg2_end': 15},
   'R4': {'type': 'Frequency-Drug',
    'arg1': 'T4',
    'arg1_start': 52,
    'arg1_end': 62,
    'arg2': 'T228',
    'arg2_start': 4,
    'arg2_end': 15},
   'R168': {'type': 'Strength-Drug',
    'arg1': 'T229',
    'arg1_start': 16,
    'arg1_end': 21,
    'arg2': 'T228',
    'arg2_start': 4,
    'arg2_end': 15},
   'R169': {'type': 'Form-Drug',
    'arg1': 'T230',
    'arg1_start': 22,
    'arg1_end': 28,
    'arg2': 'T228',
    'arg2_start': 4,
