In [1]:
import glob
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer


import json
import re

In [44]:
def parse_entities_relationships(file_content):
    """
    parses out entities and relationships from the entities file
    """
    entities = {}
    relationships = {}

    for line in file_content.strip().split('\n'):
        cols = line.split('\t')
        identifier = cols[0]

        if "Arg1:" in cols[1]:
            relationship_type, arg1, arg2 = cols[1].split()
            relationships[identifier] = {
                'type': relationship_type,
                'arg1': arg1.split(':')[1],
                'arg2': arg2.split(':')[1]
            }
            
        else:
            split_cols = cols[1].split()
            entity_type = split_cols[0]
            value = cols[2]
            start = split_cols[1]
            end = split_cols[-1]
            entities[identifier] = {
                'type': entity_type,
                'start': int(start),
                'end': int(end),
                'value': value
            }
    return entities, relationships


def sentence_start_end(txt_str):
    """
    get starting and ending positions of sentences
    """
    sent_tokenizer = PunktSentenceTokenizer(txt_str)
    sentences = sent_tokenizer.tokenize(txt_str)
    curr_start = 0
    sent_positions = []
    for i in range(len(sentences)):
        sent = sentences[i]
        match_idx = txt_str.find(sent, curr_start)
        start = match_idx
        end = match_idx + len(sent)
        curr_start = end
        sent_positions.append((start, end))
    return sent_positions, sentences


def update_entity_indices(entities, sent_positions):
    """
    for each entity update its position relative to this sentence
    """
    for entity_key in entities:
        entity = entities[entity_key]
        start = entity['start']
        end = entity['end']
        for i, (sent_start, sent_end) in enumerate(sent_positions):
            if start >= sent_start and end <= sent_end:
                entity['sentence'] = i
                entity['sent_start'] = start - sent_start
                entity['sent_end'] = end - sent_start
                break
        assert 'sentence' in entity, f"{entity_key}"
    return entities


def get_relationship_spans(entities, relationships, txt_str):
    """
    get relationship spans
    """
    entities = update_entity_indices(entities, sentence_start_end(txt_str)[0])

    relationship_spans = {}

    for rel_id, relationship in relationships.items():
        arg1 = entities[relationship['arg1']]
        arg2 = entities[relationship['arg2']]
        if arg1['sentence'] <= arg2['sentence']:
            sent_start = arg1['sentence']
            sent_end = arg2['sentence']
            arg1_start = arg1['sent_start']
            arg1_end = arg1['sent_end']
            offset = len(' '.join(sentences[sent_start: sent_end]))
            arg2_start = offset + arg2['sent_start']
            arg2_end = offset + arg2['sent_end']
        else:
            sent_start = arg2['sentence']
            sent_end = arg1['sentence']
            arg2_start = arg2['sent_start']
            arg2_end = arg2['sent_end']
            offset = len(' '.join(sentences[sent_start: sent_end]))
            arg1_start = offset + arg1['sent_start']
            arg1_end = offset + arg1['sent_end']
        
        span = ' '.join(sentences[sent_start:sent_end+1])
        
#         print(len(span[arg1_start:arg1_end]), len(arg1['value']))
#         print(span[arg1_start:arg1_end], arg1['value'])
#         print(len(span[arg2_start:arg2_end]), len(arg2['value']))
#         print(span[arg2_start:arg2_end], arg2['value'])
#         assert (
#             (span[arg1_start:arg1_end] == arg1['value']) and 
#             (span[arg2_start:arg2_end] == arg2['value'])
#         )

        if span not in relationship_spans:
            relationship_spans[span] = {
                'relationships': {},
                'all_entities': {}
            }

        relationship_spans[span]['relationships'][rel_id] = {
            'type': relationship['type'],
            'arg1': relationship['arg1'],
            'arg1_start': arg1_start,
            'arg1_end': arg1_end,
            'arg2': relationship['arg2'],
            'arg2_start': arg2_start,
            'arg2_end': arg2_end
        }
        if relationship['arg1'] not in relationship_spans[span]['all_entities']:
            ent_data = {
                'start_pos': arg1_start,
                'end_pos': arg1_end,
                'text': arg1['value'],
                'type': arg1['type']
            }
            relationship_spans[span]['all_entities'][relationship['arg1']] = ent_data
        
        if relationship['arg2'] not in relationship_spans[span]['all_entities']:
            ent_data = {
                'start_pos': arg2_start,
                'end_pos': arg2_end,
                'text': arg2['value'],
                'type': arg2['type']
            }
            relationship_spans[span]['all_entities'][relationship['arg2']] = ent_data
    return relationship_spans


def get_uncovered_entity_spans(entities_map, relations, txt_str):
    updated_entities = update_entity_indices(
        entities_map, sentence_start_end(txt_str)[0]
    )
    ents_covered = set()
    for v in relations.values():
        ents_covered = ents_covered.union({v['arg1']}).union({v['arg2']})
        
    uncovered_dict = {k:v for k, v in updated_entities.items() if k not in ents_covered}

    spans = {}
    for ent, loc in uncovered_dict.items():
        sent = sentences[loc['sentence']]
        if sent not in spans:
            spans[sent] = {}
            
        ent_data = {
            'start_pos': loc['sent_start'],
            'end_pos': loc['sent_end'],
            'text': loc['value'],
            'type': loc['type']
        }
        spans[sent][ent] = ent_data
    return spans


def tag_index(text, ent_type, ent_text, ent_loc):
    index_tag_map = {}
    tokens = np.array(nltk.word_tokenize(text))
    ent_tokens = nltk.word_tokenize(ent_text)
    possible_loc = np.where(tokens==ent_tokens[0])[0]
    ent_start_loc = possible_loc[np.argmin(
        [abs(ent_loc - len(' '.join(tokens[0:x]))) for x in possible_loc]
    )]
    ent_end_loc = ent_start_loc + len(ent_tokens) - 1
    if len(ent_tokens) == 1:
        index_tag_map[ent_start_loc] = f"S-{ent_type}"
    else:
        for i in range(ent_start_loc, ent_end_loc):
            if i == ent_start_loc:
                index_tag_map[i] = f"B-{ent_type}"
            elif i == ent_end_loc - 1:
                index_tag_map[i] = f"E-{ent_type}"
            else:
                index_tag_map[i] = f"I-{ent_type}"
    return index_tag_map, ent_start_loc, ent_end_loc


def get_token_tags(text, all_ents, all_rels=None):
    tokens = nltk.word_tokenize(text)
    tags = ['O'] * len(tokens)
    tag_dict = {}
    ent_dict = {}
    for eid, val in all_ents.items():
        tmp, s_idx, e_idx = tag_index(
            text, val['type'], val['text'], val['start_pos']
        )
        tag_dict = tag_dict | tmp
        ent_dict[eid] = {
            'start_idx': s_idx,
            'end_idx': e_idx
        }
    for i, t in tag_dict.items():
        tags[i] = t
        
    if all_rels:
        rels = all_rels.copy()
        rel_lst = []
        for _, rdata in rels.items():
            tokens_str = "|".join(tokens)
            arg1_idx = (f"{ent_dict[rdata['arg1']]['start_idx']}"
            f":{ent_dict[rdata['arg1']]['end_idx']}")
            arg2_idx = (f"{ent_dict[rdata['arg1']]['start_idx']}"
            f":{ent_dict[rdata['arg1']]['end_idx']}")
            rel_lst.append([tokens_str, arg1_idx, arg2_idx])
        
        ner_df = pd.DataFrame(
            list(zip(tokens, tags)), columns=['token', 'tag']
        ) 
        rels_df = pd.DataFrame(rel_lst, columns=['text', 'arg1', 'arg2'])
        return ner_df, rels_df
    else:
        ner_df = pd.DataFrame(
            list(zip(tokens, tags)), columns=['token', 'tag']
        ) 
        return ner_df

def parse_data(path):
    anns = sorted(glob.glob(f"{path}./*.ann"))
    files = sorted(glob.glob(f"{path}./*.txt"))
    df1_lst = []
    df2_lst = []
    for a, f in zip(anns, files):
        idx = a.split('\\')[-1].split('.')[0]
        assert idx == f.split('\\')[-1].split('.')[0]
        with open(a, 'r') as a_f:
            ann_str = a_f.read()
        with open(f, 'r') as f_f:
            txt_str = f_f.read()
        try:
            ent, rel = parse_entities_relationships(ann_str)
            rel_spans = get_relationship_spans(ent, rel, txt_str)
        except Exception as e:
            print(idx)
            raise ValueError
        df_ner_lst = []
        df_rel_lst = []
        tmp = []
        for i, (s, v) in enumerate(rel_spans.items()):
            try:
                df1, df2 = get_token_tags(s, v['all_entities'], v['relationships'])
                df1['sid'] = i
                df1['contains_rel'] = 1
                df2['sid'] = i
                df_ner_lst.append(df1)
                df_rel_lst.append(df2)
            except:
                print(f"{idx}: {s}")
                tmp.append((s, v))
        uncovered_ent_spans = get_uncovered_entity_spans(ent, rel, txt_str)
        for i, (s, v) in enumerate(uncovered_ent_spans.items()):
            try:
                df1 = get_token_tags(s, v)
                df1['sid'] = i
                df1['contains_rel'] = 0
                df_ner_lst.append(df1)
            except:
                print(f"{idx}: {s}")
                tmp.append((s, v))
        df_ner_per_rec = pd.concat(df_ner_lst)
        df_ner_per_rec['uid'] = idx
        df1_lst.append(df_ner_per_rec)
        df_rel_per_rec = pd.concat(df_rel_lst)
        df_rel_per_rec['uid'] = idx
        df2_lst.append(df_rel_per_rec)

    df_ner = pd.concat(df1_lst)
    df_rel = pd.concat(df2_lst)
    return df_ner, df_rel 

In [39]:
# base_path = "../../data/test/"
# file_name = "164726"
base_path = "../../data/training/"
file_name = "101215"

In [71]:
for i in range(ent['T59']['end'], 1000000):
    if txt_str[i] == '.':
        end_loc = i
        break

for i in range(ent['T57']['start'], 0, -1):
    if txt_str[i] == '.':
        start_loc = i    
        break
print(txt_str[start_loc: end_loc])

. Potassium Chloride 20 mEq Tab Sust.Rel. Particle/Crystal Sig:
One (1) Tab Sust


In [61]:
ent['T59']['end']

5670

In [68]:
txt_str[5692] == ''

'.'

In [64]:
txt_str[ent['T59']['start']: ent['T59']['end']+1000]

'Tab Sust.Rel. Particle/Crystal Sig:\nOne (1) Tab Sust.Rel. Particle/Crystal PO once a day for 7 days.\nDisp:*7 Tab Sust.Rel. Particle/Crystal(s)* Refills:*0*\n7. Pantoprazole 40 mg Tablet, Delayed Release (E.C.) Sig: One\n(1) Tablet, Delayed Release (E.C.) PO Q24H (every 24 hours).\nDisp:*30 Tablet, Delayed Release (E.C.)(s)* Refills:*0*\n8. Labetalol 200 mg Tablet Sig: Four (4) Tablet PO TID (3 times\na day).\nDisp:*360 Tablet(s)* Refills:*0*\n\n\nDischarge Disposition:\nExtended Care\n\nFacility:\n[**Hospital3 1107**] [**Hospital **] Hospital - [**Location (un) 38**]\n\nDischarge Diagnosis:\nType A Aortic Dissection, Aortic Insufficiency - s/p Repair\nPostoperative Bleeding/Pericardial Effusion - s/p Re-exploration\nPostoperative Toxic-Metabolic Encephalopathy\nHypertension\nDyslipidemia\n\n\nDischarge Condition:\nGood\n\n\nDischarge Instructions:\n1)Please shower daily. No baths. Pat dry incisions, do not rub.\n2)Avoid creams and lotions to surgical incisions.\n3)Call cardiac surge

In [40]:
ann_str = None
with open(base_path + file_name + ".ann", "r") as file:
    ann_str = file.read()

In [41]:
txt_str = None
with open(base_path + file_name + ".txt", "r") as file:
    txt_str = file.read()

In [45]:
a, b = sentence_start_end(txt_str)

In [48]:
ent, rel = parse_entities_relationships(ann_str)

In [53]:
rel

{'R1': {'type': 'Reason-Drug', 'arg1': 'T2', 'arg2': 'T3'},
 'R2': {'type': 'Route-Drug', 'arg1': 'T4', 'arg2': 'T3'},
 'R4': {'type': 'Reason-Drug', 'arg1': 'T8', 'arg2': 'T7'},
 'R5': {'type': 'Reason-Drug', 'arg1': 'T9', 'arg2': 'T7'},
 'R8': {'type': 'Strength-Drug', 'arg1': 'T15', 'arg2': 'T14'},
 'R9': {'type': 'Frequency-Drug', 'arg1': 'T16', 'arg2': 'T14'},
 'R10': {'type': 'Strength-Drug', 'arg1': 'T19', 'arg2': 'T18'},
 'R11': {'type': 'Frequency-Drug', 'arg1': 'T21', 'arg2': 'T20'},
 'R12': {'type': 'Strength-Drug', 'arg1': 'T23', 'arg2': 'T22'},
 'R13': {'type': 'Form-Drug', 'arg1': 'T24', 'arg2': 'T22'},
 'R14': {'type': 'Dosage-Drug', 'arg1': 'T25', 'arg2': 'T22'},
 'R15': {'type': 'Form-Drug', 'arg1': 'T26', 'arg2': 'T22'},
 'R16': {'type': 'Route-Drug', 'arg1': 'T27', 'arg2': 'T22'},
 'R17': {'type': 'Frequency-Drug', 'arg1': 'T28', 'arg2': 'T22'},
 'R18': {'type': 'Form-Drug', 'arg1': 'T30', 'arg2': 'T29'},
 'R19': {'type': 'Strength-Drug', 'arg1': 'T31', 'arg2': 'T29'

In [49]:
ent['T59']

{'type': 'Form',
 'start': 5640,
 'end': 5670,
 'value': 'Tab Sust.Rel. Particle/Crystal'}

In [51]:
a

[(0, 427),
 (428, 645),
 (646, 684),
 (685, 768),
 (770, 879),
 (880, 909),
 (910, 960),
 (961, 990),
 (992, 1045),
 (1047, 1161),
 (1163, 1204),
 (1205, 1254),
 (1256, 1304),
 (1306, 1329),
 (1330, 1382),
 (1384, 1422),
 (1424, 1527),
 (1528, 1633),
 (1634, 1681),
 (1682, 1722),
 (1723, 1771),
 (1772, 1962),
 (1963, 2014),
 (2015, 2068),
 (2069, 2119),
 (2120, 2152),
 (2153, 2202),
 (2203, 2281),
 (2282, 2368),
 (2369, 2454),
 (2455, 2492),
 (2493, 2541),
 (2542, 2615),
 (2616, 2648),
 (2649, 2780),
 (2784, 3302),
 (3303, 3369),
 (3370, 3445),
 (3446, 3507),
 (3508, 3612),
 (3613, 3669),
 (3670, 3751),
 (3752, 3831),
 (3832, 3934),
 (3935, 4028),
 (4029, 4119),
 (4120, 4240),
 (4241, 4342),
 (4343, 4409),
 (4410, 4532),
 (4533, 4585),
 (4586, 4640),
 (4641, 4683),
 (4685, 4745),
 (4747, 4770),
 (4772, 4825),
 (4827, 4888),
 (4890, 5053),
 (5054, 5153),
 (5154, 5156),
 (5157, 5278),
 (5279, 5281),
 (5282, 5345),
 (5346, 5380),
 (5381, 5459),
 (5460, 5504),
 (5505, 5587),
 (5588, 5613),

In [42]:
rel_spans = get_relationship_spans(ent, rel, txt_str)

AssertionError: T59

In [137]:
df_ner_lst = []
df_rel_lst = []
tmp = []
# for i, (s, v) in enumerate(rel_spans.items()):
#     try:
#         df1, df2 = get_token_tags(s, v['all_entities'], v['relationships'])
#         df1['sid'] = i
#         df1['contains_rel'] = 1
#         df2['sid'] = i
#         df_ner_lst.append(df1)
#         df_rel_lst.append(df2)
#     except Exception as e:
#         print("*****")
#         print(e)
#         print("*****")
#         print(f"{100035}: {s}")
#         tmp.append((s, v))

for i, (s, v) in enumerate(uncovered_ent_spans.items()):
    try:
        df1 = get_token_tags(s, v)
        df1['sid'] = i
        df1['contains_rel'] = 0
        df_ner_lst.append(df1)
    except Exception as e:
        print("*****")
        print(e)
        print("*****")
        print(f"{100035}: {s}")
        tmp.append((s, v))


In [10]:
parse_data("../../data/training/")

100035: # Ventilator associated pneumonia:  Patient developed a fever on
[**2-27**] with new infiltrates on chest xray while intubated. He was
initially covered with vanc/cefepime and cipro.
100035: He completed an 8 day course of
Vanco/Cefepime.
100035: 3. acetaminophen 325 mg Tablet Sig: Two (2) Tablet PO Q6H (every
6 hours) as needed for pain/fever.
100035: 11. acetaminophen 500 mg Tablet Sig: Two (2) Tablet PO TID (3
times a day) as needed for pain/fever.
100035: Your mental status
slowly improved, though you did have 2 seizures, last on [**3-18**]. You were started ons eizure medications for this.
100039: Brief Hospital Course:
38 yo F w/ h/o ALL in remission s/p cord transplant in [**1-13**],
anthracycline-induced cardiomyopathy (EF 15-20% [**1-14**]) and
recurrent nausea and vomiting who presents with 1 week abd pain,
acute on chronic renal failure and new hyperbilirubinemia.
100039: # Anthracycline-induced/ GVHD cardiomyopathy:  EF <20% on echo
from 2/[**2174**].
100039: She wa

ValueError: 

In [161]:
a = "../../data/training/100187.ann"
f = "../../data/training/100187.txt"
with open(a, 'r') as a_f:
    ann_str = a_f.read()
with open(f, 'r') as f_f:
    txt_str = f_f.read()

In [163]:
ent, rel = parse_entities_relationships(ann_str)

In [None]:
print(txt_str[:17279])

In [None]:
ent['T277']

In [None]:
spans

In [None]:
# t = 'Aggressive diuresis\nwith IV lasix and diuril were unsuccessful so CVVH was\ninitiated.'
# get_token_tags(t, spans[t]['all_entities'], spans[t]['relationships'])

In [None]:
def process_sentences_without_relationships(sentences, sent_positions, entities, relationships, relationship_spans):
    processed_sentences = {entities[rel['arg1']]['sentence'] for rel in relationships.values()}
    processed_sentences.update({entities[rel['arg2']]['sentence'] for rel in relationships.values()})
    for i, (sent_start, sent_end) in enumerate(sent_positions):
        if i not in processed_sentences:
            entities_in_sent = [e for e in entities.values() if e['sentence'] == i]
            if len(entities_in_sent) > 1:
                span = sentences[i]

                if span not in relationship_spans:
                    relationship_spans[span] = {
                        'relationships': [],
                        'entities': {}
                    }

                for j, (e1, e2) in enumerate(zip(entities_in_sent[:-1], entities_in_sent[1:])):
                    print(e1['id'], e2['id'])
                    relationship_spans[span]['relationships'].append({
                        'id': f'NOREL{i}_{j}',
                        'type': 'NOREL',
                        'arg1': e1['id'],
                        'arg2': e2['id']
                    })

                for e in entities_in_sent:
                    relationship_spans[span]['entities'][e['id']] = e
    return relationship_spans

In [None]:
for e, v in ent.items():
    v['sentence']

In [None]:
rel_spans = get_relationship_spans(ent, rel, txt_str)

In [117]:
nltk.word_tokenize(s)

['#',
 'Ventilator',
 'associated',
 'pneumonia',
 ':',
 'Patient',
 'developed',
 'a',
 'fever',
 'on',
 '[',
 '*',
 '*',
 '2-27',
 '*',
 '*',
 ']',
 'with',
 'new',
 'infiltrates',
 'on',
 'chest',
 'xray',
 'while',
 'intubated',
 '.',
 'He',
 'was',
 'initially',
 'covered',
 'with',
 'vanc/cefepime',
 'and',
 'cipro',
 '.']

In [118]:
s

'# Ventilator associated pneumonia:  Patient developed a fever on\n[**2-27**] with new infiltrates on chest xray while intubated. He was\ninitially covered with vanc/cefepime and cipro.'

In [115]:
s[162:170]

'/cefepim'

In [105]:
df_ner_lst = []
df_rel_lst = []
for i, (s, v) in enumerate(rel_spans.items()):
    df1, df2 = get_token_tags(s, v['all_entities'], v['relationships'])
    df1['sid'] = i
    df1['contains_rel'] = 1
    df2['sid'] = i
    df_ner_lst.append(df1)
    df_rel_lst.append(df2)

ValueError: attempt to get argmin of an empty sequence

In [87]:
get_token_tags(s, rel_spans[s]['all_entities'], rel_spans[s]['relationships'])[1]

{'R1': {'type': 'Duration-Drug',
  'arg1': 'T1',
  'arg1_start': 5,
  'arg1_end': 5,
  'arg2': 'T2',
  'arg2_start': 8,
  'arg2_end': 8}}

In [60]:
uncovered_ent_spans = get_uncovered_entity_spans(ent, rel, txt_str)

In [67]:
s1 = list(uncovered_ent_spans.keys())[0]

'The patient had persistent symptoms and was\nstarted on levofloxacin by his PCP two days prior to admission.'

In [69]:
get_token_tags(s1, uncovered_ent_spans[s1])

[('The', 'O'),
 ('patient', 'O'),
 ('had', 'O'),
 ('persistent', 'O'),
 ('symptoms', 'O'),
 ('and', 'O'),
 ('was', 'O'),
 ('started', 'O'),
 ('on', 'O'),
 ('levofloxacin', 'S-Drug'),
 ('by', 'O'),
 ('his', 'O'),
 ('PCP', 'O'),
 ('two', 'O'),
 ('days', 'O'),
 ('prior', 'O'),
 ('to', 'O'),
 ('admission', 'O'),
 ('.', 'O')]