In [11]:
import pandas as pd
import numpy as np
from pathlib import Path
import spacy

In [None]:
# Install the small English pipeline into the environment of the *running
# kernel*. A bare `!python` is resolved through the shell's PATH and may be a
# different interpreter, which would leave the model invisible to `spacy.load`.
# The guard keeps Restart & Run All from re-downloading an installed model.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

# Fetching Data 

In [195]:
# Entity annotations, one row per annotated span. The preview below shows the
# columns filename, mark, label, offset1, offset2, span and code.
entities = pd.read_csv(
    'data/entities.tsv',
    sep='\t',
)
entities.head()

Unnamed: 0,filename,mark,label,offset1,offset2,span,code
0,es-S0212-71992007000100007-1,T1,ENFERMEDAD,40,61,arterial hypertension,38341003
1,es-S0212-71992007000100007-1,T2,ENFERMEDAD,66,79,polyarthrosis,36186002
2,es-S0212-71992007000100007-1,T3,ENFERMEDAD,1682,1698,pleural effusion,60046008
3,es-S0212-71992007000100007-1,T4,ENFERMEDAD,1859,1875,pleural effusion,60046008
4,es-S0212-71992007000100007-1,T5,ENFERMEDAD,1626,1648,lower lobe atelectasis,46621007


In [205]:
# Directory holding the documents, one `.txt` file each.
text_path = 'data/text/'

# `Path.glob` yields entries in arbitrary, filesystem-dependent order. Sorting
# keeps the positional document indices used further down (e.g. `file_no`)
# stable between runs and machines.
text_files = sorted(Path(text_path).glob('*.txt'))

In [206]:
# Collect the document paths here (sorted, because glob order is arbitrary)
# rather than reading them from `text_files`: that name is rebound to plain
# document names at the end of this cell, so relying on it would make the
# cell fail when it is executed a second time.
text_paths = sorted(Path(text_path).glob('*.txt'))

# Document name (file name without the `.txt` extension) -> full text.
# `Path.stem` is used instead of slicing `str(path)` by `len(text_path)`,
# which only gave the right name when `text_path` ended with a separator.
file_text = {}
for path in text_paths:
    file_name = path.stem
    file_text[file_name] = path.read_text(encoding="UTF-8")

# From here on `text_files` holds document names, not paths; a document's
# position in this list is its index in the structures built later.
text_files = [path.stem for path in text_paths]

# Sentence Splitting & Tokenization

In [208]:
# Shared spaCy pipeline; `tokenize` runs it on each document to obtain
# sentence boundaries and tokens.
nlp = spacy.load("en_core_web_sm")

In [209]:
def tokenize(file_name):
    """Split one document into sentences and tokens.

    The text is looked up in ``file_text`` under ``file_name`` and run
    through the ``nlp`` pipeline.

    Returns a pair ``(sentence_ids, tokens)``:
      * ``sentence_ids`` - ``[0, 1, ..., n - 1]`` for the ``n`` sentences found;
      * ``tokens`` - one list per sentence of ``(token text, character offset
        of the token in the document)`` tuples.
    """
    parsed = nlp(file_text[file_name])
    tokens = [
        [(token.text, token.idx) for token in sentence]
        for sentence in parsed.sents
    ]
    sentence_ids = list(range(len(tokens)))
    return sentence_ids, tokens

# Converting Spans to IOB Format

In [286]:
def get_bio_tags(tokens, entities):
    """Tag the tokens of one sentence with BIO labels taken from character spans.

    Parameters
    ----------
    tokens : sequence of (text, start) pairs
        Tokens in document order; ``start`` is the character offset of the
        token within the document.
    entities : list of [start, end, label, ...]
        Annotated spans as character offsets, ``end`` exclusive. Items after
        the label (e.g. the surface string) are ignored.
        **Mutated in place**: the list is sorted by start offset, and spans
        that lie entirely before a processed token are removed. This lets a
        caller pass the same list for the consecutive sentences of a document.

    Returns
    -------
    list of str
        One tag per token: ``'B-<label>'`` for the first token inside a span,
        ``'I-<label>'`` for the following tokens of that span, ``'O'`` otherwise.

    Notes
    -----
    A token counts as inside a span only when it lies completely within
    ``[start, end)``. Flat BIO tags cannot express overlapping spans: the
    span that starts first (the longer one on ties) is the one that is tagged.
    """
    # Earliest span first; on equal starts the longer span first, so an
    # enclosing span is considered before the spans nested inside it.
    entities.sort(key=lambda ent: (ent[0], -ent[1]))

    tags = []
    open_entity = None  # span the previous token was tagged with, if any
    for token in tokens:
        text, tok_start = token[0], token[1]
        # Length of the token *text*; `len(token)` would be the length of the
        # (text, start) pair, i.e. always 2.
        tok_end = tok_start + len(text)

        # Drop every span that ends at or before this token *before* testing
        # the token, so a token that opens the next span is not tagged 'O'.
        while entities and entities[0][1] <= tok_start:
            entities.pop(0)

        if entities:
            candidate = entities[0]
            start, end, label = candidate[0], candidate[1], candidate[2]
            if start <= tok_start and tok_end <= end:
                prefix = 'I-' if candidate is open_entity else 'B-'
                tags.append(prefix + label)
                open_entity = candidate
                continue

        tags.append('O')
        open_entity = None
    return tags

In [287]:
# Per-document fields collected in `res`; each one becomes a list with one
# entry per document.
file_info = [
    'text',
    'entities',
    'tags',
    'sentence_ids',
    'tokens',
]

In [288]:
# One slot per document for every field in `file_info`. The fields are
# parallel lists: position `i` in each of them belongs to `text_files[i]`.
res = {
    info: ["" if info == 'text' else [] for _ in text_files]
    for info in file_info
}

# Document name -> position in `text_files`.
file_idx = {name: idx for idx, name in enumerate(text_files)}

# Named fields instead of positional tuple indices, so the loop does not
# depend on the column order of the annotation file.
for entity in entities.itertuples(index=False):
    file = entity.filename
    idx = file_idx[file]
    # An empty text slot marks a document that has not been tokenized yet, so
    # every annotated document goes through spaCy exactly once. Documents
    # without any annotation keep their empty slots.
    if res['text'][idx] == "":
        res['text'][idx] = file_text[file]
        res['sentence_ids'][idx], res['tokens'][idx] = tokenize(file)
    # entity -> offset1, offset2, label, span
    res['entities'][idx].append(
        [entity.offset1, entity.offset2, entity.label, entity.span]
    )

for idx in range(len(text_files)):
    # get_bio_tags walks the spans front to back and consumes them as it
    # goes, so it needs them ordered by start offset (longer span first on
    # ties); the annotation file does not guarantee that order. `sorted`
    # returns a new list, leaving res['entities'] in annotation order.
    pending = sorted(res['entities'][idx], key=lambda ent: (ent[0], -ent[1]))
    res['tags'][idx] = [
        get_bio_tags(sentence_tokens, pending)
        for sentence_tokens in res['tokens'][idx]
    ]

In [289]:
# Document / sentence picked for the spot checks in the following cells
# (both are positions in the lists stored in `res`).
file_no, sentence_no = 1, 1

In [290]:
# (token text, character offset) pairs of the selected sentence.
res['tokens'][file_no][sentence_no]

[('The', 139),
 ('first', 143),
 ('of', 149),
 ('the', 152),
 ('corpses', 156),
 (',', 163),
 ('corresponding', 165),
 ('to', 179),
 ('the', 182),
 ('female', 186),
 (',', 192),
 ('was', 194),
 ('referred', 198),
 ('with', 207),
 ('the', 212),
 ('clinical', 216),
 ('judgement', 225),
 ('of', 235),
 ('severe', 238),
 ('respiratory', 245),
 ('failure', 257),
 ('with', 265),
 ('suspected', 270),
 ('Potter', 280),
 ("'s", 286),
 ('Syndrome', 289),
 ('and', 298),
 ('severe', 302),
 ('oligohydramnios', 309),
 (';', 324),
 ('she', 326),
 ('was', 330),
 ('born', 334),
 ('by', 339),
 ('emergency', 342),
 ('caesarean', 352),
 ('section', 362),
 ('due', 370),
 ('to', 374),
 ('breech', 377),
 ('presentation', 384),
 ('and', 397),
 ('the', 401),
 ('Apgar', 405),
 ('test', 411),
 ('was', 416),
 ('1/3/7', 420),
 (';', 425),
 ('minutes', 427),
 ('later', 435),
 ('she', 441),
 ('died', 445),
 ('.', 449)]

In [291]:
# BIO tags of the same sentence, one per token shown above.
res['tags'][file_no][sentence_no]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ENFERMEDAD',
 'I-ENFERMEDAD',
 'I-ENFERMEDAD',
 'O',
 'O',
 'B-ENFERMEDAD',
 'I-ENFERMEDAD',
 'I-ENFERMEDAD',
 'O',
 'B-ENFERMEDAD',
 'I-ENFERMEDAD',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [292]:
# Sentence indices of the selected document, as returned by `tokenize`.
res['sentence_ids'][file_no]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [293]:
# Annotated spans of the selected document as [offset1, offset2, label, span],
# in the order they appear in the `entities` table.
res['entities'][file_no]

[[238, 264, 'ENFERMEDAD', 'severe respiratory failure'],
 [280, 297, 'ENFERMEDAD', "Potter's Syndrome"],
 [302, 324, 'ENFERMEDAD', 'severe oligohydramnios'],
 [556, 568, 'ENFERMEDAD', 'micrognathia'],
 [1057, 1062, 'ENFERMEDAD', 'cysts'],
 [1907, 1912, 'ENFERMEDAD', 'cysts'],
 [2004, 2009, 'ENFERMEDAD', 'cysts'],
 [2158, 2163, 'ENFERMEDAD', 'cysts'],
 [2262, 2267, 'ENFERMEDAD', 'cysts'],
 [2332, 2337, 'ENFERMEDAD', 'cysts'],
 [2252, 2267, 'ENFERMEDAD', 'medullary cysts'],
 [1004, 1021, 'ENFERMEDAD', 'cystic formations']]

In [294]:
# Raw text of the selected document; the offsets above index into this string.
res['text'][file_no]

"Two newborns, male and female from the same mother, died at 10 and 45 minutes of life respectively, and underwent post-mortem examination. The first of the corpses, corresponding to the female, was referred with the clinical judgement of severe respiratory failure with suspected Potter's Syndrome and severe oligohydramnios; she was born by emergency caesarean section due to breech presentation and the Apgar test was 1/3/7; minutes later she died. External examination revealed a subcyanotic colour, triangular facies with mongoloid parpebral fissures, micrognathia, broad nasal root and prominent occiput. The abdomen, globular, hard and slightly dented, allowed the palpation of two large masses occupying both renal fossae and hemiabdomenes. When the cavities were opened, the presence of two large renal masses measuring 10 x 8 x 5.5 cm and 12 x 8 x 6 cm with weights of 190 and 235 g respectively, stood out. Although the renal silhouette could be discerned, the surface, dented, showed nume

In [295]:
import json

# Persist the processed corpus (`res`) as a single JSON document. JSON has no
# tuple type, so the (text, offset) token pairs are written as two-element
# arrays.
dump_file = 'data/processed_inp_data.json'

with open(dump_file, 'w') as out:
    json.dump(res, out)

In [296]:
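# Optional sanity check (disabled): reload the dump and look at the tags of
# the sentence inspected above. Uncomment to run.
# with open(dump_file) as json_file:
#     data = json.load(json_file)

# data['tags'][file_no][sentence_no]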