# Parse i2b2 data to CoNLL format for BLSTM-CNN model

In [1]:
import os, re, shutil

In [2]:
# Create the working directory tree. exist_ok=True makes the cell safe to re-run:
# without it os.makedirs raises FileExistsError once the folders exist.
for folder in ('./data', './data/entries', './data/unfiltered_annotations'):
    os.makedirs(folder, exist_ok=True)

Place the i2b2 folders "annotations_ground_truth", "training_ground_truth", and "train.test.released.8.17.09" in "data". Copy all files in "annotations_ground_truth/converted.noduplicates.sorted/" (except 'bucharest_v1.py') and in "training_ground_truth" (except 'CHANGE.HISTORY.txt') to "unfiltered_annotations"

In [4]:
dir_name = "./data/unfiltered_annotations/" 
data = os.listdir(dir_name)

# Rename every annotation file to its leading run of digits (the record id).
# Files that cannot be renamed safely are collected and reported instead of being
# silently ignored, so they can be fixed by hand.
skipped = []
for filename in data:
    base = re.split(r'\D', filename)[0]
    if base == filename:
        continue  # already renamed, e.g. when this cell is re-run
    if not base or os.path.exists(dir_name + base):
        # No leading digits, or another file already uses this id: os.rename would
        # fail or (on POSIX) silently replace the existing file.
        skipped.append(filename)
        continue
    try:
        os.rename(dir_name + filename, dir_name + base)
    except OSError as err:
        skipped.append(filename)
        print("Could not rename", filename, "-", err)

if skipped:
    print("Skipped files (rename or remove manually):", sorted(skipped))

Manually rename any files that were skipped, and remove "827931.Alan.Aronson.NIH.NLM.a1_827931.Faisal.Farooq.Siemens.a2.Kim.m.save", which is a temporary file. The same file name without the .save extension is the definitive version of this file.

In [5]:
# Copy the text file of every annotated record from "train.test.released.8.17.09"
# into "entries" (the source files are copied, not moved).
annotated_names = os.listdir('./data/unfiltered_annotations/')
for record_name in annotated_names:
    source_path = './data/train.test.released.8.17.09/' + record_name
    target_path = './data/entries/' + record_name
    shutil.copy(source_path, target_path)

In [6]:
# For each annotated file, create a new file that keeps only the "||"-separated
# fields at positions 0, 4 and 5 of every line (the 'm', 'du' and 'r' attributes);
# all other attributes are removed.
# All files are stored in the folder "annotations".

os.makedirs('./data/annotations/', exist_ok=True)  # exist_ok: safe to re-run the cell

# Not used by this cell; kept so that any code relying on these names still works.
duration_pattern = 'du="nm"'
reason_pattern = 'r="nm"'

for f in os.listdir('./data/unfiltered_annotations'):
    with open('./data/unfiltered_annotations/'+f, 'r') as r, open('./data/annotations/'+f, 'w') as w:
        for line_no, l in enumerate(r, start=1):
            if not l.strip():
                continue  # blank lines carry no annotation
            # Split once; drop the line terminator first so that a line with exactly
            # six fields does not produce an extra empty line in the output.
            fields = l.rstrip("\r\n").split("||")
            if len(fields) < 6:
                raise ValueError(
                    "Unexpected annotation format in %s, line %d: %r" % (f, line_no, l)
                )
            filtered_entry = "||".join((fields[0], fields[4], fields[5])) + "\n"
            w.write(filtered_entry)

The code that follows is available at: https://github.com/mxhofer/i2b2_2009-to-CoNLL

In [2]:
import nltk
import numpy as np
import pandas as pd

In [3]:
# Collect the visible (non-hidden) file names of both folders in sorted order.
a_ids = tuple(sorted(
    name for name in os.listdir("./data/annotations") if not name.startswith(".")
))
e_ids = tuple(sorted(
    name for name in os.listdir("./data/entries") if not name.startswith(".")
))

intersection = list(set(a_ids) & set(e_ids))
# Later cells open an entry file for every annotation id, so report missing
# entries here instead of staying silent.
missing_entries = sorted(set(a_ids) - set(e_ids))
if not missing_entries:
    print("Success: all anotations have a corresponding entry.", len(intersection))
else:
    print("Warning:", len(missing_entries),
          "annotation files have no corresponding entry:", missing_entries)

Success: all anotations have a corresponding entry. 261


#### Build corpora

In [4]:
# build annotation and entry corpora

a_corpus = []
e_corpus = []

# a_ids drives both reads, so a_corpus[k] and e_corpus[k] hold the lines of the
# annotation file and of the entry file with the same name.
for record_name in a_ids:
    annotation_path = "./data/annotations/" + str(record_name)
    with open(annotation_path) as handle:
        a_corpus.append(handle.read().splitlines())

    entry_path = "./data/entries/" + str(record_name)
    with open(entry_path) as handle:
        e_corpus.append(handle.read().splitlines())

#### Set up dataframe

In [5]:
# Columns of the token table built in this notebook. The original note listed the
# fuller field set: ["id", "row", "offset", "word", "POS", "chunk", "NER"].
entries_cols = [
    "id",
    "row",
    "offset",
    "word",
]
entries_df = pd.DataFrame(data=None, columns=entries_cols)

In [6]:
# Columns of the annotation table: one row per tagged token position.
annotations_cols = [
    "id",
    "NER_tag",
    "row",
    "offset",
    "length",
]
annotations_df = pd.DataFrame(data=None, columns=annotations_cols)

#### Number of annotations

In [7]:
# A field whose value is "nm" marks an attribute that is absent in that entry,
# so each counter tallies the lines where the attribute is NOT "nm".
annotation_lines = [entry for doc in a_corpus for entry in doc]

med_count = sum('m="nm"' not in entry for entry in annotation_lines)
dur_count = sum('du="nm"' not in entry for entry in annotation_lines)
reason_count = sum('r="nm"' not in entry for entry in annotation_lines)

print("Medication annotations: ", med_count)
print("Duration annotations: ", dur_count)
print("Reason annotations: ", reason_count)

Medication annotations:  9318
Duration annotations:  571
Reason annotations:  1694


#### Build annotations data frame

In [8]:
def _annotation_rows(doc_id, field):
    """Expand one 'label="text" row:offset row:offset' field into per-token rows.

    Returns a list of [id, BIO tag, row, offset, length] rows, or an empty list
    when the field carries no position (e.g. 'du="nm"') or cannot be parsed.
    """
    # Split on the first "=" only, so annotated text that itself contains "="
    # does not truncate the value.
    label, sep, value = field.partition("=")
    if not sep or ":" not in value:
        return []

    span = value.split(" ")[-2:]  # last two tokens: start and end position
    if len(span) < 2:
        return []
    label = label.lstrip(" ")

    # Some annotations have non-standard formatting (e.g. several spans joined by
    # a comma); these cannot be converted to integers and are skipped.
    try:
        row_a, offset_a = span[0].split(":")[0], int(span[0].split(":")[1])
        row_b, offset_b = span[1].split(":")[0], int(span[1].split(":")[1])
    except (ValueError, IndexError):
        return []

    length = offset_b - offset_a + 1
    if length > 1 and row_a == row_b:
        # 1 row = 1 token with a tag: the first token is B-, the following ones I-
        return [
            [doc_id, ("B-" if offset == offset_a else "I-") + label, row_a, offset, 1]
            for offset in range(offset_a, offset_b + 1)
        ]
    # Single token, or a span crossing a line break (TODO: tags over line breaks):
    # only the first token is tagged.
    return [[doc_id, "B-" + label, row_a, offset_a, length]]


tmp_list = []

for i, document in enumerate(a_corpus):
    for row in document:
        for tag in row.split("||"):
            tmp_list.extend(_annotation_rows(a_ids[i], tag))

# reset_index adds the "index" column that the following cell drops again
annotations_df = pd.DataFrame(tmp_list, columns=annotations_cols).reset_index()

In [9]:
annotations_df = annotations_df.drop(columns=["index", "length"])
annotations_df.shape

(19090, 4)

In [10]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset
0,106650,B-m,12,0
1,106650,I-m,12,1
2,106650,B-m,23,0
3,106650,B-du,23,6
4,106650,I-du,23,7


In [11]:
entries_df.head()

Unnamed: 0,id,row,offset,word


#### Build entries data frame

List of token modifications:
    - tokens containing "|": ignored
    - "." removed from end of token
    - tab characters removed from tokens
    - empty tokens: ignored

In [12]:
entries_df = pd.DataFrame(columns=entries_cols)  # reset df
tmp_list = []

for doc_i, document in enumerate(e_corpus):

    # every document is framed by sentinel rows with id/row/offset 0
    tmp_list.append([0, 0, 0, "-DOCSTART-"])
    tmp_list.append([0, 0, 0, "-EMPTYLINE-"])

    # rows are numbered from 1, token offsets within a row from 0
    for row_number, text_line in enumerate(document, start=1):
        for token_offset, raw_token in enumerate(text_line.split(" ")):
            # strip trailing "." and drop tabs before deciding whether to keep it
            token = raw_token.rstrip(".").replace("\t", "")
            if not token or "|" in token:
                continue
            tmp_list.append([a_ids[doc_i], row_number, token_offset, token])

    tmp_list.append([0, 0, 0, "-EMPTYLINE-"])

entries_df = pd.DataFrame(tmp_list, columns=entries_cols)

In [13]:
entries_df.head()

Unnamed: 0,id,row,offset,word
0,0,0,0,-DOCSTART-
1,0,0,0,-EMPTYLINE-
2,106650,1,0,RECORD
3,106650,1,1,#106650
4,106650,2,0,912344838


In [14]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset
0,106650,B-m,12,0
1,106650,I-m,12,1
2,106650,B-m,23,0
3,106650,B-du,23,6
4,106650,I-du,23,7


In [15]:
# Every tag containing "B-" opens a new named entity; keep one marker per entity.
entity_starts = sum("B-" in tag for tag in annotations_df["NER_tag"])
ner_counter = [1] * entity_starts
print(len(ner_counter), "named entities")

11562 named entities


#### Join entries and annotations

In [16]:
# ensure correct dtypes
annotations_df[['id', 'row', 'offset']] = annotations_df[['id', 'row', 'offset']].apply(pd.to_numeric)
annotations_df['NER_tag'] = annotations_df["NER_tag"].astype(str)
entries_df[['id', 'row', 'offset']] = entries_df[['id', 'row', 'offset']].apply(pd.to_numeric)
entries_df["word"] = entries_df["word"].astype(str)

In [17]:
# Left join: every token row of entries_df is kept; tokens without an annotation
# at the same (id, row, offset) get NaN in NER_tag.
# NOTE(review): if annotations_df contains several rows for the same key, the
# token row is repeated in the result — confirm whether such duplicates exist.
result_df = pd.merge(entries_df, annotations_df, how="left", on=['id', 'row', 'offset'])

In [18]:
# replace NaNs with "O"
print("columns with missing data:\n", result_df.isna().any())
result_df = result_df.fillna("O")

columns with missing data:
 id         False
row        False
offset     False
word       False
NER_tag     True
dtype: bool


In [19]:
print("columns with missing data:\n", result_df.isna().any())

columns with missing data:
 id         False
row        False
offset     False
word       False
NER_tag    False
dtype: bool


In [20]:
result_df = result_df.drop(columns=["id", "row", "offset"])
result_df.head()

Unnamed: 0,word,NER_tag
0,-DOCSTART-,O
1,-EMPTYLINE-,O
2,RECORD,O
3,#106650,O
4,912344838,O


In [21]:
result_df.shape

(295121, 2)

In [22]:
# Author's note: 71 fewer annotations than expected, because annotations that
# span line breaks are not included.
entity_starts = sum("B-" in tag for tag in result_df["NER_tag"])
ner_counter = [1] * entity_starts
print(len(ner_counter), "named entities")

11557 named entities


#### POS tagger

In [23]:
from nltk.chunk.regexp import RegexpChunkParser, ChunkRule, RegexpParser
from nltk.tree import Tree

In [24]:
# The whole word column (sentinel tokens included) is tagged as one sequence.
text = result_df["word"].tolist()
text_pos = nltk.pos_tag(text)
# keep only the tag of each (word, tag) pair
text_pos_list = [pos for _, pos in text_pos]

In [25]:
len(text_pos_list)

295121

In [26]:
result_df.columns

Index(['word', 'NER_tag'], dtype='object')

In [27]:
result_df["POS_tag"] = text_pos_list

In [28]:
result_df.head()

Unnamed: 0,word,NER_tag,POS_tag
0,-DOCSTART-,O,JJ
1,-EMPTYLINE-,O,NN
2,RECORD,O,NNP
3,#106650,O,VBZ
4,912344838,O,CD


#### CoNLL chunk tagger

In [29]:
text_test = "EU rejects German call to boycott British lamb.".split(" ")
text_pos_test = nltk.pos_tag(text_test)

In [30]:
text_pos_test

[('EU', 'NNP'),
 ('rejects', 'VBZ'),
 ('German', 'JJ'),
 ('call', 'NN'),
 ('to', 'TO'),
 ('boycott', 'VB'),
 ('British', 'JJ'),
 ('lamb.', 'NN')]

Reference grammar used for building the regex chunk rules below:
grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN.*>+} # noun phrase
    PP: {<IN><NP>}               # prepositional phrase
    VP: {<MD>?<VB.*><NP|PP>}     # verb phrase
    CLAUSE: {<NP><VP>}           # full clause
"""

#### Noun phrases

In [31]:
# Noun-phrase tag pattern: optional DT, any number of JJ* tags, one or more NN* tags
np_pattern = "<DT>?<JJ.*>*<NN.*>+"
rule_0 = ChunkRule(np_pattern, "More complete chunk NP sequences")

chunk_parser_np = RegexpChunkParser(rules=[rule_0], chunk_label='NP')

# parse the POS-tagged token sequence into a tree with NP subtrees
chunk_result_tree_np = chunk_parser_np.parse(text_pos)

In [32]:
# Flatten the chunk tree into one BIO tag per token: tokens outside any chunk get
# "O", the first token of a chunk "B-<label>", the remaining ones "I-<label>".
chunk_tag_np = []

for node in chunk_result_tree_np:
    if not isinstance(node, Tree):
        chunk_tag_np.append("O")
        continue
    chunk_label = node.label()
    chunk_tag_np.extend(
        ("B-" if position == 0 else "I-") + chunk_label
        for position in range(len(node))
    )

In [33]:
len(chunk_tag_np) == result_df.shape[0]  # check that chunk col has same length

True

#### Verb phrases

In [34]:
# Tag pattern for the "VP" chunks: a single token tagged VBD, IN or ".".
# Raw string: "\." inside a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions; the pattern text is unchanged.
rule_1 = ChunkRule(r"<VBD|IN|\.>", "Verb phrases")

chunk_parser_vp = RegexpChunkParser([rule_1],chunk_label='VP')

chunk_result_tree_vp = chunk_parser_vp.parse(text_pos)

In [35]:
# Flatten the VP chunk tree into one BIO tag per token ("O" outside chunks,
# "B-<label>" for the first token of a chunk, "I-<label>" for the others).
chunk_tag_vp = []

for node in chunk_result_tree_vp:
    if not isinstance(node, Tree):
        chunk_tag_vp.append("O")
        continue
    chunk_label = node.label()
    chunk_tag_vp.extend(
        ("B-" if position == 0 else "I-") + chunk_label
        for position in range(len(node))
    )

In [36]:
len(chunk_tag_np) == result_df.shape[0] == len(chunk_tag_vp)

True

In [37]:
# Fill the gaps of the NP tagging: wherever the NP tag is "O", take the VP tag
# at the same position; existing NP tags are kept. The list is updated in place.
chunk_tag_np[:] = [
    chunk_tag_vp[position] if np_tag == "O" else np_tag
    for position, np_tag in enumerate(chunk_tag_np)
]

In [38]:
result_df["chunk_tag"] = chunk_tag_np

In [39]:
result_df = result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']]  # order columns

In [40]:
result_df.shape

(295121, 4)

In [41]:
# Cast the four output columns to str. Rebinding result_df to the converted frame
# (instead of assigning back into a frame that was produced by column selection)
# avoids pandas' SettingWithCopyWarning.
result_df = result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']].astype(str)
result_df.dtypes

word         object
POS_tag      object
chunk_tag    object
NER_tag      object
dtype: object

In [42]:
result_df.shape

(295121, 4)

In [43]:
# NOTE(review): reindex() is called without new labels, so this does not appear to
# renumber the rows; reset_index(drop=True) may have been intended — confirm.
result_df = result_df.reindex()

In [44]:
len(result_df)

295121

In [45]:
result_df.tail()

Unnamed: 0,word,POS_tag,chunk_tag,NER_tag
295116,6/10,CD,O,O
295117,T:,NNP,B-NP,O
295118,1/22,CD,O,O
295119,[report_end],NNP,B-NP,O
295120,-EMPTYLINE-,NN,I-NP,O


#### Write to txt

In [46]:
np.savetxt("data_blstm_cnn.txt", result_df.values, fmt="%s")

#### Split data 

In [47]:
devnum = 231650 #record 895892
testnum = 261005 #record 944118

# Split by line index: [0, devnum) -> train, [devnum, testnum) -> dev, rest -> test
with open('data_blstm_cnn.txt') as f, open('train_blstm_cnn.txt', 'w') as train, open('dev_blstm_cnn.txt', 'w') as dev, open('test_blstm_cnn.txt', 'w') as test:
    x = f.readlines()
    for line_index, line_text in enumerate(x):
        if line_index < devnum:
            target = train
        elif line_index < testnum:
            target = dev
        else:
            target = test
        target.write(line_text)

Copy "train_blstm_cnn.txt", "dev_blstm_cnn.txt", and "test_blstm_cnn.txt" to "Named_Entity_Recognition-BidirectionalLSTM-CNN-CoNLL-master/data/"

# Parse i2b2 data to CoNLL format for BLSTM-CRF model

In [1]:
import nltk, os
import numpy as np
import pandas as pd

In [2]:
# Collect the visible (non-hidden) file names of both folders in sorted order.
a_ids = tuple(sorted(
    name for name in os.listdir("./data/annotations") if not name.startswith(".")
))
e_ids = tuple(sorted(
    name for name in os.listdir("./data/entries") if not name.startswith(".")
))

intersection = list(set(a_ids) & set(e_ids))
# Later cells open an entry file for every annotation id, so report missing
# entries here instead of staying silent.
missing_entries = sorted(set(a_ids) - set(e_ids))
if not missing_entries:
    print("Success: all anotations have a corresponding entry.", len(intersection))
else:
    print("Warning:", len(missing_entries),
          "annotation files have no corresponding entry:", missing_entries)

Success: all anotations have a corresponding entry. 261


In [3]:
# build annotation and entry corpora

a_corpus = []
e_corpus = []

# a_ids drives both reads, so a_corpus[k] and e_corpus[k] hold the lines of the
# annotation file and of the entry file with the same name.
for record_name in a_ids:
    annotation_path = "./data/annotations/" + str(record_name)
    with open(annotation_path) as handle:
        a_corpus.append(handle.read().splitlines())

    entry_path = "./data/entries/" + str(record_name)
    with open(entry_path) as handle:
        e_corpus.append(handle.read().splitlines())

#### Set dataframes

In [4]:
# Columns of the token table built in this notebook. The original note listed the
# fuller field set: ["id", "row", "offset", "word", "POS", "chunk", "NER"].
entries_cols = [
    "id",
    "row",
    "offset",
    "word",
]
entries_df = pd.DataFrame(data=None, columns=entries_cols)

In [5]:
# Columns of the annotation table: one row per tagged token position.
annotations_cols = [
    "id",
    "NER_tag",
    "row",
    "offset",
    "length",
]
annotations_df = pd.DataFrame(data=None, columns=annotations_cols)

#### Build dataframes

In [6]:
def _annotation_rows(doc_id, field):
    """Expand one 'label="text" row:offset row:offset' field into per-token rows.

    Returns a list of [id, BIO tag, row, offset, length] rows, or an empty list
    when the field carries no position (e.g. 'du="nm"') or cannot be parsed.
    """
    # Split on the first "=" only, so annotated text that itself contains "="
    # does not truncate the value.
    label, sep, value = field.partition("=")
    if not sep or ":" not in value:
        return []

    span = value.split(" ")[-2:]  # last two tokens: start and end position
    if len(span) < 2:
        return []
    label = label.lstrip(" ")

    # Some annotations have non-standard formatting (e.g. several spans joined by
    # a comma); these cannot be converted to integers and are skipped.
    try:
        row_a, offset_a = span[0].split(":")[0], int(span[0].split(":")[1])
        row_b, offset_b = span[1].split(":")[0], int(span[1].split(":")[1])
    except (ValueError, IndexError):
        return []

    length = offset_b - offset_a + 1
    if length > 1 and row_a == row_b:
        # 1 row = 1 token with a tag: the first token is B-, the following ones I-
        return [
            [doc_id, ("B-" if offset == offset_a else "I-") + label, row_a, offset, 1]
            for offset in range(offset_a, offset_b + 1)
        ]
    # Single token, or a span crossing a line break (TODO: tags over line breaks):
    # only the first token is tagged.
    return [[doc_id, "B-" + label, row_a, offset_a, length]]


tmp_list = []

for i, document in enumerate(a_corpus):
    for row in document:
        for tag in row.split("||"):
            tmp_list.extend(_annotation_rows(a_ids[i], tag))

# reset_index adds the "index" column that a later cell drops again
annotations_df = pd.DataFrame(tmp_list, columns=annotations_cols).reset_index()

In [7]:
annotations_df.head()

Unnamed: 0,index,id,NER_tag,row,offset,length
0,0,106650,B-m,12,0,1
1,1,106650,I-m,12,1,1
2,2,106650,B-m,23,0,1
3,3,106650,B-du,23,6,1
4,4,106650,I-du,23,7,1


In [8]:
annotations_df = annotations_df.drop(columns=["index", "length"])

In [9]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset
0,106650,B-m,12,0
1,106650,I-m,12,1
2,106650,B-m,23,0
3,106650,B-du,23,6
4,106650,I-du,23,7


In [10]:
entries_df = pd.DataFrame(columns=entries_cols)  # reset df
tmp_list = []

for doc_i, document in enumerate(e_corpus):

    # every document starts with a sentinel row with id/row/offset 0
    tmp_list.append([0, 0, 0, "-DOCSTART-"])

    # lines are numbered from 1, token offsets within a line from 0
    for line_number, text_line in enumerate(document, start=1):
        for token_offset, raw_token in enumerate(text_line.split(" ")):
            # strip trailing "." and drop tabs before deciding whether to keep it
            token = raw_token.rstrip(".").replace("\t", "")
            if not token or "|" in token:
                continue
            tmp_list.append([a_ids[doc_i], line_number, token_offset, token])

entries_df = pd.DataFrame(tmp_list, columns=entries_cols)

In [11]:
entries_df.head()

Unnamed: 0,id,row,offset,word
0,0,0,0,-DOCSTART-
1,106650,1,0,RECORD
2,106650,1,1,#106650
3,106650,2,0,912344838
4,106650,2,2,CDHMC


#### Join dataframes

In [12]:
# ensure correct dtypes
annotations_df[['id', 'row', 'offset']] = annotations_df[['id', 'row', 'offset']].apply(pd.to_numeric)
annotations_df['NER_tag'] = annotations_df["NER_tag"].astype(str)
entries_df[['id', 'row', 'offset']] = entries_df[['id', 'row', 'offset']].apply(pd.to_numeric)
entries_df["word"] = entries_df["word"].astype(str)

In [13]:
# Left join: every token row of entries_df is kept; tokens without an annotation
# at the same (id, row, offset) get NaN in NER_tag.
# NOTE(review): if annotations_df contains several rows for the same key, the
# token row is repeated in the result — confirm whether such duplicates exist.
result_df = pd.merge(entries_df, annotations_df, how="left", on=['id', 'row', 'offset'])

In [14]:
# replace NaNs with "O"
result_df = result_df.fillna("O")

In [15]:
result_df[:10]

Unnamed: 0,id,row,offset,word,NER_tag
0,0,0,0,-DOCSTART-,O
1,106650,1,0,RECORD,O
2,106650,1,1,#106650,O
3,106650,2,0,912344838,O
4,106650,2,2,CDHMC,O
5,106650,2,4,57611498,O
6,106650,2,7,246241,O
7,106650,2,9,1/1/1995,O
8,106650,2,10,12:00:00,O
9,106650,2,11,AM,O


#### POS tags

In [16]:
from nltk.chunk.regexp import RegexpChunkParser, ChunkRule, RegexpParser
from nltk.tree import Tree

In [17]:
# The whole word column (sentinel tokens included) is tagged as one sequence.
text = result_df["word"].tolist()
text_pos = nltk.pos_tag(text)
# keep only the tag of each (word, tag) pair
text_pos_list = [pos for _, pos in text_pos]

In [18]:
result_df["POS_tag"] = text_pos_list

#### Chunk tags

In [19]:
# Noun-phrase tag pattern: optional DT, any number of JJ* tags, one or more NN* tags
np_pattern = "<DT>?<JJ.*>*<NN.*>+"
rule_0 = ChunkRule(np_pattern, "More complete chunk NP sequences")

chunk_parser_np = RegexpChunkParser(rules=[rule_0], chunk_label='NP')

# parse the POS-tagged token sequence into a tree with NP subtrees
chunk_result_tree_np = chunk_parser_np.parse(text_pos)

In [20]:
# Flatten the chunk tree into one BIO tag per token: tokens outside any chunk get
# "O", the first token of a chunk "B-<label>", the remaining ones "I-<label>".
chunk_tag_np = []

for node in chunk_result_tree_np:
    if not isinstance(node, Tree):
        chunk_tag_np.append("O")
        continue
    chunk_label = node.label()
    chunk_tag_np.extend(
        ("B-" if position == 0 else "I-") + chunk_label
        for position in range(len(node))
    )

In [21]:
len(chunk_tag_np) == result_df.shape[0]  # check that chunk col has same length

True

In [22]:
# Tag pattern for the "VP" chunks: a single token tagged VBD, IN or ".".
# Raw string: "\." inside a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions; the pattern text is unchanged.
rule_1 = ChunkRule(r"<VBD|IN|\.>", "Verb phrases")

chunk_parser_vp = RegexpChunkParser([rule_1],chunk_label='VP')

chunk_result_tree_vp = chunk_parser_vp.parse(text_pos)

In [23]:
# Flatten the VP chunk tree into one BIO tag per token ("O" outside chunks,
# "B-<label>" for the first token of a chunk, "I-<label>" for the others).
chunk_tag_vp = []

for node in chunk_result_tree_vp:
    if not isinstance(node, Tree):
        chunk_tag_vp.append("O")
        continue
    chunk_label = node.label()
    chunk_tag_vp.extend(
        ("B-" if position == 0 else "I-") + chunk_label
        for position in range(len(node))
    )

In [24]:
len(chunk_tag_np) == result_df.shape[0] == len(chunk_tag_vp)

True

In [25]:
# Fill the gaps of the NP tagging: wherever the NP tag is "O", take the VP tag
# at the same position; existing NP tags are kept. The list is updated in place.
chunk_tag_np[:] = [
    chunk_tag_vp[position] if np_tag == "O" else np_tag
    for position, np_tag in enumerate(chunk_tag_np)
]

In [26]:
result_df["chunk_tag"] = chunk_tag_np

In [27]:
result_df.head()

Unnamed: 0,id,row,offset,word,NER_tag,POS_tag,chunk_tag
0,0,0,0,-DOCSTART-,O,NN,B-NP
1,106650,1,0,RECORD,O,NNP,I-NP
2,106650,1,1,#106650,O,VBZ,O
3,106650,2,0,912344838,O,CD,O
4,106650,2,2,CDHMC,O,NNP,B-NP


In [28]:
result_df = result_df[['word', 'id', 'row', 'offset', 'POS_tag', 'chunk_tag', 'NER_tag']]  # order columns

In [29]:
# Cast all output columns to str. Rebinding result_df to the converted frame
# (instead of assigning back into a frame that was produced by column selection)
# avoids pandas' SettingWithCopyWarning.
result_df = result_df[['word', 'id', 'row', 'offset', 'POS_tag', 'chunk_tag', 'NER_tag']].astype(str)
result_df.dtypes

word         object
id           object
row          object
offset       object
POS_tag      object
chunk_tag    object
NER_tag      object
dtype: object

In [30]:
# NOTE(review): reindex() is called without new labels, so this does not appear to
# renumber the rows; reset_index(drop=True) may have been intended — confirm.
result_df = result_df.reindex()

In [31]:
# Drop the join keys that are not written to the output file; 'row' is kept
# because the split cell uses it to detect sentence boundaries.
# Rebinding (instead of inplace=True on a frame obtained by column selection)
# avoids SettingWithCopyWarning and keeps the cell's effect explicit.
result_df = result_df.drop(columns=['id', 'offset'])

In [32]:
np.savetxt("data_blstm_crf.txt", result_df.values, fmt="%s")

In [38]:
devnum = 231240 # record 895892
testnum = 260540 # record 944118

# Split data_blstm_crf.txt by line index into train [0, devnum), dev
# [devnum, testnum) and test [testnum, end), inserting a blank line whenever the
# sentence id changes. Each data line reads "word row POS_tag chunk_tag NER_tag",
# so the 4th field from the end is the source row number used as sentence id.
#
# The previous version suppressed the leading blank line with hard-coded indices
# (235846 / 265278) that did not match devnum / testnum: the dev and test files
# started with a blank line and a sentence break at those indices was written one
# token late. Tracking the current output file removes the need for such indices.
with open('data_blstm_crf.txt') as f, open('train_blstm_crf.txt', 'w') as train, open('dev_blstm_crf.txt', 'w') as dev, open('test_blstm_crf.txt', 'w') as test:
    x = f.readlines()
    prev_line_id = None
    prev_target = None
    for i, l in enumerate(x):
        if i < devnum:
            target = train
        elif i < testnum:
            target = dev
        else:
            target = test

        line_id = l.split()[-4]
        # Separate sentences with a blank line, but never start a file with one.
        if target is prev_target and line_id != prev_line_id:
            target.write('\n')
        target.write(l)

        prev_line_id = line_id
        prev_target = target