Notebook for training a NER model on the i2b2 2010 dataset


In [1]:
import numpy as np
import os
import tensorflow as tf
import string
import random
import math
import sys

from ner_model import NerModel
from dataset_encoder import DatasetEncoder
from ner_model_saver import NerModelSaver

In [2]:
# Input locations. The defaults are the original local paths; set the
# I2B2_EMBEDDINGS_FILE / I2B2_DATA_DIR environment variables to point the
# notebook at other locations without editing this cell.
embeddings_file = os.environ.get(
    'I2B2_EMBEDDINGS_FILE',
    '/home/saif/Downloads/PubMed-shuffle-win-2.bin')
# Later cells build sub-paths by plain string concatenation onto i2b2_folder,
# so make sure it always ends with a path separator
# (os.path.join(path, '') appends one only when it is missing).
i2b2_folder = os.path.join(
    os.environ.get('I2B2_DATA_DIR', '/home/saif/Downloads/i2b2/'),
    '')

In [3]:
# Scratch inspection: build a TensorFlow ConfigProto with device-placement
# logging and soft placement enabled, then display its serialized protobuf
# as a list of byte values. config_proto is not referenced by any later
# visible cell.
config_proto = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
list(config_proto.SerializeToString())

[56, 1, 64, 1]

In [4]:
# Scratch inspection: display the contents of the ASSET_FILEPATHS graph collection.
tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)

[]

In [7]:
def read_texts(file):
    """Lazily yield one token list per line of a UTF-8 text file.

    Each line is stripped of surrounding whitespace and then split on single
    space characters, so consecutive spaces produce empty-string tokens and
    an empty line produces ``['']``.
    """
    with open(file, encoding="utf-8") as handle:
        yield from (raw.strip().split(' ') for raw in handle)

def read_concepts(file):
    """Yield ``(line, start_token, end_token, tag)`` for each annotation in ``file``.

    Every non-blank line is expected to have the layout::

        <concept text> 12:3 12:5||t="problem"

    that is, the part before ``||`` ends with two ``line:token`` positions
    (start and end) and the part after it is ``t="<tag>"``. Blank lines are
    skipped.

    Raises:
        AssertionError: if start and end are on different lines, or the key on
            the right-hand side is not ``t``.
        ValueError / IndexError: if a non-blank line does not match the layout.
    """
    # Read as UTF-8, like read_texts, instead of the platform default encoding.
    with open(file, encoding="utf-8") as f:
        for raw in f:
            record = raw.strip()
            if not record:
                # Blank (e.g. trailing) lines carry no annotation.
                continue

            left, right = record.split('||')
            # The concept text may itself contain spaces; the two positions
            # are always the last two space-separated fields.
            fields = left.split(' ')
            start, end = fields[-2], fields[-1]

            start_line, start_token = [int(x) for x in start.split(':')]
            end_line, end_token = [int(x) for x in end.split(':')]
            assert start_line == end_line, "concept spans several lines: " + record

            t, tag = right.split('=')
            assert t == 't', "unexpected annotation key: " + record
            tag = tag.strip('"')

            yield (start_line, start_token, end_token, tag)
            

def make_annotated_sentences(sentences, annotations):
    """Yield each sentence as a list of ``(word, tag)`` pairs with BIO tags.

    ``annotations`` is an iterable of ``(line, start_token, end_token, tag)``
    where ``line`` counts sentences from 1 and the token positions are
    0-based and inclusive. The first token of a span is tagged ``B-<tag>``,
    the remaining ones ``I-<tag>``; every token outside a span gets ``"O"``.
    """
    # Map (line, token position) -> BIO tag for every annotated token.
    bio_by_position = {}
    for line_no, first, last, label in annotations:
        for position in range(first, last + 1):
            prefix = "B-" if position == first else "I-"
            bio_by_position[(line_no, position)] = prefix + label

    # Sentences are numbered from 1 to match the annotation line numbers.
    for line_no, sentence in enumerate(sentences, start=1):
        yield [
            (word, bio_by_position.get((line_no, position), "O"))
            for position, word in enumerate(sentence)
        ]


def read_i2b2_dataset(folders):
    """Yield annotated sentences from one or more dataset folders.

    Each folder must contain a ``txt`` sub-folder with ``<name>.txt`` files
    and a ``concept`` sub-folder with the matching ``<name>.con`` files.
    Every yielded sentence is a list of ``(word, tag)`` pairs as produced by
    ``make_annotated_sentences``.

    Folders may be given with or without a trailing path separator. Files are
    visited in sorted name order so repeated runs yield the sentences in the
    same order (``os.listdir`` itself returns entries in arbitrary order).
    """
    for folder in folders:
        text_folder = os.path.join(folder, "txt")
        concept_folder = os.path.join(folder, "concept")

        for file_name in sorted(os.listdir(text_folder)):
            if not file_name.endswith(".txt"):
                continue

            # Same base name, different extension, in the sibling folder.
            stem = file_name[:-4]
            text_file = os.path.join(text_folder, stem + ".txt")
            concept_file = os.path.join(concept_folder, stem + ".con")

            sentences = read_texts(text_file)
            # Annotations are materialized because they are indexed by position,
            # while the sentences can stay a lazy iterator.
            annotations = list(read_concepts(concept_file))

            for sentence in make_annotated_sentences(sentences, annotations):
                yield sentence

   

In [8]:
import gensim

# Word Embeddings
# Load the binary word2vec file, reading at most 1,000,000 vectors.
model = gensim.models.KeyedVectors.load_word2vec_format(
    embeddings_file, binary=True, limit=1000000)

import collections

# Flag read by get_normalized_or_normal: when True, tokens are looked up in
# normalized form and the returned vectors are L2-normalized.
normalize_tokens_for_embeddings = False
#words = collections.OrderedDict({DatasetEncoder.normalize(w):w for w in model.vocab})
# Identity mapping key -> model word (the disabled line above keyed it by normalized form).
words = collections.OrderedDict((w, w) for w in model.vocab)

vocab = list(words)
# Ids start at 1; id 0 is added later for the out-of-vocabulary entry.
id2word = collections.OrderedDict(enumerate(vocab, start=1))
word2id = collections.OrderedDict((w, i) for i, w in id2word.items())

def get_normalized_or_normal(target):
    """Return the embedding vector for ``target`` from the global ``model``.

    When ``normalize_tokens_for_embeddings`` is false, the vector from
    ``model.get_vector(target)`` is returned as is. Otherwise the vector is
    looked up under ``DatasetEncoder.normalize(target)``, falling back to
    ``target`` itself when that key is missing, and an L2-normalized copy is
    returned.

    Raises:
        KeyError: if the lookup fails (in normalizing mode: for both forms).
    """
    if not normalize_tokens_for_embeddings:
        return model.get_vector(target)

    try:
        v = model.get_vector(DatasetEncoder.normalize(target))
    except KeyError:
        v = model.get_vector(target)

    # Divide out of place: `v /= ...` would write into the array returned by
    # get_vector, which may be the model's own storage (or a read-only array,
    # depending on the gensim version).
    return v / np.linalg.norm(v, 2)

# Row 0 is an all-zero vector for the out-of-vocabulary id; rows 1..N follow id2word.
# The zero row is sized from the loaded model instead of a hard-coded 200 so the
# rows stay the same width when a different embeddings file is used.
embeddings = [[0] * model.vector_size] + [get_normalized_or_normal(words[id2word[i]]) for i in range(1, len(words) + 1)]

# Add word out of the vocabulary
word2id['__oov__'] = 0
id2word[0] = '__oov__'
words['__oov__'] = '__oov__'

# i2b2 reading
# Training data comes from the "beth" and "partners" sub-folders.
train_dataset_folder = i2b2_folder + 'concept_assertion_relation_training_data/'
sentences = read_i2b2_dataset([train_dataset_folder + "beth/", train_dataset_folder + "partners/"])
train_dataset = list(sentences)

# The reference standard for the test data is used as the validation set.
valid_dataset_folder = i2b2_folder + 'reference_standard_for_test_data/'
sentences = read_i2b2_dataset([valid_dataset_folder])
valid_dataset = list(sentences)

In [9]:
import numpy as np
# Scratch check: display the L2-normalized embedding of the word "with".
# The division is out of place, so the looked-up vector itself is not modified.
v = model.get_vector("with")
v / np.linalg.norm(v, 2) 

array([-0.00320693,  0.00167004, -0.09126581, -0.11574854, -0.04394112,
       -0.07961337, -0.13876739,  0.03070446,  0.05947306, -0.01522299,
       -0.09660824,  0.06576782, -0.22819473, -0.01563095, -0.03132185,
       -0.05822439, -0.08672199,  0.1991438 , -0.05447187,  0.1072481 ,
       -0.12158737, -0.04751258,  0.06938139,  0.01554571, -0.07477523,
        0.05796184, -0.14733596,  0.10301121,  0.18611129,  0.14711392,
       -0.02997275, -0.01465039, -0.06597033,  0.03484017,  0.10930625,
       -0.12020653,  0.0046996 ,  0.12969127,  0.05813777,  0.07814306,
       -0.04783545,  0.1214288 , -0.01741104, -0.10013006,  0.05751835,
       -0.02224303,  0.10574778, -0.09843226,  0.07615267,  0.0214475 ,
        0.0073724 ,  0.04157292,  0.04980931,  0.03333236, -0.06057598,
        0.01574951,  0.06154851,  0.04370131, -0.05727746, -0.00469313,
        0.0741053 , -0.09775556, -0.0806613 ,  0.06985603,  0.02253323,
        0.029452  ,  0.02044853, -0.02627305, -0.02689816,  0.07

In [10]:
# Collect every distinct tag (second element of each pair) in the training data.
tags = set()

for sentence in train_dataset:
    tags.update(item[1] for item in sentence)

print(tags)

{'B-treatment', 'B-test', 'I-test', 'B-problem', 'O', 'I-problem', 'I-treatment'}


In [11]:
# Build the dataset encoder from the word -> id mapping and the embedding rows,
# then materialize the encoded training and validation sets as lists.
encoder = DatasetEncoder(word2id, embeddings)
train = list(encoder.encode(train_dataset))
valid = list(encoder.encode(valid_dataset))

In [12]:
def words_in_embeddings(dataset):
    """Count word ids equal to 0 versus all other word ids in ``dataset``.

    ``dataset`` is an iterable of items exposing a ``"word_ids"`` sequence.
    Returns the pair ``(zero, other)``.
    """
    missing = 0
    total = 0
    for encoded in dataset:
        for word_id in encoded["word_ids"]:
            total += 1
            if word_id == 0:
                missing += 1

    return (missing, total - missing)

# Fraction of validation tokens whose word id is 0, the id assigned to '__oov__'
# above, i.e. the share of tokens WITHOUT a pretrained embedding.
# Note: raises ZeroDivisionError if `valid` contains no tokens.
(zero, other) = words_in_embeddings(valid)
print('words without embeddings coverage: {}'.format(zero / (zero + other)))

words without embeddings coverage: 0.05923922396055457


In [13]:
# Assemble the NER model step by step: character-level CNN representation,
# pretrained word embeddings, context representation, inference layer and
# training op, then initialize the variables.
# NOTE(review): the meaning of the positional arguments (200, 8, 200, False, 5.0)
# is defined in ner_model.NerModel, which is not shown here. 200 presumably
# matches the embedding width used when building `embeddings` — confirm before
# changing any of them.
ner = NerModel()

ner.add_cnn_char_repr(dim=25, nfilters=30)
ner.add_pretrained_word_embeddings(200)
ner.add_context_repr(8, 200)
ner.add_inference_layer(False)
ner.add_training_op(5.0)

ner.init_variables()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use the `axis` argument instead
Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
# Train for 110 epochs, one epoch per ner.train call (epoch_start=i, epoch_end=i+1),
# so that a checkpoint can be written after every 10th epoch.
for i in range(0, 110):
    ner.train(train, 
          valid, 
          lr = 0.2,
          po = 0.05,
          batch_size = 180,
          dropout = 0.6,
          epoch_start = i, 
          epoch_end = i + 1
    )
    
    # Checkpoint names carry the 0-based epoch index: ..._9, ..._19, ..., ..._109.
    if (i + 1) % 10 == 0:
        saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)
        saver.save('i2b2_model_non-normalized-drop_{}'.format(i))

In [None]:
# Display the `name` attribute of the model's predicted_labels object.
ner.predicted_labels.name

In [None]:
# Disabled saving snippet kept as a bare string literal: none of the code
# inside the string is executed.
"""
saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)
saver.save('i2b2_model')

saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)
saver.save2('i2b2_asd')
"""

In [None]:
#tf.saved_model.loader.load(export_dir="i2b2_ss_model", tags=['serve'], sess=ner.session)

In [12]:
# Restore a saved TensorFlow state into the model's session, then report
# precision / recall / F1 on the training and validation sets.
# NOTE(review): the checkpoint name 'i2b2_model_normalized_109' does not match
# the 'i2b2_model_non-normalized-drop_{}' names written by the training cell —
# confirm this is the intended checkpoint.
NerModelSaver.restore_tensorflow_state(ner.session, 'i2b2_model_normalized_109')

prec, rec, f1 = ner.measure(train)    
print("train metrics: prec: {}, rec: {}, f1: {}".format(prec, rec, f1))

prec, rec, f1 = ner.measure(valid)    
print("valid metrics: prec: {}, rec: {}, f1: {}".format(prec, rec, f1))

train metrics: prec: 0.9356028451833855, rec: 0.919586857701824, f1: 0.9275257178508727
valid metrics: prec: 0.8451262784387393, rec: 0.8121615157920723, f1: 0.8283160495381372


In [13]:
# Display the `name` attribute of the model's char_ids object.
ner.char_ids.name

'char_repr/char_ids:0'

In [None]:
# converts tags in format BIO: B-"tag", I-"tag" to list with (begin, end, tag) tags
def bio2be(source, tuples = False):
    """Convert per-token BIO tags into a flat list of spans.

    Args:
        source: sequence of sentences, each a sequence of tag strings such as
            ``"B-problem"``, ``"I-problem"`` or ``"O"``.
        tuples: when true every span is a tuple, otherwise a (mutable) list.

    Returns:
        A list of ``[sentence_index, start, end, tag, '', '']`` items, where
        ``start`` and ``end`` are inclusive token positions and the two
        trailing empty strings are placeholders for the caller to fill in.

    A span is closed by ``"O"``, by any ``B-`` tag, by an ``I-`` tag of a
    different type than the open span, or by the end of the sentence. An
    ``I-`` tag with no open span (or following a span of another type) starts
    a new span, so a sequence like ``B-problem I-test`` yields two spans
    instead of one ``problem`` span that swallows the ``test`` token.
    """
    result = []

    def emit(sentence_index, start, end, label):
        # Append one finished span in the requested container type.
        item = [sentence_index, start, end, label, '', '']
        result.append(tuple(item) if tuples else item)

    for i, sentence in enumerate(source):
        last_start = None
        last_tag = None

        for j, tag in enumerate(sentence):
            begins = tag.startswith("B-")
            inside = tag.startswith("I-")
            type_changed = inside and tag[2:] != last_tag

            if last_tag and (begins or tag == "O" or type_changed):
                # close last tag
                emit(i, last_start, j - 1, last_tag)
                last_tag = None
                last_start = None

            if begins or (inside and last_tag is None):
                last_tag = tag[2:]
                last_start = j

        if last_tag:
            # close last tag in sentence
            emit(i, last_start, len(sentence) - 1, last_tag)

    return result

def decode_tags(id2tag, tag_ids):
    """Translate nested tag ids into tag names.

    ``tag_ids`` is a sequence of sentences, each a sequence of ids; every id
    is looked up in ``id2tag``. Returns a list of lists with the same shape.
    """
    return [
        [id2tag[row[position]] for position in range(len(row))]
        for row in (tag_ids[index] for index in range(len(tag_ids)))
    ]
    

In [None]:
import re
def normalize_line(line):
    """Replace every character that is not a word character, whitespace or
    ``$`` with a space, then strip surrounding whitespace."""
    cleaned = re.sub(r'[^\w\s$]', ' ', line)
    return cleaned.strip()

def read_test_dataset(file='benefit-summary.txt'):
    """Read ``file`` and return one list of ``(word, "unknown")`` pairs per line.

    Each line is cleaned with ``normalize_line`` and split on whitespace; the
    constant ``"unknown"`` fills the tag slot of every pair.
    """
    with open(file) as handle:
        normalized = [normalize_line(raw) for raw in handle]

    sentences = []
    for text in normalized:
        sentences.append([(token.strip(), "unknown") for token in text.split()])
    return sentences

def read_test_lines(target):
    """Turn an iterable of text lines into lists of ``(word, "unknown")`` pairs.

    Each line is cleaned with ``normalize_line`` and split on whitespace; the
    constant ``"unknown"`` fills the tag slot of every pair.
    """
    normalized = [normalize_line(raw) for raw in target]

    sentences = []
    for text in normalized:
        sentences.append([(token.strip(), "unknown") for token in text.split()])
    return sentences


def save_dataset(dataset, file):
    """Write ``dataset`` to ``file``, one sentence per line.

    ``dataset`` is an iterable of sentences, each an iterable of
    ``(word, tag)`` pairs; only the words are written, joined by single
    spaces, and every sentence is terminated by a newline.
    """
    with open(file, 'w') as out:
        out.writelines(
            ' '.join(word for word, _ in sentence) + '\n'
            for sentence in dataset
        )

def save_prediction(prediction, file):
    """Write spans to ``file`` as tab-separated rows preceded by a header row.

    Each item of ``prediction`` must be indexable at positions 0..5, which
    are written under the columns line, start, end, tag, text and sentence.
    """
    row_format = '{}\t{}\t{}\t{}\t{}\t{}\n'
    header = ('line', 'start', 'end', 'tag', 'text', 'sentence')

    with open(file, 'w') as out:
        out.write(row_format.format(*header))
        for item in prediction:
            out.write(row_format.format(*(item[k] for k in range(6))))

def add_text_for_tags(predictions, dataset):
    """Fill in the text fields of each span in ``predictions``, in place.

    For a span ``[line, start, end, ...]`` position 4 receives the words
    ``start..end`` (inclusive) of ``dataset[line]['words']`` joined by spaces
    and position 5 receives the whole sentence joined by spaces. The spans
    must therefore be mutable (lists, not tuples).
    """
    for entry in predictions:
        sentence_index, first, last = entry[0], entry[1], entry[2]
        sentence_words = dataset[sentence_index]['words']

        entry[4] = ' '.join(sentence_words[first:last + 1])
        entry[5] = ' '.join(sentence_words)

In [None]:
# Inference demo: tag a single hand-written sentence and save the result.
#text_dataset = read_test_dataset()
text_dataset = read_test_lines([
    "With regard to the patient 's chronic obstructive pulmonary disease , the patient 's respiratory status improved throughout the remainder of her hospital course ."
])
# NOTE(review): the meaning of the second argument (True) is defined in
# DatasetEncoder.encode, which is not shown here.
dataset = list(encoder.encode(text_dataset, True))
print(len(dataset[0]['char_ids']))

# NOTE(review): the positional arguments 1 and 0.7 are presumably batch size
# and a dropout-related value — confirm against NerModel.predict.
predicted = ner.predict(dataset, 1, 0.7)   
print(predicted)
# Invert the encoder's tag -> id mapping to decode predicted ids back to tag names.
id2tag = {tag_id:tag for tag, tag_id in encoder.tag2id.items()}
print(id2tag)
# Convert per-token BIO tags to spans, then fill in the span text and sentence text in place.
tags_predicted = list(bio2be(decode_tags(id2tag, predicted)))
add_text_for_tags(tags_predicted, dataset)

# Write the cleaned input text and the tab-separated predictions to the working directory.
save_dataset(text_dataset, 'clean_data.txt')
save_prediction(tags_predicted, 'prediction_09.csv')