# Annotation Pipeline for DataTurks .tsv

In [1]:
%reset -f

In [2]:
import pandas as pd
import re
import numpy as np
import random
import time
from seqeval.metrics import f1_score,classification_report
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

## Formatting Data to Sequences

### Contents:

1. Reading File and formatting as sequences
2. Formatting Entities to IOB Scheme
3. Padding Sequences
4. Mapping to Integer Ids
5. Formatting Data for Keras LSTM Model
6. Train Test Split
7. Specifying Model and Model Parameters
8. Training Model
9. Evaluating Model
10. Saving Model Results

### 1. Reading and Formatting File:

The file being used is the raw output of a data turks annotated tsv file.

More info Available:
https://dataturks.com/features/document-ner-annotation.php

In [3]:
def read_turks(file):
    with open(file) as f:
        lines = [i.rstrip().split("\t") for i in f.readlines()]
    return lines

In [4]:
file = "./Medical NER V2.tsv"
word_ents = read_turks(file)
word_ents[:20]

[['"In', 'O'],
 ['the', 'O'],
 ['Hospital3', 'GEO'],
 ['Emergency', 'GEO'],
 ['Room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['CPAP', 'O2 Saturation'],
 ['"', 'O'],
 [''],
 ['She', 'O'],
 ['was', 'O'],
 ['not', 'DOS'],
 ['able', 'DOS'],
 ['to', 'DOS']]

Some words (such as the first above) contained a number and quote before it so these are removed with the following function.

In [5]:
def clean_words(word_ents):
    '''removes quote and comma characters from'''
    new_word_ents = []
    for ents in word_ents:
        word = ents[0].lower()
        if word.find(',') > 0:
            word = word[word.find(',')+1:]
        word = word.replace('"','')
        ents[0] = word
        new_word_ents.append(ents)
    return new_word_ents

In [6]:
new_ents = clean_words(word_ents)
new_ents[:20]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'GEO'],
 ['emergency', 'GEO'],
 ['room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['cpap', 'O2 Saturation'],
 ['', 'O'],
 [''],
 ['she', 'O'],
 ['was', 'O'],
 ['not', 'DOS'],
 ['able', 'DOS'],
 ['to', 'DOS']]

Dataturks uses a blank line to seperate each sequence. This is why most csv/tsv readers cannot read the file. The following function will split each sequence when it finds a blank line in the tsv file.

In [7]:
def create_seqs(word_ents):
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents)>1:
            if len(ents[0])>0:
                seq.append(ents)
        else:
            seqs.append(seq)
            seq=[]
    return seqs

In [8]:
seqs = create_seqs(new_ents)
seqs[:1]

[[['in', 'O'],
  ['the', 'O'],
  ['hospital3', 'GEO'],
  ['emergency', 'GEO'],
  ['room', 'GEO'],
  [',', 'O'],
  ['her', 'O'],
  ['oxygen', 'O'],
  ['saturation', 'O'],
  ['was', 'O'],
  ['100%', 'O2 Saturation'],
  ['on', 'O2 Saturation'],
  ['cpap', 'O2 Saturation']]]

### 2. Formatting Entities to IOB (Inside,Outside, Beginning) Scheme 

This scheme adds more context to the tags and allows annotations to make more sense.

In [9]:
def clean_tags(word_ents):
    '''adds IOB scheme to tags'''
    new_ents = []
    for i in range(0,len(word_ents)):
        if word_ents[i][1] == "O":
            tag = word_ents[i][1]
        else:
            if not i:
                tag = "B-"+word_ents[i][1]
            else:
                if (word_ents[i][1] != word_ents[i-1][1]):
                    tag = "B-"+word_ents[i][1]
                else:
                    tag = "I-"+word_ents[i][1]

        new_ents.append([word_ents[i][0],tag])
    return new_ents

In [10]:
cleaned_tag_seqs = [clean_tags(ents) for ents in seqs]
cleaned_tag_seqs[0]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'B-GEO'],
 ['emergency', 'I-GEO'],
 ['room', 'I-GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100%', 'B-O2 Saturation'],
 ['on', 'I-O2 Saturation'],
 ['cpap', 'I-O2 Saturation']]

### 3. Padding Sequences to a Specified Length

In order to be usable by the LSTM model, each sequence needs to be padded/truncated to the same length. Here 50 is chosen somewhat arbitraily but is around the 97th percentile of sequence lengths.

In [11]:
def pad_seq(seq,max_len):
    padded_seq = seq+[["<PAD>","O"]]*max_len
    return padded_seq[:max_len]
    
def pad_sequences(sequences,max_len=None):
    if max_len == None:
        max_len = max(len(seq) for seq in sequences)
    return [pad_seq(seq,max_len) for seq in sequences]

In [12]:
max_len = 50
padded_seqs = pad_sequences(cleaned_tag_seqs,max_len)
padded_seqs[0]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'B-GEO'],
 ['emergency', 'I-GEO'],
 ['room', 'I-GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100%', 'B-O2 Saturation'],
 ['on', 'I-O2 Saturation'],
 ['cpap', 'I-O2 Saturation'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O']]

### 4. Mapping Words to Integer Values for Model Training

The model can't use words so each one is mapped to a particular index.

In [13]:
def get_word_ids(sentances,tag=False):
    words = []
    for sentance in sentances:
        words += list([word[tag] for word in sentance])
    word_dict = {word:i for i,word in enumerate(set(words))}
    return word_dict

In [14]:
word_ids = get_word_ids(padded_seqs)
tag_ids = get_word_ids(padded_seqs,tag=True)
print(list(word_ids.items())[:4])
print(list(tag_ids.items())[:4])

[('cm2', 0), ('id', 1), ('atrium', 2), ('256', 3)]
[('B-Route', 0), ('I-Test / Screening', 1), ('B-O2 Saturation', 2), ('I-O2 Saturation', 3)]


In [15]:
def words_to_ids(sentances,word_ids,tag_ids):
    vector = []
    for sentance in sentances:
        vector.append(list([[word_ids[w[0]],tag_ids[w[1]]] for w in sentance]))
    return np.array(vector)

Now the words are given a numeric representation which can be mapped back to the original words.

In [16]:
vectors = words_to_ids(padded_seqs,word_ids,tag_ids)
print(vectors[0][:4])
print('')
print("Word Representation:")
print(padded_seqs[0][:4])

[[ 871    8]
 [ 236    8]
 [1193   10]
 [ 460   11]]

Word Representation:
[['in', 'O'], ['the', 'O'], ['hospital3', 'B-GEO'], ['emergency', 'I-GEO']]


Now we can label our features (x) and labels (y) for training.

In [17]:
def create_x_y(matrix,n_tags):
    x = []
    y = []
    for sequences in matrix:
        xi = [i[0] for i in sequences]
        yi = [i[1] for i in sequences]
        x.append(xi)
        y.append(yi)
    y = np.array([to_categorical(i,n_tags) for i in y])
    return np.array(x),np.array(y)

In [18]:
n_tags = len(tag_ids)
x,y = create_x_y(vectors,n_tags)
print("X-shape:",x.shape)
print(x[0][:5])
print('')
print("Y-shape:",y.shape)
print(y[0][:5])

X-shape: (1008, 50)
[ 871  236 1193  460  751]

Y-shape: (1008, 50, 48)
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


### 5. Train Test Split 

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### 6. Specifying Model and Model Parameters

https://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/

#### Use Word2Vec Embedding Trained on Entire Corpus:

In [20]:
def create_weight_matrix(word_ids,embeddings_index):
    embedding_matrix = np.zeros((len(word_ids),100))
    count = 0
    oov_words = []
    for word,idx in word_ids.items():
        if word in embeddings_index:
            embedding_matrix[idx] = embeddings_index[word]
        else:
            oov_words.append(word)
    return embedding_matrix,oov_words    

custom_emb = Word2Vec.load(".word2vec.model")
embed_matrix,oov_words = create_weight_matrix(word_ids,custom_emb);

FileNotFoundError: [Errno 2] No such file or directory: '.word2vec.model'

In [21]:
# OOV Words
print(f"Percent OOV: {len(oov_words)/len(word_ids)*100}%")
print(oov_words[:30])

NameError: name 'oov_words' is not defined

Because of how some words are split there are still some words out of vocabulary even though the Word2Vec embeddings were trained on the full  version of the same corpus. Each of these OOV words will simply have 0 weights in the matrix. The 0.9% of OOV words is a huge improvement over what was seen using Glove embeddings which saw 13% of the vocab being OOV words. 

In [None]:
def create_BiLSTM(n_words,n_tags,embedding_size,max_len,embed_weights):
    model = Sequential()
    model.add(Embedding(n_words,
                        embedding_size,
                        weights=[embed_weights],
                        trainable=False,
                        input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.2)))
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
    return model

In [None]:
embedding_size = 100
n_words = len(word_ids)
n_tags = len(tag_ids)

model = create_BiLSTM(n_words,n_tags,embedding_size,max_len,embed_matrix)
model.summary()

### 7. Training Model

In [None]:
def train_model(model,x_train,y_train,batch_size=32,epochs=20,val_split = 0.1):

    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.0001,
                               patience=3,
                               mode='min',
                               verbose=1)
    
    
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(x_train, y_train, 
                        batch_size=32, 
                        epochs=epochs, 
                        validation_split=val_split, 
                        verbose=1,
                        callbacks=[early_stop]
                       )
    return history

In [None]:
epochs = 50
batch_size = 32
history = train_model(model,x_train,y_train,batch_size=batch_size,epochs=epochs,val_split = 0.1)

#### Sample Prediction:

In [None]:
def get_id_mappings(ids):
    return {str(i[1]):i[0] for i in ids.items()}

def generate_sample(x,y,model):
    idx = random.randint(0,len(x))
    sample = x[idx]
    label = np.argmax(y[idx],axis=1)

    p = model.predict(sample.reshape(1,-1))
    p = np.argmax(p,axis=-1)
    print("{:25} {:20}: {:10}".format("Word", "True", "Pred"))
    print("-"*50)
    for i in range(len(sample)):
        word = str(sample[i])
        pred = str(p[0][i])
        true_val = str(label[i])
        id_to_words = get_id_mappings(word_ids)
        id_to_tags = get_id_mappings(tag_ids)
        print(f"{id_to_words[word]:25}{id_to_tags[true_val]:20}{id_to_tags[pred]}")
    return

In [None]:
generate_sample(x,y,model)

The model gives reasonable predictions that almost always make sense intuitively.

### 8. Evaluating Model Performance

In [None]:
def transform_ids_to_tags(preds,tag_ids):
    id_to_tags = get_id_mappings(tag_ids)

    tag_seqs = []
    for seq in preds:
        tag_seqs.append([id_to_tags[str(i)] for i in seq])
    return tag_seqs

def get_real_labels(model,x_test,y_test,tag_ids):
    test_preds = np.argmax(model.predict(x_test),axis=-1)
    true_vals = np.argmax(y_test,axis=-1)
    test_preds = transform_ids_to_tags(test_preds,tag_ids)
    true_vals = transform_ids_to_tags(true_vals,tag_ids)
    return true_vals,test_preds

In [None]:
true_vals,test_preds = get_real_labels(model,x_test,y_test,tag_ids)
report = classification_report(true_vals,test_preds)
print(report)

The model still struggles with rarer classes such as frequency, age and gender but does very well determining tags sycg as DRUG and LOCATION.

In [None]:
model_f1 = f1_score(true_vals,test_preds)
print("F1-Score:",model_f1)

### 10. Saving Model Results

In order to track progression its good to document each model iteration as well as keep note of important changes in the model.

In [None]:
n_samples = len(x)
model_desc = f"BiLSTM-Word2Vec-EmbedSize-{embedding_size}"
results_file = "./nlp_data/model_results.csv"
note = '''Used Custom Word2Vec Embeddings of entire Discharge Summary Corpus'''
def append_model_results(model_f1,n_samples,model_desc,file,note):
    with open(file,'a') as f:
        results = f"\n{model_f1},{n_samples},{model_desc},{time.ctime()},{note}"
        f.writelines(results)
    print("~~~Results Successfully Saved")
    return

In [None]:
append_model_results(model_f1,n_samples,model_desc,results_file,note)

In [None]:
results_df = pd.read_csv(results_file)
results_df.tail(5)

---