# Annotation Pipeline for DataTurks .tsv

In [300]:
%reset -f

In [301]:
import pandas as pd
import re
import numpy as np
import random
import time
from seqeval.metrics import f1_score,classification_report
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

## Formatting Data to Sequences

### Contents:

1. Reading File and formatting as sequences
2. Formatting Entities to IOB Scheme
3. Padding Sequences
4. Mapping to Integer Ids
5. Formatting Data for Keras LSTM Model
6. Train Test Split
7. Specifying Model and Model Parameters
8. Training Model
9. Evaluating Model
10. Saving Model Results

### 1. Reading and Formatting File:

The file being used is the raw output of a data turks annotated tsv file.

More info Available:
https://dataturks.com/features/document-ner-annotation.php

In [302]:
def read_turks(file):
    with open(file) as f:
        lines = [i.rstrip().split("\t") for i in f.readlines()]
    return lines

In [303]:
file = "./data/Medical NER V2 2000.tsv"
word_ents = read_turks(file)
word_ents[:20]

[['"In', 'O'],
 ['the', 'O'],
 ['Hospital3', 'GEO'],
 ['Emergency', 'GEO'],
 ['Room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['CPAP', 'O2 Saturation'],
 ['"', 'O'],
 [''],
 ['She', 'O'],
 ['was', 'O'],
 ['not', 'DOS'],
 ['able', 'DOS'],
 ['to', 'DOS']]

Some words (such as the first above) contained a number and quote before it so these are removed with the following function.

In [304]:
def clean_words(word_ents):
    '''removes quote and comma characters from'''
    new_word_ents = []
    for ents in word_ents:
        word = ents[0].lower()
        word = word.replace('"','')
        ents[0] = word
        new_word_ents.append(ents)
    return new_word_ents

In [305]:
new_ents = clean_words(word_ents)
new_ents[:20]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'GEO'],
 ['emergency', 'GEO'],
 ['room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['cpap', 'O2 Saturation'],
 ['', 'O'],
 [''],
 ['she', 'O'],
 ['was', 'O'],
 ['not', 'DOS'],
 ['able', 'DOS'],
 ['to', 'DOS']]

Dataturks uses a blank line to seperate each sequence. This is why most csv/tsv readers cannot read the file. The following function will split each sequence when it finds a blank line in the tsv file.

In [306]:
def create_seqs(word_ents):
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents)>1:
            if len(ents[0])>0:
                if ents[0][-1] == ".":
                    seq.append([ents[0][:-1],ents[1]])
                else:
                    seq.append(ents)
        else:
            seqs.append(seq)
            seq=[]
    return seqs

In [307]:
seqs = create_seqs(new_ents)
seqs[:1]

[[['in', 'O'],
  ['the', 'O'],
  ['hospital3', 'GEO'],
  ['emergency', 'GEO'],
  ['room', 'GEO'],
  [',', 'O'],
  ['her', 'O'],
  ['oxygen', 'O'],
  ['saturation', 'O'],
  ['was', 'O'],
  ['100%', 'O2 Saturation'],
  ['on', 'O2 Saturation'],
  ['cpap', 'O2 Saturation']]]

### Expand Special Characters

In [308]:
def expand_word(ent):
    '''Splits at specified special characters keeping special characters as their own value'''
    words = re.split('([/\-\%><])',ent[0])
    return [[i,ent[1]] for i in words if len(i)>0]

def expand_special_chars(seqs):
    '''Expands special characters in words into seperate words while '''
    new_seqs = []
    for seq in seqs:
        new_seq = []
        for word in seq:
            new_seq += expand_word(word)
        new_seqs.append(new_seq)
    return new_seqs

In [309]:
ex_seqs = expand_special_chars(seqs)
ex_seqs[0]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'GEO'],
 ['emergency', 'GEO'],
 ['room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100', 'O2 Saturation'],
 ['%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['cpap', 'O2 Saturation']]

In [310]:
def encode_numerics(seq):
    '''Add encodings for common number types'''
    enc_seq = []
    for ent in seq:
        enc = ent[0].strip()
        if re.match("^\d$",ent[0]) != None:
            enc = "<1DigitNum>"
        elif re.match("^\d\d$",ent[0]) != None:
            enc = "<2DigitNum>"
        elif re.match("^\d\d\d$",ent[0]) !=None:
            enc = "<3DigitNum>"
        elif re.match("^\d{4}$",ent[0]) != None:
            enc = "4DigitNum"
        elif re.match("^\d*\.\d*$",ent[0]) != None:
            enc = "<DecimalNum>"
        elif re.match("^\d+,\d+$",ent[0]) != None:
            enc = "<CommaNum>"
        elif re.match("^\d+'?s$",ent[0]) !=None:
            enc = "<RangeNum>"
            
        enc_seq.append([enc,ent[1]])
    return enc_seq

In [311]:
enc_seqs = [encode_numerics(seq) for seq in ex_seqs]
enc_seqs[0]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'GEO'],
 ['emergency', 'GEO'],
 ['room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['<3DigitNum>', 'O2 Saturation'],
 ['%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['cpap', 'O2 Saturation']]

### 2. Formatting Entities to IOB (Inside,Outside, Beginning) Scheme 

This scheme adds more context to the tags and allows annotations to make more sense.

In [312]:
def clean_tags(word_ents):
    '''adds IOB scheme to tags'''
    new_ents = []
    for i in range(0,len(word_ents)):
        if word_ents[i][1] == "O":
            tag = word_ents[i][1]
        else:
            if not i:
                tag = "B-"+word_ents[i][1]
            else:
                if (word_ents[i][1] != word_ents[i-1][1]):
                    tag = "B-"+word_ents[i][1]
                else:
                    tag = "I-"+word_ents[i][1]

        new_ents.append([word_ents[i][0],tag])
    return new_ents

In [313]:
cleaned_tag_seqs = [clean_tags(ents) for ents in enc_seqs]
cleaned_tag_seqs[0]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'B-GEO'],
 ['emergency', 'I-GEO'],
 ['room', 'I-GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['<3DigitNum>', 'B-O2 Saturation'],
 ['%', 'I-O2 Saturation'],
 ['on', 'I-O2 Saturation'],
 ['cpap', 'I-O2 Saturation']]

### 3. Padding Sequences to a Specified Length

In order to be usable by the LSTM model, each sequence needs to be padded/truncated to the same length. Here 50 is chosen somewhat arbitraily but is around the 97th percentile of sequence lengths.

In [314]:
def pad_seq(seq,max_len):
    padded_seq = seq+[["<PAD>","O"]]*max_len
    return padded_seq[:max_len]
    
def pad_sequences(sequences,max_len=None):
    if max_len == None:
        max_len = max(len(seq) for seq in sequences)
    return [pad_seq(seq,max_len) for seq in sequences]

In [315]:
max_len = 50
padded_seqs = pad_sequences(enc_seqs,max_len)
padded_seqs[0]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'GEO'],
 ['emergency', 'GEO'],
 ['room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['<3DigitNum>', 'O2 Saturation'],
 ['%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['cpap', 'O2 Saturation'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O']]

### 4. Mapping Words to Integer Values for Model Training

The model can't use words so each one is mapped to a particular index.

In [316]:
def get_word_ids(sentances,tag=False):
    words = []
    for sentance in sentances:
        words += list([word[tag] for word in sentance])
    word_dict = {word:i for i,word in enumerate(set(words))}
    return word_dict

In [317]:
word_ids = get_word_ids(padded_seqs)
tag_ids = get_word_ids(padded_seqs,tag=True)
print(list(word_ids.items())[:4])
print(list(tag_ids.items())[:4])

[('bilious', 0), ('apical', 1), ('embolism', 2), ('suggested', 3)]
[('Date', 0), ('Temperature', 1), ('Drug', 2), ('Condition', 3)]


In [318]:
def words_to_ids(sentances,word_ids,tag_ids):
    vector = []
    for sentance in sentances:
        vector.append(list([[word_ids[w[0]],tag_ids[w[1]]] for w in sentance]))
    return np.array(vector)

Now the words are given a numeric representation which can be mapped back to the original words.

In [319]:
vectors = words_to_ids(padded_seqs,word_ids,tag_ids)
print(vectors[0][:4])
print('')
print("Word Representation:")
print(padded_seqs[0][:4])

[[2485   18]
 [2448   18]
 [3605   19]
 [1476   19]]

Word Representation:
[['in', 'O'], ['the', 'O'], ['hospital3', 'GEO'], ['emergency', 'GEO']]


Now we can label our features (x) and labels (y) for training.

In [320]:
def create_x_y(matrix,n_tags):
    x = []
    y = []
    for sequences in matrix:
        xi = [i[0] for i in sequences]
        yi = [i[1] for i in sequences]
        x.append(xi)
        y.append(yi)
    y = np.array([to_categorical(i,n_tags) for i in y])
    return np.array(x),np.array(y)

In [321]:
n_tags = len(tag_ids)
x,y = create_x_y(vectors,n_tags)
print("X-shape:",x.shape)
print(x[0][:5])
print('')
print("Y-shape:",y.shape)
print(y[0][:5])

X-shape: (2009, 50)
[2485 2448 3605 1476 1109]

Y-shape: (2009, 50, 26)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0.]]


### 5. Train Test Split 

In [322]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### 6. Specifying Model and Model Parameters

https://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/

#### Use Word2Vec Embedding Trained on Entire Corpus:

In [323]:
def create_weight_matrix(word_ids,embeddings_index):
    embedding_matrix = np.zeros((len(word_ids),100))
    count = 0
    oov_words = []
    for word,idx in word_ids.items():
        if word in embeddings_index:
            embedding_matrix[idx] = embeddings_index[word]
        else:
            if word == "<PAD>":
                embedding_matrix[idx] = np.array([999]*100)
            else:
                oov_words.append(word)
    return embedding_matrix,oov_words    

custom_emb = Word2Vec.load("./data/word2vec_numeric_encs.model")
embed_matrix,oov_words = create_weight_matrix(word_ids,custom_emb);

  
  import sys


In [324]:
# OOV Words
print(f"Percent OOV: {len(oov_words)/len(word_ids)*100}%")
print(oov_words[:30])

Percent OOV: 0.2959309494451295%
['t95', 'hbc', 'oes', 'letharagic', 'hyperlipidema', 'echopraxia', '1130hrs', 't=101', 'bedsid', 'tasty', '4.75l', "plt's"]


Because of how some words are split there are still some words out of vocabulary even though the Word2Vec embeddings were trained on the full  version of the same corpus. Each of these OOV words will simply have 0 weights in the matrix. The 0.9% of OOV words is a huge improvement over what was seen using Glove embeddings which saw 13% of the vocab being OOV words. 

In [353]:
def create_BiLSTM(n_words,n_tags,embedding_size,max_len,embed_weights):
    model = Sequential()
    model.add(Embedding(n_words,
                        embedding_size,
                        weights=[embed_weights],
                        trainable=False,
                        input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.2)))
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
    return model

In [367]:
x[0].shape[0]

50

In [365]:
def tester():
    return 1,2,3,4
a,b,c,d = tester()

In [354]:
embedding_size = 100
n_words = len(word_ids)
n_tags = len(tag_ids)

model = create_BiLSTM(n_words,n_tags,embedding_size,max_len,embed_matrix)
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 50, 100)           405500    
_________________________________________________________________
dropout_7 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 50, 200)           160800    
_________________________________________________________________
time_distributed_7 (TimeDist (None, 50, 26)            5226      
Total params: 571,526
Trainable params: 166,026
Non-trainable params: 405,500
_________________________________________________________________


### 7. Training Model

In [341]:
def train_model(model,x_train,y_train,batch_size=32,epochs=20,val_split = 0.1):

    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.0001,
                               patience=0,
                               mode='min',
                               verbose=1)
    
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(x_train, y_train, 
                        batch_size=32, 
                        epochs=epochs, 
                        validation_split=val_split, 
                        verbose=1,
                        callbacks=[early_stop]
                       )
    return history

In [355]:
epochs = 20
batch_size = 32
val_size = 0
history = train_model(model,x_train,y_train,batch_size=batch_size,epochs=epochs,val_split = val_size)

Train on 1808 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#### Sample Prediction:

In [356]:
def get_id_mappings(ids):
    return {str(i[1]):i[0] for i in ids.items()}

def generate_sample(x,y,model):
    idx = random.randint(0,len(x))
    sample = x[idx]
    label = np.argmax(y[idx],axis=1)

    p = model.predict(sample.reshape(1,-1))
    p = np.argmax(p,axis=-1)
    print("{:25} {:20}: {:10}".format("Word", "True", "Pred"))
    print("-"*50)
    for i in range(len(sample)):
        word = str(sample[i])
        pred = str(p[0][i])
        true_val = str(label[i])
        id_to_words = get_id_mappings(word_ids)
        id_to_tags = get_id_mappings(tag_ids)
        print(f"{id_to_words[word]:25}{id_to_tags[true_val]:20}{id_to_tags[pred]}")
    return

In [357]:
generate_sample(x,y,model)

Word                      True                : Pred      
--------------------------------------------------
known                    O                   O
lastname                 O                   O
<3DigitNum>              O                   O
reports                  O                   O
sudden                   O                   O
onset                    O                   O
of                       O                   O
substernal               BODY                BODY
chest                    DOS                 DOS
pain                     DOS                 DOS
at                       O                   O
<1DigitNum>              Time                Time
am                       Time                Time
<PAD>                    O                   O
<PAD>                    O                   O
<PAD>                    O                   O
<PAD>                    O                   O
<PAD>                    O                   O
<PAD>                    O     

The model gives reasonable predictions that almost always make sense intuitively.

### 8. Evaluating Model Performance

In [358]:
def transform_ids_to_tags(preds,tag_ids):
    id_to_tags = get_id_mappings(tag_ids)

    tag_seqs = []
    for seq in preds:
        tag_seqs.append([id_to_tags[str(i)] for i in seq])
    return tag_seqs

def get_real_labels(model,x_test,y_test,tag_ids):
    test_preds = np.argmax(model.predict(x_test),axis=-1)
    true_vals = np.argmax(y_test,axis=-1)
    test_preds = transform_ids_to_tags(test_preds,tag_ids)
    true_vals = transform_ids_to_tags(true_vals,tag_ids)
    return true_vals,test_preds

In [359]:
true_test_vals,test_preds = get_real_labels(model,x_test,y_test,tag_ids)
true_train_vals,train_preds = get_real_labels(model,x_train,y_train,tag_ids)
test_report = classification_report(true_test_vals,test_preds)
train_report = classification_report(true_train_vals,train_preds)
print("Test Report \n","-"*60)
print(test_report)
print("Train Report \n","-"*60)
print(train_report)

Test Report 
 ------------------------------------------------------------
                    precision    recall  f1-score   support

              Time       0.43      0.36      0.39        28
               Age       1.00      0.94      0.97        17
               GEO       0.83      0.90      0.86        42
               DOS       0.43      0.52      0.47       157
          Duration       0.36      0.47      0.41        17
              Drug       0.80      0.92      0.86        49
         Condition       0.58      0.66      0.62        83
  Test / Screening       0.69      0.65      0.67        37
  Respiratory Rate       0.86      0.60      0.71        10
Patient Relocation       0.84      0.96      0.90        27
         Procedure       0.38      0.55      0.45        42
          Quantity       0.33      0.33      0.33         9
    Blood Pressure       0.75      0.94      0.83        32
       Temperature       0.83      0.71      0.77         7
              Date      

The model still struggles with rarer classes such as frequency, age and gender but does very well determining tags sycg as DRUG and LOCATION.

In [360]:
model_f1 = f1_score(true_test_vals,test_preds)
print("F1-Score:",model_f1)

F1-Score: 0.6282140375260596


### 10. Saving Model Results

In order to track progression its good to document each model iteration as well as keep note of important changes in the model.

In [361]:
n_samples = len(x)
model_desc = f"BiLSTM-Word2Vec-EmbedSize-{embedding_size}"
results_file = "./data/model_results.csv"
note = '''20 Epochs no validation'''
def append_model_results(model_f1,n_samples,model_desc,file,note):
    with open(file,'a') as f:
        results = f"\n{model_f1},{n_samples},{model_desc},{time.ctime()},{note}"
        f.writelines(results)
    print("~~~Results Successfully Saved")
    return

In [362]:
append_model_results(model_f1,n_samples,model_desc,results_file,note)

~~~Results Successfully Saved


In [363]:
results_df = pd.read_csv(results_file)
results_df.tail(5)

Unnamed: 0,F1-Score,n-sample,Model Type,Date,Notes
0,0.524544,1008,BiLSTM-Word2Vec-EmbedSize-100,Tue Nov 12 13:45:02 2019,Used Custom Word2Vec Embeddings of entire Disc...
1,0.58753,1008,BiLSTM-Word2Vec-EmbedSize-100,Tue Nov 12 13:45:40 2019,Custom Word2Vec of entire corpus with numeric ...
2,0.608212,2009,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 14 13:07:01 2019,Custom Word2Vec of entire corpus with numeric ...
3,0.618051,2009,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 14 13:14:45 2019,Same as last with bug fix for slicing token co...
4,0.628214,2009,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 14 13:25:56 2019,20 Epochs no validation


---