# Annotation Pipeline for DataTurks .tsv

In [None]:
%reset -f

In [None]:
import pandas as pd
import re
import numpy as np
import random
import time
from seqeval.metrics import f1_score,classification_report
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

## Formatting Data to Sequences

### Contents:

1. Reading File and formatting as sequences
2. Formatting Entities to IOB Scheme
3. Padding Sequences
4. Mapping to Integer Ids
5. Formatting Data for Keras LSTM Model
6. Train Test Split
7. Specifying Model and Model Parameters
8. Training Model
9. Evaluating Model
10. Saving Model Results

### 1. Reading and Formatting File:

The file being used is the raw output of a data turks annotated tsv file.

More info Available:
https://dataturks.com/features/document-ner-annotation.php

In [3]:
def read_turks(file):
    with open(file) as f:
        lines = [i.rstrip().split("\t") for i in f.readlines()]
    return lines

In [4]:
file = "./nlp_data/Medical NER Dataset 2600.tsv"
word_ents = read_turks(file)
word_ents[:20]

[['0,"HISTORY', 'O'],
 ['OF', 'O'],
 ['PRESENT', 'O'],
 ['ILLNESS', 'O'],
 ['This', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'CONDITION/SYMPTOM'],
 ['female', 'CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'DRUG'],
 ['O2', 'DRUG'],
 [',', 'O'],
 ['who', 'O']]

Some words (such as the first above) contained a number and quote before it so these are removed with the following function.

In [5]:
def clean_words(word_ents):
    '''removes quote and comma characters from'''
    new_word_ents = []
    for ents in word_ents:
        word = ents[0].lower()
        if word.find(',') > 0:
            word = word[word.find(',')+1:]
        word = word.replace('"','')
        ents[0] = word
        new_word_ents.append(ents)
    return new_word_ents

In [6]:
new_ents = clean_words(word_ents)
new_ents[:20]

[['history', 'O'],
 ['of', 'O'],
 ['present', 'O'],
 ['illness', 'O'],
 ['this', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'CONDITION/SYMPTOM'],
 ['female', 'CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'DRUG'],
 ['o2', 'DRUG'],
 [',', 'O'],
 ['who', 'O']]

Dataturks uses a blank line to seperate each sequence. This is why most csv/tsv readers cannot read the file. The following function will split each sequence when it finds a blank line in the tsv file.

In [7]:
def create_seqs(word_ents):
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents)>1:
            if len(ents[0])>0:
                seq.append(ents)
        else:
            seqs.append(seq)
            seq=[]
    return seqs

In [8]:
seqs = create_seqs(new_ents)
seqs[:1]

[[['history', 'O'],
  ['of', 'O'],
  ['present', 'O'],
  ['illness', 'O'],
  ['this', 'O'],
  ['is', 'O'],
  ['an', 'O'],
  ['81-year-old', 'CONDITION/SYMPTOM'],
  ['female', 'CONDITION/SYMPTOM'],
  ['with', 'O'],
  ['a', 'O'],
  ['history', 'O'],
  ['of', 'O'],
  ['emphysema', 'CONDITION/SYMPTOM'],
  ['not', 'O'],
  ['on', 'O'],
  ['home', 'DRUG'],
  ['o2', 'DRUG'],
  [',', 'O'],
  ['who', 'O'],
  ['presents', 'O'],
  ['with', 'O'],
  ['three', 'AMOUNT'],
  ['days', 'AMOUNT'],
  ['of', 'O'],
  ['shortness', 'CONDITION/SYMPTOM'],
  ['of', 'CONDITION/SYMPTOM'],
  ['breath', 'CONDITION/SYMPTOM'],
  ['thought', 'O'],
  ['by', 'O'],
  ['her', 'O'],
  ['primary', 'O'],
  ['care', 'O'],
  ['doctor', 'O'],
  ['to', 'O'],
  ['be', 'O'],
  ['a', 'O'],
  ['copd', 'CONDITION/SYMPTOM'],
  ['flare', 'CONDITION/SYMPTOM']]]

### 2. Formatting Entities to IOB (Inside,Outside, Beginning) Scheme 

This scheme adds more context to the tags and allows annotations to make more sense.

In [9]:
def clean_tags(word_ents):
    '''adds IOB scheme to tags'''
    new_ents = []
    for i in range(0,len(word_ents)):
        if word_ents[i][1] == "O":
            tag = word_ents[i][1]
        else:
            if not i:
                tag = "B-"+word_ents[i][1]
            else:
                if (word_ents[i][1] != word_ents[i-1][1]):
                    tag = "B-"+word_ents[i][1]
                else:
                    tag = "I-"+word_ents[i][1]

        new_ents.append([word_ents[i][0],tag])
    return new_ents

In [10]:
cleaned_tag_seqs = [clean_tags(ents) for ents in seqs]
cleaned_tag_seqs[0]

[['history', 'O'],
 ['of', 'O'],
 ['present', 'O'],
 ['illness', 'O'],
 ['this', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'B-CONDITION/SYMPTOM'],
 ['female', 'I-CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'B-CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'B-DRUG'],
 ['o2', 'I-DRUG'],
 [',', 'O'],
 ['who', 'O'],
 ['presents', 'O'],
 ['with', 'O'],
 ['three', 'B-AMOUNT'],
 ['days', 'I-AMOUNT'],
 ['of', 'O'],
 ['shortness', 'B-CONDITION/SYMPTOM'],
 ['of', 'I-CONDITION/SYMPTOM'],
 ['breath', 'I-CONDITION/SYMPTOM'],
 ['thought', 'O'],
 ['by', 'O'],
 ['her', 'O'],
 ['primary', 'O'],
 ['care', 'O'],
 ['doctor', 'O'],
 ['to', 'O'],
 ['be', 'O'],
 ['a', 'O'],
 ['copd', 'B-CONDITION/SYMPTOM'],
 ['flare', 'I-CONDITION/SYMPTOM']]

### 3. Padding Sequences to a Specified Length

In order to be usable by the LSTM model, each sequence needs to be padded/truncated to the same length. Here 50 is chosen somewhat arbitraily but is around the 97th percentile of sequence lengths.

In [11]:
def pad_seq(seq,max_len):
    padded_seq = seq+[["<PAD>","O"]]*max_len
    return padded_seq[:max_len]
    
def pad_sequences(sequences,max_len=None):
    if max_len == None:
        max_len = max(len(seq) for seq in sequences)
    return [pad_seq(seq,max_len) for seq in sequences]

In [12]:
max_len = 50
padded_seqs = pad_sequences(cleaned_tag_seqs,max_len)
padded_seqs[0]

[['history', 'O'],
 ['of', 'O'],
 ['present', 'O'],
 ['illness', 'O'],
 ['this', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'B-CONDITION/SYMPTOM'],
 ['female', 'I-CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'B-CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'B-DRUG'],
 ['o2', 'I-DRUG'],
 [',', 'O'],
 ['who', 'O'],
 ['presents', 'O'],
 ['with', 'O'],
 ['three', 'B-AMOUNT'],
 ['days', 'I-AMOUNT'],
 ['of', 'O'],
 ['shortness', 'B-CONDITION/SYMPTOM'],
 ['of', 'I-CONDITION/SYMPTOM'],
 ['breath', 'I-CONDITION/SYMPTOM'],
 ['thought', 'O'],
 ['by', 'O'],
 ['her', 'O'],
 ['primary', 'O'],
 ['care', 'O'],
 ['doctor', 'O'],
 ['to', 'O'],
 ['be', 'O'],
 ['a', 'O'],
 ['copd', 'B-CONDITION/SYMPTOM'],
 ['flare', 'I-CONDITION/SYMPTOM'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O']]

### 4. Mapping Words to Integer Values for Model Training

The model can't use words so each one is mapped to a particular index.

In [13]:
def get_word_ids(sentances,tag=False):
    words = []
    for sentance in sentances:
        words += list([word[tag] for word in sentance])
    word_dict = {word:i for i,word in enumerate(set(words))}
    return word_dict

In [14]:
word_ids = get_word_ids(padded_seqs)
tag_ids = get_word_ids(padded_seqs,tag=True)
print(list(word_ids.items())[:4])
print(list(tag_ids.items())[:4])

[('developing', 0), ('postprandially', 1), ('reporting', 2), ('headache', 3)]
[('I-CONDITION/SYMPTOM', 0), ('B-CONDITION/SYMPTOM', 1), ('B-LOCATION', 2), ('I-DRUG', 3)]


In [15]:
def words_to_ids(sentances,word_ids,tag_ids):
    vector = []
    for sentance in sentances:
        vector.append(list([[word_ids[w[0]],tag_ids[w[1]]] for w in sentance]))
    return np.array(vector)

Now the words are given a numeric representation which can be mapped back to the original words.

In [16]:
vectors = words_to_ids(padded_seqs,word_ids,tag_ids)
print(vectors[0][:4])
print('')
print("Word Representation:")
print(padded_seqs[0][:4])

[[2354   17]
 [2216   17]
 [5080   17]
 [3238   17]]

Word Representation:
[['history', 'O'], ['of', 'O'], ['present', 'O'], ['illness', 'O']]


Now we can label our features (x) and labels (y) for training.

In [17]:
def create_x_y(matrix,n_tags):
    x = []
    y = []
    for sequences in matrix:
        xi = [i[0] for i in sequences]
        yi = [i[1] for i in sequences]
        x.append(xi)
        y.append(yi)
    y = np.array([to_categorical(i,n_tags) for i in y])
    return np.array(x),np.array(y)

In [18]:
n_tags = len(tag_ids)
x,y = create_x_y(vectors,n_tags)
print("X-shape:",x.shape)
print(x[0][:5])
print('')
print("Y-shape:",y.shape)
print(y[0][:5])

X-shape: (2592, 50)
[2354 2216 5080 3238  964]

Y-shape: (2592, 50, 23)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


### 5. Train Test Split 

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### 6. Specifying Model and Model Parameters

https://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/

#### Use Word2Vec Embedding Trained on Entire Corpus:

In [20]:
def create_weight_matrix(word_ids,embeddings_index):
    embedding_matrix = np.zeros((len(word_ids),100))
    count = 0
    oov_words = []
    for word,idx in word_ids.items():
        if word in embeddings_index:
            embedding_matrix[idx] = embeddings_index[word]
        else:
            oov_words.append(word)
    return embedding_matrix,oov_words    

custom_emb = Word2Vec.load("./nlp_data/word2vec.model")
embed_matrix,oov_words = create_weight_matrix(word_ids,custom_emb);

  
  import sys


In [21]:
# OOV Words
print(f"Percent OOV: {len(oov_words)/len(word_ids)*100}%")
print(oov_words[:30])

Percent OOV: 0.9389671361502347%
['ctaxol', '181/64', ',600', 'bp81/20', '186/124', '?pancreatitis', 'hyperlipidema', '3996', 'bp111/46', '139/88', 't95', '7.37/58/96', "100-170's", '118/54', 'diaphoeris', 'throughou', 'cp/sob/doe/f/c/n/v/brbpr/melena', '400/20/5/0.6', 'tasty', '30s-50s', 'hemeturia', '/wnl', '186/84', 'hypercacemia', 'sbp160-180s', '167/60', '78-129', '5805', 'obligate', 'infetion']


Because of how some words are split there are still some words out of vocabulary even though the Word2Vec embeddings were trained on the full  version of the same corpus. Each of these OOV words will simply have 0 weights in the matrix. The 0.9% of OOV words is a huge improvement over what was seen using Glove embeddings which saw 13% of the vocab being OOV words. 

In [22]:
def create_BiLSTM(n_words,n_tags,embedding_size,max_len,embed_weights):
    model = Sequential()
    model.add(Embedding(n_words,
                        embedding_size,
                        weights=[embed_weights],
                        trainable=False,
                        input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.2)))
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
    return model

In [23]:
embedding_size = 100
n_words = len(word_ids)
n_tags = len(tag_ids)

model = create_BiLSTM(n_words,n_tags,embedding_size,max_len,embed_matrix)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           532500    
_________________________________________________________________
dropout (Dropout)            (None, 50, 100)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 200)           160800    
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 23)            4623      
Total params: 697,923
Trainable params: 165,423
Non-trainable params: 532,500
_________________________________________________________________


### 7. Training Model

In [24]:
def train_model(model,x_train,y_train,batch_size=32,epochs=20,val_split = 0.1):

    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.0001,
                               patience=3,
                               mode='min',
                               verbose=1)
    
    
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(x_train, y_train, 
                        batch_size=32, 
                        epochs=epochs, 
                        validation_split=val_split, 
                        verbose=1,
                        callbacks=[early_stop]
                       )
    return history

In [25]:
epochs = 50
batch_size = 32
history = train_model(model,x_train,y_train,batch_size=batch_size,epochs=epochs,val_split = 0.1)

Train on 2098 samples, validate on 234 samples
Epoch 1/50
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 00016: early stopping


#### Sample Prediction:

In [26]:
def get_id_mappings(ids):
    return {str(i[1]):i[0] for i in ids.items()}

def generate_sample(x,y,model):
    idx = random.randint(0,len(x))
    sample = x[idx]
    label = np.argmax(y[idx],axis=1)

    p = model.predict(sample.reshape(1,-1))
    p = np.argmax(p,axis=-1)
    print("{:25} {:20}: {:10}".format("Word", "True", "Pred"))
    print("-"*50)
    for i in range(len(sample)):
        word = str(sample[i])
        pred = str(p[0][i])
        true_val = str(label[i])
        id_to_words = get_id_mappings(word_ids)
        id_to_tags = get_id_mappings(tag_ids)
        print(f"{id_to_words[word]:25}{id_to_tags[true_val]:20}{id_to_tags[pred]}")
    return

In [27]:
generate_sample(x,y,model)

Word                      True                : Pred      
--------------------------------------------------
he                       O                   O
states                   O                   O
that                     O                   O
he                       O                   O
is                       O                   O
asymptomatic             B-CONDITION/SYMPTOM B-CONDITION/SYMPTOM
with                     I-CONDITION/SYMPTOM I-CONDITION/SYMPTOM
these                    I-CONDITION/SYMPTOM I-CONDITION/SYMPTOM
sugars                   I-CONDITION/SYMPTOM I-CONDITION/SYMPTOM
,                        O                   O
but                      O                   O
his                      O                   O
wife                     O                   O
says                     O                   O
he's                     O                   O
been                     O                   O
sleepier                 B-CONDITION/SYMPTOM O
<PAD>              

The model gives reasonable predictions that almost always make sense intuitively.

### 8. Evaluating Model Performance

In [28]:
def transform_ids_to_tags(preds,tag_ids):
    id_to_tags = get_id_mappings(tag_ids)

    tag_seqs = []
    for seq in preds:
        tag_seqs.append([id_to_tags[str(i)] for i in seq])
    return tag_seqs

def get_real_labels(model,x_test,y_test,tag_ids):
    test_preds = np.argmax(model.predict(x_test),axis=-1)
    true_vals = np.argmax(y_test,axis=-1)
    test_preds = transform_ids_to_tags(test_preds,tag_ids)
    true_vals = transform_ids_to_tags(true_vals,tag_ids)
    return true_vals,test_preds

In [29]:
true_vals,test_preds = get_real_labels(model,x_test,y_test,tag_ids)
report = classification_report(true_vals,test_preds)
print(report)

                   precision    recall  f1-score   support

CONDITION/SYMPTOM       0.57      0.64      0.61       348
         LOCATION       0.72      0.81      0.76       101
      MEASUREMENT       0.60      0.69      0.64        96
             TIME       0.49      0.54      0.51        37
           AMOUNT       0.68      0.62      0.65        68
            EVENT       0.45      0.55      0.49       119
             DRUG       0.79      0.80      0.80       100
        FREQUENCY       0.40      0.17      0.24        12
           GENDER       0.50      0.36      0.42        11
              AGE       0.46      0.40      0.43        15
     ORGANIZATION       0.57      0.62      0.59        13

        micro avg       0.60      0.65      0.62       920
        macro avg       0.60      0.65      0.62       920



The model still struggles with rarer classes such as frequency, age and gender but does very well determining tags sycg as DRUG and LOCATION.

In [30]:
model_f1 = f1_score(true_vals,test_preds)
print("F1-Score:",model_f1)

F1-Score: 0.6229166666666667


### 10. Saving Model Results

In order to track progression its good to document each model iteration as well as keep note of important changes in the model.

In [31]:
n_samples = len(x)
model_desc = f"BiLSTM-Word2Vec-EmbedSize-{embedding_size}"
results_file = "./nlp_data/model_results.csv"
note = '''Used Custom Word2Vec Embeddings of entire Discharge Summary Corpus'''
def append_model_results(model_f1,n_samples,model_desc,file,note):
    with open(file,'a') as f:
        results = f"\n{model_f1},{n_samples},{model_desc},{time.ctime()},{note}"
        f.writelines(results)
    print("~~~Results Successfully Saved")
    return

In [32]:
append_model_results(model_f1,n_samples,model_desc,results_file,note)

~~~Results Successfully Saved


In [33]:
results_df = pd.read_csv(results_file)
results_df.tail(5)

Unnamed: 0,F1-Score,N-Samples,Model Type,Date,Note
11,0.56781,2592.0,BiLSTM-Glove-EmbedSize-100,Thu Nov 7 11:13:40 2019,note
12,0.56781,2592.0,BiLSTM-Glove-EmbedSize-100,Thu Nov 7 11:13:48 2019,Split Special Characters to reduce OOV Glove E...
13,0.606472,2592.0,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 7 13:30:13 2019,Reduced Max Sequence Length to 43 (95th percen...
14,0.606472,2592.0,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 7 13:30:51 2019,Used Custom Word2Vec Embeddings of entire Disc...
15,0.622917,2592.0,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 7 13:46:03 2019,Used Custom Word2Vec Embeddings of entire Disc...


---