# Annotation Pipeline for DataTurks .tsv

In [1]:
%reset -f

In [2]:
import pandas as pd
import re
import numpy as np
import random
import time
from seqeval.metrics import f1_score,classification_report
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

## Formatting Data to Sequences

### Contents:

1. Reading File and formatting as sequences
2. Formatting Entities to IOB Scheme
3. Padding Sequences
4. Mapping to Integer Ids
5. Formatting Data for Keras LSTM Model
6. Train Test Split
7. Specifying Model and Model Parameters
8. Training Model
9. Evaluating Model

### 1. Reading and Formatting File:

In [3]:
def read_turks(file):
    with open(file) as f:
        lines = [i.rstrip().split("\t") for i in f.readlines()]
    return lines

In [4]:
file = "./nlp_data/Medical NER Dataset 2600.tsv"
word_ents = read_turks(file)
word_ents[:20]

[['0,"HISTORY', 'O'],
 ['OF', 'O'],
 ['PRESENT', 'O'],
 ['ILLNESS', 'O'],
 ['This', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'CONDITION/SYMPTOM'],
 ['female', 'CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'DRUG'],
 ['O2', 'DRUG'],
 [',', 'O'],
 ['who', 'O']]

In [5]:
def clean_words(word_ents):
    '''removes quote and comma characters from '''
    new_word_ents = []
    for ents in word_ents:
        word = ents[0].lower()
        if word.find(',') > 0:
            word = word[word.find(',')+1:]
        word = word.replace('"','')
        ents[0] = word
        new_word_ents.append(ents)
    return new_word_ents

In [6]:
new_ents = clean_words(word_ents)
new_ents[:20]

[['history', 'O'],
 ['of', 'O'],
 ['present', 'O'],
 ['illness', 'O'],
 ['this', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'CONDITION/SYMPTOM'],
 ['female', 'CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'DRUG'],
 ['o2', 'DRUG'],
 [',', 'O'],
 ['who', 'O']]

In [7]:
def create_seqs(word_ents):
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents)>1:
            if len(ents[0])>0:
                seq.append(ents)
        else:
            seqs.append(seq)
            seq=[]
    return seqs

In [8]:
seqs = create_seqs(new_ents)
seqs[:1]

[[['history', 'O'],
  ['of', 'O'],
  ['present', 'O'],
  ['illness', 'O'],
  ['this', 'O'],
  ['is', 'O'],
  ['an', 'O'],
  ['81-year-old', 'CONDITION/SYMPTOM'],
  ['female', 'CONDITION/SYMPTOM'],
  ['with', 'O'],
  ['a', 'O'],
  ['history', 'O'],
  ['of', 'O'],
  ['emphysema', 'CONDITION/SYMPTOM'],
  ['not', 'O'],
  ['on', 'O'],
  ['home', 'DRUG'],
  ['o2', 'DRUG'],
  [',', 'O'],
  ['who', 'O'],
  ['presents', 'O'],
  ['with', 'O'],
  ['three', 'AMOUNT'],
  ['days', 'AMOUNT'],
  ['of', 'O'],
  ['shortness', 'CONDITION/SYMPTOM'],
  ['of', 'CONDITION/SYMPTOM'],
  ['breath', 'CONDITION/SYMPTOM'],
  ['thought', 'O'],
  ['by', 'O'],
  ['her', 'O'],
  ['primary', 'O'],
  ['care', 'O'],
  ['doctor', 'O'],
  ['to', 'O'],
  ['be', 'O'],
  ['a', 'O'],
  ['copd', 'CONDITION/SYMPTOM'],
  ['flare', 'CONDITION/SYMPTOM']]]

Here we add an extra step to split on special characters to reduce the number of OOV words for our Glove Embeddings.

In [9]:
def expand_word(ent):
    words = re.split('(\W)',ent[0])
    return [[i,ent[1]] for i in words ]

In [10]:
def expand_special_chars(seqs):
    new_seqs = []
    for seq in seqs:
        new_seq = []
        for word in seq:
            new_seq += expand_word(word)
        new_seqs.append(new_seq)
    return new_seqs

In [11]:
ex_seqs = expand_special_chars(seqs)
ex_seqs[0]

[['history', 'O'],
 ['of', 'O'],
 ['present', 'O'],
 ['illness', 'O'],
 ['this', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81', 'CONDITION/SYMPTOM'],
 ['-', 'CONDITION/SYMPTOM'],
 ['year', 'CONDITION/SYMPTOM'],
 ['-', 'CONDITION/SYMPTOM'],
 ['old', 'CONDITION/SYMPTOM'],
 ['female', 'CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'DRUG'],
 ['o2', 'DRUG'],
 ['', 'O'],
 [',', 'O'],
 ['', 'O'],
 ['who', 'O'],
 ['presents', 'O'],
 ['with', 'O'],
 ['three', 'AMOUNT'],
 ['days', 'AMOUNT'],
 ['of', 'O'],
 ['shortness', 'CONDITION/SYMPTOM'],
 ['of', 'CONDITION/SYMPTOM'],
 ['breath', 'CONDITION/SYMPTOM'],
 ['thought', 'O'],
 ['by', 'O'],
 ['her', 'O'],
 ['primary', 'O'],
 ['care', 'O'],
 ['doctor', 'O'],
 ['to', 'O'],
 ['be', 'O'],
 ['a', 'O'],
 ['copd', 'CONDITION/SYMPTOM'],
 ['flare', 'CONDITION/SYMPTOM']]

### 2. Formatting Entities to IOB (Inside,Outside, Beginning) Scheme 

In [12]:
def clean_tags(word_ents):
    '''adds IOB scheme to tags'''
    new_ents = []
    for i in range(0,len(word_ents)):
        if word_ents[i][1] == "O":
            tag = word_ents[i][1]
        else:
            if not i:
                tag = "B-"+word_ents[i][1]
            else:
                if (word_ents[i][1] != word_ents[i-1][1]):
                    tag = "B-"+word_ents[i][1]
                else:
                    tag = "I-"+word_ents[i][1]

        new_ents.append([word_ents[i][0],tag])
    return new_ents

In [13]:
cleaned_tag_seqs = [clean_tags(ents) for ents in ex_seqs]
cleaned_tag_seqs[0]

[['history', 'O'],
 ['of', 'O'],
 ['present', 'O'],
 ['illness', 'O'],
 ['this', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81', 'B-CONDITION/SYMPTOM'],
 ['-', 'I-CONDITION/SYMPTOM'],
 ['year', 'I-CONDITION/SYMPTOM'],
 ['-', 'I-CONDITION/SYMPTOM'],
 ['old', 'I-CONDITION/SYMPTOM'],
 ['female', 'I-CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'B-CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'B-DRUG'],
 ['o2', 'I-DRUG'],
 ['', 'O'],
 [',', 'O'],
 ['', 'O'],
 ['who', 'O'],
 ['presents', 'O'],
 ['with', 'O'],
 ['three', 'B-AMOUNT'],
 ['days', 'I-AMOUNT'],
 ['of', 'O'],
 ['shortness', 'B-CONDITION/SYMPTOM'],
 ['of', 'I-CONDITION/SYMPTOM'],
 ['breath', 'I-CONDITION/SYMPTOM'],
 ['thought', 'O'],
 ['by', 'O'],
 ['her', 'O'],
 ['primary', 'O'],
 ['care', 'O'],
 ['doctor', 'O'],
 ['to', 'O'],
 ['be', 'O'],
 ['a', 'O'],
 ['copd', 'B-CONDITION/SYMPTOM'],
 ['flare', 'I-CONDITION/SYMPTOM']]

### 3. Padding Sequences to a Specified Length

In [14]:
def pad_seq(seq,max_len):
    padded_seq = seq+[["<PAD>","O"]]*max_len
    return padded_seq[:max_len]
    
def pad_sequences(sequences,max_len=None):
    if max_len == None:
        max_len = max(len(seq) for seq in sequences)
    return [pad_seq(seq,max_len) for seq in sequences]

In [15]:
max_len = 50
padded_seqs = pad_sequences(cleaned_tag_seqs,max_len)
padded_seqs[0]

[['history', 'O'],
 ['of', 'O'],
 ['present', 'O'],
 ['illness', 'O'],
 ['this', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81', 'B-CONDITION/SYMPTOM'],
 ['-', 'I-CONDITION/SYMPTOM'],
 ['year', 'I-CONDITION/SYMPTOM'],
 ['-', 'I-CONDITION/SYMPTOM'],
 ['old', 'I-CONDITION/SYMPTOM'],
 ['female', 'I-CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'B-CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'B-DRUG'],
 ['o2', 'I-DRUG'],
 ['', 'O'],
 [',', 'O'],
 ['', 'O'],
 ['who', 'O'],
 ['presents', 'O'],
 ['with', 'O'],
 ['three', 'B-AMOUNT'],
 ['days', 'I-AMOUNT'],
 ['of', 'O'],
 ['shortness', 'B-CONDITION/SYMPTOM'],
 ['of', 'I-CONDITION/SYMPTOM'],
 ['breath', 'I-CONDITION/SYMPTOM'],
 ['thought', 'O'],
 ['by', 'O'],
 ['her', 'O'],
 ['primary', 'O'],
 ['care', 'O'],
 ['doctor', 'O'],
 ['to', 'O'],
 ['be', 'O'],
 ['a', 'O'],
 ['copd', 'B-CONDITION/SYMPTOM'],
 ['flare', 'I-CONDITION/SYMPTOM'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],


### 4. Mapping Words to Integer Values for Model Training

In [16]:
def get_word_ids(sentances,tag=False):
    words = []
    for sentance in sentances:
        words += list([word[tag] for word in sentance])
    word_dict = {word:i for i,word in enumerate(set(words))}
    return word_dict

In [17]:
word_ids = get_word_ids(padded_seqs)
tag_ids = get_word_ids(padded_seqs,tag=True)
print(list(word_ids.items())[:4])
print(list(tag_ids.items())[:4])

[('', 0), ('here', 1), ('10', 2), ('hospital6', 3)]
[('I-DATE', 0), ('B-DRUG', 1), ('B-GENDER', 2), ('I-LOCATION', 3)]


In [18]:
def words_to_ids(sentances,word_ids,tag_ids):
    vector = []
    for sentance in sentances:
        vector.append(list([[word_ids[w[0]],tag_ids[w[1]]] for w in sentance]))
    return np.array(vector)

In [19]:
vectors = words_to_ids(padded_seqs,word_ids,tag_ids)
print(vectors[0][:4])
print('')
print("Word Representation:")
print(padded_seqs[0][:4])

[[3445   12]
 [2251   12]
 [1680   12]
 [2038   12]]

Word Representation:
[['history', 'O'], ['of', 'O'], ['present', 'O'], ['illness', 'O']]


In [20]:
def create_x_y(matrix,n_tags):
    x = []
    y = []
    for sequences in matrix:
        xi = [i[0] for i in sequences]
        yi = [i[1] for i in sequences]
        x.append(xi)
        y.append(yi)
    y = np.array([to_categorical(i,n_tags) for i in y])
    return np.array(x),np.array(y)

In [21]:
n_tags = len(tag_ids)
x,y = create_x_y(vectors,n_tags)
print("X-shape:",x.shape)
print(x[0][:5])
print('')
print("Y-shape:",y.shape)
print(y[0][:5])

X-shape: (2592, 50)
[3445 2251 1680 2038 4491]

Y-shape: (2592, 50, 24)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


### 5. Train Test Split 

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### 6. Specifying Model and Model Parameters

https://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/

#### Use Pre-Trained Glove Word Embeddings

GLOVE Code:
https://medium.com/@sabber/classifying-yelp-review-comments-using-cnn-lstm-and-pre-trained-glove-word-embeddings-part-3-53fcea9a17fa

Embeddings Available:
https://nlp.stanford.edu/projects/glove/

In [23]:
def load_embeddings(filepath):
    with open(f"{filepath}glove.6B.100d.txt") as f:
        embeddings_index={}
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.array(values[1:],dtype="float32")
            embeddings_index[word] = coefs
    return embeddings_index

def create_weight_matrix(word_ids,embeddings_index):
    embedding_matrix = np.zeros((len(word_ids),100))
    count = 0
    oov_words = []
    for word,idx in word_ids.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
        else:
            oov_words.append(word)
    return embedding_matrix,oov_words    

glove_path = "./glove.6B/"
glove_emb = load_embeddings(glove_path)
embed_matrix,oov_words = create_weight_matrix(word_ids,glove_emb)

In [24]:
# OOV Words
print(f"Percent OOV: {len(oov_words)/len(word_ids)*100}%")
print(oov_words[:30])

Percent OOV: 13.945729537366546%
['', 'hospital6', 'transluminal', 'hypernatremic', 'precipitant', 't99', 'stented', 'pneumoperitoneum', 'fsgs', 'rhabdo', 'endorces', 'esophagogastroduodenoscopy', 'dyuria', 'azitrhomycin', 'cefepine', 'recived', 'neseritide', 'nonhodgkin', 'hypervolemia', 'hypersensitivty', 'bolused', 'dysuria', '1hour', 'atrius', 'stridor', 'valvuloplasty', 'transesophageal', '68yo', '2173', 'apneas']


Almost 14% of the words are OOV. This high number does make some sense as so much of the medical text contains highly specialized words such as drug names, measurements and abreviations.

In [25]:
def create_BiLSTM(n_words,n_tags,embedding_size,max_len,embed_weights):
    model = Sequential()
    model.add(Embedding(n_words,
                        embedding_size,
                        weights=[embed_weights],
                        trainable=False,
                        input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.2)))
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
    return model

In [26]:
embedding_size = 100
n_words = len(word_ids)
n_tags = len(tag_ids)

model = create_BiLSTM(n_words,n_tags,embedding_size,max_len,embed_matrix)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           449600    
_________________________________________________________________
dropout (Dropout)            (None, 50, 100)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 200)           160800    
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 24)            4824      
Total params: 615,224
Trainable params: 165,624
Non-trainable params: 449,600
_________________________________________________________________


### 7. Training Model

In [27]:
def train_model(model,x_train,y_train,batch_size=32,epochs=20,val_split = 0.1):

    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.0001,
                               patience=3,
                               mode='min',
                               verbose=1)
    
    
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(x_train, y_train, 
                        batch_size=32, 
                        epochs=epochs, 
                        validation_split=val_split, 
                        verbose=1,
                        callbacks=[early_stop]
                       )
    return history

In [None]:
epochs = 50
batch_size = 32
history = train_model(model,x_train,y_train,batch_size=batch_size,epochs=epochs,val_split = 0.1)

Train on 2098 samples, validate on 234 samples
Epoch 1/50
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50

In [None]:
def get_id_mappings(ids):
    return {str(i[1]):i[0] for i in ids.items()}

def generate_sample(x,y,model):
    idx = random.randint(0,len(x))
    sample = x[idx]
    label = np.argmax(y[idx],axis=1)

    p = model.predict(sample.reshape(1,-1))
    p = np.argmax(p,axis=-1)
    print("{:25} {:20}: {:10}".format("Word", "True", "Pred"))
    print("-"*50)
    for i in range(len(sample)):
        word = str(sample[i])
        pred = str(p[0][i])
        true_val = str(label[i])
        id_to_words = get_id_mappings(word_ids)
        id_to_tags = get_id_mappings(tag_ids)
        print(f"{id_to_words[word]:25}{id_to_tags[true_val]:20}{id_to_tags[pred]}")
    return

In [None]:
generate_sample(x,y,model)

### 8. Evaluating Model Performance

In [None]:
def transform_ids_to_tags(preds,tag_ids):
    id_to_tags = get_id_mappings(tag_ids)

    tag_seqs = []
    for seq in preds:
        tag_seqs.append([id_to_tags[str(i)] for i in seq])
    return tag_seqs

def get_real_labels(model,x_test,y_test,tag_ids):
    test_preds = np.argmax(model.predict(x_test),axis=-1)
    true_vals = np.argmax(y_test,axis=-1)
    test_preds = transform_ids_to_tags(test_preds,tag_ids)
    true_vals = transform_ids_to_tags(true_vals,tag_ids)
    return true_vals,test_preds

In [None]:
true_vals,test_preds = get_real_labels(model,x_test,y_test,tag_ids)
report = classification_report(true_vals,test_preds)
print(f1_score(true_vals,test_preds))

In [None]:
model_f1 = f1_score(true_vals,test_preds)

n_samples = len(x)
model_desc = f"BiLSTM-Glove-EmbedSize-{embedding_size}"
results_file = "./nlp_data/model_results.csv"
note = '''Max Len Reverted Back to 50 Words'''
def append_model_results(model_f1,n_samples,model_desc,file,note):
    with open(file,'a') as f:
        results = f"\n{model_f1},{n_samples},{model_desc},{time.ctime()},{note}"
        f.writelines(results)
    print("~~~Results Successfully Saved")
    return

In [None]:
append_model_results(model_f1,n_samples,model_desc,results_file,note)

In [None]:
pd.read_csv(results_file).tail()

---