# Training a BiLSTM for NER 

## Using an Annotated DataTurks .tsv File

---

In [1]:
%reset -f

In [2]:
import pandas as pd
import re
import numpy as np
import random
import time
from seqeval.metrics import f1_score,classification_report
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

## Formatting Data to Sequences

### Contents:

1. Reading File and formatting as sequences
2. Formatting Entities to IOB Scheme
3. Padding Sequences
4. Mapping to Integer Ids
5. Train Test Split
6. Specifying Model and Model Parameters
7. Training Model
8. Evaluating Model Performance
9. Saving Results

### 1. Reading and Formatting File:

In [3]:
def read_turks(file):
    with open(file) as f:
        lines = [i.rstrip().split("\t") for i in f.readlines()]
    return lines

In [4]:
file = "./nlp_data/Medical NER Dataset 2600.tsv"
word_ents = read_turks(file)
word_ents[:20]

[['0,"HISTORY', 'O'],
 ['OF', 'O'],
 ['PRESENT', 'O'],
 ['ILLNESS', 'O'],
 ['This', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'CONDITION/SYMPTOM'],
 ['female', 'CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'DRUG'],
 ['O2', 'DRUG'],
 [',', 'O'],
 ['who', 'O']]

In [5]:
def clean_words(word_ents):
    '''removes quote and comma characters from '''
    new_word_ents = []
    for ents in word_ents:
        word = ents[0].lower()
        if word.find(',') > 0:
            word = word[word.find(',')+1:]
        word = word.replace('"','')
        ents[0] = word
        new_word_ents.append(ents)
    return new_word_ents

In [6]:
new_ents = clean_words(word_ents)
new_ents[:20]

[['history', 'O'],
 ['of', 'O'],
 ['present', 'O'],
 ['illness', 'O'],
 ['this', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'CONDITION/SYMPTOM'],
 ['female', 'CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'DRUG'],
 ['o2', 'DRUG'],
 [',', 'O'],
 ['who', 'O']]

In [7]:
def create_seqs(word_ents):
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents)>1:
            if len(ents[0])>1:
                seq.append(ents)
        else:
            seqs.append(seq)
            seq=[]
    return seqs

In [8]:
seqs = create_seqs(new_ents)
seqs[:1]

[[['history', 'O'],
  ['of', 'O'],
  ['present', 'O'],
  ['illness', 'O'],
  ['this', 'O'],
  ['is', 'O'],
  ['an', 'O'],
  ['81-year-old', 'CONDITION/SYMPTOM'],
  ['female', 'CONDITION/SYMPTOM'],
  ['with', 'O'],
  ['history', 'O'],
  ['of', 'O'],
  ['emphysema', 'CONDITION/SYMPTOM'],
  ['not', 'O'],
  ['on', 'O'],
  ['home', 'DRUG'],
  ['o2', 'DRUG'],
  ['who', 'O'],
  ['presents', 'O'],
  ['with', 'O'],
  ['three', 'AMOUNT'],
  ['days', 'AMOUNT'],
  ['of', 'O'],
  ['shortness', 'CONDITION/SYMPTOM'],
  ['of', 'CONDITION/SYMPTOM'],
  ['breath', 'CONDITION/SYMPTOM'],
  ['thought', 'O'],
  ['by', 'O'],
  ['her', 'O'],
  ['primary', 'O'],
  ['care', 'O'],
  ['doctor', 'O'],
  ['to', 'O'],
  ['be', 'O'],
  ['copd', 'CONDITION/SYMPTOM'],
  ['flare', 'CONDITION/SYMPTOM']]]

### 2. Formatting Entities to IOB (Inside,Outside, Beginning) Scheme 

In [9]:
def clean_tags(word_ents):
    '''adds IOB scheme to tags'''
    new_ents = []
    for i in range(0,len(word_ents)):
        if word_ents[i][1] == "O":
            tag = word_ents[i][1]
        else:
            if not i:
                tag = "B-"+word_ents[i][1]
            else:
                if (word_ents[i][1] != word_ents[i-1][1]):
                    tag = "B-"+word_ents[i][1]
                else:
                    tag = "I-"+word_ents[i][1]

        new_ents.append([word_ents[i][0],tag])
    return new_ents

In [10]:
cleaned_tag_seqs = [clean_tags(ents) for ents in seqs]
cleaned_tag_seqs[3]

[['in', 'O'],
 ['the', 'O'],
 ['hospital3', 'O'],
 ['emergency', 'O'],
 ['her', 'O'],
 ['oxygen', 'B-MEASUREMENT'],
 ['saturation', 'I-MEASUREMENT'],
 ['was', 'I-MEASUREMENT'],
 ['100%', 'I-MEASUREMENT'],
 ['on', 'O'],
 ['cpap', 'O']]

### 3. Padding Sequences to a Specified Length

In [11]:
def pad_seq(seq,max_len):
    padded_seq = seq+[["<PAD>","O"]]*max_len
    return padded_seq[:max_len]
    
def pad_sequences(sequences,max_len=None):
    if max_len == None:
        max_len = max(len(seq) for seq in sequences)
    return [pad_seq(seq,max_len) for seq in sequences]

In [12]:
padded_seqs = pad_sequences(cleaned_tag_seqs,max_len=50)
padded_seqs[1]

[['two', 'B-TIME'],
 ['days', 'I-TIME'],
 ['prior', 'I-TIME'],
 ['to', 'I-TIME'],
 ['admission', 'I-TIME'],
 ['she', 'O'],
 ['was', 'O'],
 ['started', 'O'],
 ['on', 'O'],
 ['prednisone', 'B-DRUG'],
 ['taper', 'O'],
 ['and', 'O'],
 ['one', 'B-TIME'],
 ['day', 'I-TIME'],
 ['prior', 'I-TIME'],
 ['to', 'I-TIME'],
 ['admission', 'I-TIME'],
 ['she', 'O'],
 ['required', 'O'],
 ['oxygen', 'B-DRUG'],
 ['at', 'O'],
 ['home', 'O'],
 ['in', 'O'],
 ['order', 'O'],
 ['to', 'O'],
 ['maintain', 'O'],
 ['oxygen', 'B-MEASUREMENT'],
 ['saturation', 'I-MEASUREMENT'],
 ['greater', 'I-MEASUREMENT'],
 ['than', 'I-MEASUREMENT'],
 ['90%', 'I-MEASUREMENT'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O'],
 ['<PAD>', 'O']]

### 4. Mapping Words to Integer Values for Model Training

In [13]:
def get_word_ids(sentances,tag=False):
    words = []
    for sentance in sentances:
        words += list([word[tag] for word in sentance])
    word_dict = {word:i for i,word in enumerate(set(words))}
    return word_dict

In [14]:
word_ids = get_word_ids(padded_seqs)
tag_ids = get_word_ids(padded_seqs,tag=True)
print(list(word_ids.items())[:4])
print(list(tag_ids.items())[:4])

[('2112-6-5', 0), ('requiring', 1), ('gastric', 2), ('guaiac-positive', 3)]
[('B-EVENT', 0), ('B-DRUG', 1), ('I-TIME', 2), ('B-FREQUENCY', 3)]


In [15]:
def words_to_ids(sentances,word_ids,tag_ids):
    vector = []
    for sentance in sentances:
        vector.append(list([[word_ids[w[0]],tag_ids[w[1]]] for w in sentance]))
    return np.array(vector)

In [16]:
vectors = words_to_ids(padded_seqs,word_ids,tag_ids)
print(vectors[0][:4])
print('')
print("Word Representation:")
print(padded_seqs[0][:4])

[[5249   17]
 [4326   17]
 [4055   17]
 [5145   17]]

Word Representation:
[['history', 'O'], ['of', 'O'], ['present', 'O'], ['illness', 'O']]


In [17]:
def create_x_y(matrix,n_tags):
    x = []
    y = []
    for sequences in matrix:
        xi = [i[0] for i in sequences]
        yi = [i[1] for i in sequences]
        x.append(xi)
        y.append(yi)
    y = np.array([to_categorical(i,n_tags) for i in y])
    return np.array(x),np.array(y)

In [18]:
n_tags = len(tag_ids)
x,y = create_x_y(vectors,n_tags)
print("X-shape:",x.shape)
print(x[0][:5])
print('')
print("Y-shape:",y.shape)
print(y[0][:5])

X-shape: (2592, 50)
[5249 4326 4055 5145 2359]

Y-shape: (2592, 50, 23)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


### 5. Train Test Split 

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### 6. Specifying Model and Model Parameters

https://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/

In [20]:
def create_BiLSTM(n_words,n_tags,embedding_size,max_len):
    model = Sequential()
    model.add(Embedding(n_words,embedding_size,input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.2)))
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
    return model

In [21]:
max_len = 50
embedding_size = 100
n_words = len(word_ids)
n_tags = len(tag_ids)

model = create_BiLSTM(n_words,n_tags,embedding_size,max_len)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           530000    
_________________________________________________________________
dropout (Dropout)            (None, 50, 100)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 200)           160800    
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 23)            4623      
Total params: 695,423
Trainable params: 695,423
Non-trainable params: 0
_________________________________________________________________


### 7. Training Model

In [22]:
def train_model(model,x_train,y_train,batch_size=32,epochs=20,val_split = 0.1):

    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.0001,
                               patience=3,
                               mode='min',
                               verbose=1)
    
    
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(x_train, y_train, 
                        batch_size=32, 
                        epochs=epochs, 
                        validation_split=val_split, 
                        verbose=1,
                        callbacks=[early_stop]
                       )
    return history

In [23]:
epochs = 50
batch_size = 32
history = train_model(model,x_train,y_train,batch_size=batch_size,epochs=epochs,val_split = 0.1)

Train on 2098 samples, validate on 234 samples
Epoch 1/50
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 00015: early stopping


In [24]:
def get_id_mappings(ids):
    return {str(i[1]):i[0] for i in ids.items()}

def generate_sample(x,y,model):
    idx = random.randint(0,len(x))
    sample = x[idx]
    label = np.argmax(y[idx],axis=1)

    p = model.predict(sample.reshape(1,-1))
    p = np.argmax(p,axis=-1)
    print("{:25} {:20}: {:10}".format("Word", "True", "Pred"))
    print("-"*50)
    for i in range(len(sample)):
        word = str(sample[i])
        pred = str(p[0][i])
        true_val = str(label[i])
        id_to_words = get_id_mappings(word_ids)
        id_to_tags = get_id_mappings(tag_ids)
        print(f"{id_to_words[word]:25}{id_to_tags[true_val]:20}{id_to_tags[pred]}")
    return

In [25]:
generate_sample(x,y,model)

Word                      True                : Pred      
--------------------------------------------------
she                      O                   O
followed                 O                   O
up                       O                   O
with                     O                   O
the                      O                   O
cardiologist             B-ORGANIZATION      B-ORGANIZATION
np                       I-ORGANIZATION      B-TIME
one                      B-TIME              B-TIME
week                     I-TIME              I-TIME
later                    I-TIME              I-TIME
and                      O                   O
was                      O                   O
found                    O                   O
to                       O                   O
have                     O                   O
o2sats                   B-MEASUREMENT       B-MEASUREMENT
ranging                  I-MEASUREMENT       I-MEASUREMENT
from                     I-MEASURE

### 8. Evaluating Model Performance

In [26]:
def transform_ids_to_tags(preds,tag_ids):
    id_to_tags = get_id_mappings(tag_ids)

    tag_seqs = []
    for seq in preds:
        tag_seqs.append([id_to_tags[str(i)] for i in seq])
    return tag_seqs

def get_real_labels(model,x_test,y_test,tag_ids):
    test_preds = np.argmax(model.predict(x_test),axis=-1)
    true_vals = np.argmax(y_test,axis=-1)
    test_preds = transform_ids_to_tags(test_preds,tag_ids)
    true_vals = transform_ids_to_tags(true_vals,tag_ids)
    return true_vals,test_preds

In [27]:
true_vals,test_preds = get_real_labels(model,x_test,y_test,tag_ids)
report = classification_report(true_vals,test_preds)
print(report)

                   precision    recall  f1-score   support

CONDITION/SYMPTOM       0.46      0.43      0.44       300
             DRUG       0.65      0.55      0.59        86
         LOCATION       0.64      0.75      0.69       102
      MEASUREMENT       0.35      0.40      0.37        65
            EVENT       0.25      0.54      0.35       120
             TIME       0.28      0.36      0.31        36
     ORGANIZATION       0.57      0.31      0.40        13
           GENDER       0.62      0.89      0.73         9
           AMOUNT       0.40      0.33      0.37        63
              AGE       0.53      0.53      0.53        15
        FREQUENCY       0.00      0.00      0.00        12

        micro avg       0.42      0.48      0.45       821
        macro avg       0.45      0.48      0.46       821



In [29]:
model_f1 = f1_score(true_vals,test_preds)
print("F1-Score:",model_f1)

F1-Score: 0.4521640091116173


The model sets the baseline of NER prediction with an F1-Score of ~0.45. Next results are saved for comparison to later models such as CRFs and other word embedding methods.

### 9. Saving Model Results

In [30]:
n_samples = len(x)
model_desc = f"BiLSTM-EmbedSize-{embedding_size}"
results_file = "./nlp_data/model_results.csv"
note = "Simple BiLSTM Max Sequence Len of 50"
def append_model_results(model_f1,n_samples,model_desc,file,note):
    with open(file,'a') as f:
        results = f"\n{model_f1},{n_samples},{model_desc},{time.ctime()},{note}"
        f.writelines(results)
    print("~~~Results Successfully Saved")
    return

In [31]:
append_model_results(model_f1,n_samples,model_desc,results_file,note)

~~~Results Successfully Saved


In [32]:
pd.read_csv(results_file).tail()

Unnamed: 0,F1-Score,N-Samples,Model Type,Date,Note
14,0.606472,2592.0,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 7 13:30:51 2019,Used Custom Word2Vec Embeddings of entire Disc...
15,0.622917,2592.0,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 7 13:46:03 2019,Used Custom Word2Vec Embeddings of entire Disc...
16,0.545845,2592.0,BiLSTM-Glove-EmbedSize-100,Thu Nov 7 14:34:22 2019,Max Len Reverted Back to 50 Words
17,0.622508,2592.0,CRF,Thu Nov 7 14:41:08 2019,Simple CRF model
18,0.452164,2592.0,BiLSTM-EmbedSize-100,Thu Nov 7 15:01:08 2019,Simple BiLSTM Max Sequence Len of 50


---