In [None]:
%matplotlib inline

In [None]:
# Author: Qingzhou Li and Leo Zhang based on https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html by Robert Guthrie

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import numpy as np
import json
torch.manual_seed(1)
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report

Helper functions to make the code more readable.



In [None]:
def argmax(vec):
    """Return, as a plain Python int, the index of the largest entry along dim 1.

    ``vec`` must be 2-D with a single row, because the resulting index
    tensor is converted with ``.item()``.
    """
    best_index = torch.max(vec, 1)[1]
    return best_index.item()


def prepare_sequence(seq, to_ix):
    """Map every token in ``seq`` to its index in ``to_ix``.

    Returns a 1-D ``torch.long`` tensor with one index per token.
    A token missing from ``to_ix`` raises ``KeyError``.
    """
    token_ids = []
    for token in seq:
        token_ids.append(to_ix[token])
    return torch.tensor(token_ids, dtype=torch.long)


## Bi-LSTM for RE (Toy sample)

Create model



In [None]:
# We first use a bi-LSTM to get a contextual embedding for each token. We then get each entity's embedding by averaging the embeddings of its constituent tokens.
# Finally we concatenate the head and tail entity embeddings and predict the relation from the concatenated embedding.

class BiLSTM(nn.Module):
    """Bi-LSTM relation classifier for one (head, tail) entity pair in a sentence.

    Each token is embedded and run through a single-layer bidirectional LSTM.
    The head and tail entity vectors are the mean of the LSTM outputs over
    their token spans; the two vectors are concatenated and projected to one
    score per relation.

    Args:
        vocab_size: number of rows in the word-embedding table.
        relation_to_ix: mapping from relation label to class index; its length
            fixes the number of output scores.
        embedding_dim: size of each word embedding.
        hidden_dim: size of the concatenated forward+backward LSTM output per
            token. Must be even: each direction gets ``hidden_dim // 2`` units
            and ``forward`` reshapes the output to ``hidden_dim`` columns.
    """

    def __init__(self, vocab_size, relation_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.relation_to_ix = relation_to_ix
        self.relation_size = len(relation_to_ix)

        # Word embeddings.
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        # Contextual embeddings for each token using a bi-LSTM.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the concatenated (head, tail) embedding into relation space.
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.relation_size)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair for a batch of one sequence.

        NOTE(review): the state is drawn with ``torch.randn`` on every call,
        so two forward passes on the same input differ unless the RNG is
        re-seeded. Kept as-is to preserve the training behaviour.
        """
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    @staticmethod
    def _span_mean(token_states, span, role):
        """Average the rows ``token_states[span[0]:span[1]]``.

        Raises:
            ValueError: if the slice selects no tokens. The mean of an empty
                slice is NaN, which would silently corrupt the loss.
        """
        start, end = span[0], span[1]
        span_states = token_states[start:end]
        if span_states.shape[0] == 0:
            raise ValueError(
                "%s span [%s:%s] selects no tokens in a sentence of length %d"
                % (role, start, end, token_states.shape[0])
            )
        return torch.mean(span_states, dim=0)

    def cross_entropy_loss(self, sentence, head, tail, tags):
        """Return the cross-entropy loss of one entity pair against ``tags``.

        ``tags`` holds a single class index (a 0-d or one-element long tensor).
        Scores and target are given an explicit batch dimension of 1, which
        yields the same value as the unbatched call and also works on PyTorch
        releases that lack unbatched cross-entropy support.
        """
        feats = self.forward(sentence, head, tail)
        loss_function = nn.CrossEntropyLoss()
        return loss_function(feats.unsqueeze(0), tags.reshape(1))

    def forward(self, sentence, head, tail):
        """Score every relation for the entity pair (``head``, ``tail``).

        Args:
            sentence: 1-D long tensor of word indices.
            head: ``[start, end)`` token span of the head entity.
            tail: ``[start, end)`` token span of the tail entity.

        Returns:
            1-D tensor with ``relation_size`` unnormalised scores.

        Raises:
            ValueError: if either span selects no tokens.
        """
        # Get embeddings for each token.
        self.hidden = self.init_hidden()
        seq_len = len(sentence)
        embeds = self.word_embeds(sentence).view(seq_len, 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(seq_len, self.hidden_dim)
        # Entity embedding = average of the embeddings of its tokens.
        lstm_head = self._span_mean(lstm_out, head, "head")
        lstm_tail = self._span_mean(lstm_out, tail, "tail")
        # Concatenate head and tail embeddings, then predict the relation.
        lstm_entities = torch.cat([lstm_head, lstm_tail])
        return self.hidden2tag(lstm_entities)



In [None]:
# Import the training data.
# JSON text is UTF-8; pass the encoding explicitly so the read does not
# depend on the platform's default locale encoding.
with open("relation.txt", "r", encoding="utf-8") as fp:
  relation_data = json.load(fp)

In [None]:
# Import the test data.
# JSON text is UTF-8; pass the encoding explicitly so the read does not
# depend on the platform's default locale encoding.
with open("test_relation.txt", "r", encoding="utf-8") as fp:
  test_data = json.load(fp)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled examples for validation; the fixed
# random_state keeps the split identical between runs.
train, val = train_test_split(
    relation_data,
    test_size=0.2,
    random_state=1234,
)

In [None]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    Args:
        tolerance: number of non-improving checks after which
            ``early_stop`` returns True.
        min_delta: margin above the best loss seen so far that a loss must
            exceed to count as non-improving.
    """

    def __init__(self, tolerance=5, min_delta=0):
        self.tolerance = tolerance
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        """Record one validation loss and return True if training should stop.

        ``validation_loss`` may be a Python number or a 0-d tensor; it is
        converted to ``float`` so the stored best loss is a plain number.
        A new best loss resets the counter. A loss more than ``min_delta``
        above the best, or a NaN loss, increments it. A NaN compares False
        against everything, so without the explicit check a diverged run
        would never trigger early stopping.
        """
        loss = float(validation_loss)
        if loss < self.min_validation_loss:
            self.min_validation_loss = loss
            self.counter = 0
        elif np.isnan(loss) or loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.tolerance:
                return True
        return False


Run training



In [None]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 64

training_data = train
validation_data = val

# Build the vocabulary: every distinct token gets the next free index, in
# first-seen order across the training, validation and test sets.
word_to_ix = {}
for dataset in (training_data, validation_data, test_data):
    for item in dataset:
        for word in item[1]:
            word_to_ix.setdefault(word, len(word_to_ix))

# Relation labels, listed in index order (START/STOP occupy indices 0 and 1).
relation_labels = [START_TAG, STOP_TAG, 'Strength-Drug', 'Form-Drug', 'Route-Drug',
                   'Frequency-Drug', 'Reason-Drug', 'ADE-Drug', 'Dosage-Drug',
                   'Duration-Drug']
relation_to_ix = {label: ix for ix, label in enumerate(relation_labels)}
ix_to_relation = {ix: label for label, ix in relation_to_ix.items()}

model = BiLSTM(len(word_to_ix), relation_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
# You can change this to an adaptive optimiser such as Adam.
optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=1e-4)
early_stopping = EarlyStopping(tolerance=5, min_delta=0)

# Per-epoch histories and the epoch counter used by the training cell.
train_loss = []
validation_loss = []
epoch_i = 0

# Sanity check before training: encode the first training sentence and show
# the label of its first relation.
first_example = training_data[0]
with torch.no_grad():
    precheck_sent = prepare_sequence(first_example[1], word_to_ix)
    precheck_tags = torch.tensor([relation_to_ix[first_example[2][0][2]]], dtype=torch.float)
    print('label',precheck_tags)




label tensor([5.])


In [None]:
# Train for up to MAX_EPOCHS epochs with early stopping on the mean validation loss.
# Requires prepare_sequence, model, optimizer, early_stopping, word_to_ix,
# relation_to_ix and the train_loss / validation_loss / epoch_i accumulators
# from the cells above.
MAX_EPOCHS = 50

for i in tqdm(range(MAX_EPOCHS)):
    # ---- Training pass: one SGD step per (head, tail, relation) instance ----
    train_loss_sum = 0.0
    train_count = 0
    for key, sentence, relations in training_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        for head, tail, rel in relations:
            # PyTorch accumulates gradients, so clear them before each instance.
            model.zero_grad()

            targets = torch.tensor(relation_to_ix[rel], dtype=torch.long)

            # Forward pass, then gradients and parameter update.
            loss = model.cross_entropy_loss(sentence_in, head, tail, targets)
            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()
            train_count += 1
    # Record the mean loss over the epoch, not just the last instance's loss.
    train_loss.append(train_loss_sum / train_count if train_count else float("nan"))

    # ---- Validation pass: no gradients, no parameter updates ----
    val_loss_sum = 0.0
    val_count = 0
    with torch.no_grad():
        for val_key, val_sentence, val_relations in validation_data:
            val_sentence_in = prepare_sequence(val_sentence, word_to_ix)
            # Iterate this validation sentence's own relations, not the
            # `relations` variable left over from the training loop.
            for val_head, val_tail, val_rel in val_relations:
                val_targets = torch.tensor(relation_to_ix[val_rel], dtype=torch.long)
                val_loss_sum += model.cross_entropy_loss(
                    val_sentence_in, val_head, val_tail, val_targets
                ).item()
                val_count += 1
    val_loss = val_loss_sum / val_count if val_count else float("nan")

    # Exactly one history entry per epoch.
    validation_loss.append(val_loss)
    epoch_i += 1

    # Early stopping on the epoch's mean validation loss.
    if early_stopping.early_stop(val_loss):
      print("We are at epoch:", epoch_i)
      break

 30%|███       | 15/50 [23:41<55:17, 94.77s/it]

We are at epoch: 16





In [None]:
# Check predictions: print the predicted relation for every entity pair of
# the first training example.
with torch.no_grad():
    key, sentence, relations = training_data[0]
    sentence_in = prepare_sequence(sentence, word_to_ix)
    for head, tail, _ in relations:
        scores = model(sentence_in, head, tail).numpy()
        best_ix = np.argmax(scores)
        print('prediction: ', ix_to_relation[best_ix])

prediction:  Frequency-Drug
prediction:  Strength-Drug
prediction:  Form-Drug
prediction:  Dosage-Drug
prediction:  Form-Drug
prediction:  Route-Drug


In [None]:
# Individual RE model performance measure
class_names = ['Strength-Drug', 'Form-Drug', 'Route-Drug', 'Frequency-Drug', 'Reason-Drug', 'ADE-Drug',
                  'Dosage-Drug', 'Duration-Drug']
# Pin each reported class to its index. Without `labels`, classification_report
# infers the label set from the data and raises if it does not line up with
# `target_names` (e.g. a class is absent, or the model predicts index 0 or 1).
report_labels = [relation_to_ix[name] for name in class_names]

pred = []
true = []
with torch.no_grad():
    for key, sentence, relations in test_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        # Collect the prediction and the gold label for each entity pair.
        for head, tail, rel in relations:
            scores = model(sentence_in, head, tail).numpy()
            pred.append(int(np.argmax(scores)))
            true.append(relation_to_ix[rel])

print(f1_score(true, pred, average='weighted'))
print(classification_report(true, pred, labels=report_labels, target_names=class_names))

0.9425704313809986
                precision    recall  f1-score   support

 Strength-Drug       0.96      0.96      0.96      4211
     Form-Drug       0.95      0.96      0.96      4304
    Route-Drug       0.98      0.94      0.96      3503
Frequency-Drug       0.98      0.98      0.98      3961
   Reason-Drug       0.83      0.96      0.89      2442
      ADE-Drug       0.80      0.50      0.61       607
   Dosage-Drug       0.95      0.94      0.95      2657
 Duration-Drug       0.93      0.84      0.88       393

      accuracy                           0.94     22078
     macro avg       0.92      0.89      0.90     22078
  weighted avg       0.94      0.94      0.94     22078



In [None]:
import pickle

# Load the pickled examples used for the end-to-end evaluation.
# NOTE(review): unpickling can execute arbitrary code; only load "re_test"
# if it comes from a trusted source.
with open("re_test", "rb") as pickle_file:
  test_data_re = pickle.load(pickle_file)

In [None]:
# Preview the first five end-to-end test examples.
print(test_data_re[:5])

[['0-1', ['He', 'received', '2mg', 'IV', 'ativan', 'with', 'somnolence', 'and', 'apenea'], [[[6, 7], [4, 5], 'ADE-Drug'], [[8, 9], [4, 5], 'ADE-Drug'], [[3, 4], [4, 5], 'Route-Drug'], [[2, 3], [4, 5], 'Strength-Drug']]], ['0-6', ['Patient', 'developed', 'SVT', 'in', 'the', 'cath', 'lab', 'and', 'was', 'treated', 'with', '9', 'mg', 'IV', 'metoprolol'], [[[2, 3], [14, 15], 'Reason-Drug'], [[13, 14], [14, 15], 'Route-Drug'], [[11, 13], [14, 15], 'Strength-Drug']]], ['0-11', ['Evidence', 'of', 'right', 'heart', 'strain,', 'manifest', 'as', 'reflux', 'of', 'IV', 'contrast'], [[[9, 10], [10, 11], 'Route-Drug']]], ['0-15', ['He', 'was', 'started', 'on', 'heparin', 'ggt', 'in', 'house', 'and', 'transitioned', 'to', 'an', 'anticoagulation', 'regimen', 'of', 'coumadin', ',', 'with', 'a', 'lovenox'], [[[5, 6], [4, 5], 'Route-Drug']]], ['0-22', ['He', 'was', 'maintained', 'on', 'a', 'valium', 'CIWA', 'and', 'klonipin', '1mg', 'TID'], [[[10, 11], [8, 9], 'Frequency-Drug'], [[9, 10], [8, 9], 'Streng

In [None]:
# end to end RE pipeline model performance measure
class_names = ['Strength-Drug', 'Form-Drug', 'Route-Drug', 'Frequency-Drug', 'Reason-Drug', 'ADE-Drug',
                  'Dosage-Drug', 'Duration-Drug']
# Pin each reported class to its index. Without `labels`, classification_report
# infers the label set from the data and raises if it does not line up with
# `target_names` (e.g. a class is absent, or the model predicts index 0 or 1).
e2e_report_labels = [relation_to_ix[name] for name in class_names]

e2e_pred = []
e2e_true = []
with torch.no_grad():
    for key, sentence, relations in test_data_re:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        # Collect the prediction and the gold label for each entity pair.
        for head, tail, rel in relations:
            scores = model(sentence_in, head, tail).numpy()
            e2e_pred.append(int(np.argmax(scores)))
            e2e_true.append(relation_to_ix[rel])

print(f1_score(e2e_true, e2e_pred, average='weighted'))
print(classification_report(e2e_true, e2e_pred, labels=e2e_report_labels, target_names=class_names))

0.9854062611625906
                precision    recall  f1-score   support

 Strength-Drug       0.98      0.99      0.98      2961
     Form-Drug       0.99      1.00      0.99      3342
    Route-Drug       0.99      0.99      0.99      2700
Frequency-Drug       0.99      0.99      0.99      2612
   Reason-Drug       0.96      0.98      0.97      1270
      ADE-Drug       0.84      0.76      0.80       150
   Dosage-Drug       0.99      0.97      0.98      1993
 Duration-Drug       0.99      0.94      0.96       215

      accuracy                           0.99     15243
     macro avg       0.97      0.95      0.96     15243
  weighted avg       0.99      0.99      0.99     15243



## Hyperparameter tuning





In [None]:
1. optimiser: SGD, lr = 0.01, HIDDEN_DIM = 64, tolerance=5
  weighted avg f1: 0.90, 13 epochs
1. optimiser: SGD, lr = 0.01, HIDDEN_DIM = 64, tolerance=5
  weighted avg f1: 0.9063, 7 epochs
2. optimiser: SGD, lr = 0.1, HIDDEN_DIM = 64, tolerance=5
  weighted avg f1: 0.9485, 8 epochs
2. optimiser: SGD, lr = 0.1, HIDDEN_DIM = 64, tolerance=5
  weighted avg f1: 0.9538, 21 epochs
2. optimiser: SGD, lr = 0.1, HIDDEN_DIM = 64, tolerance=5
  weighted avg f1: 0.9471, 8 epochs
3. optimiser: SGD, lr = 0.1, HIDDEN_DIM = 16, tolerance=5
  weighted avg f1: 0.9499, 9 epochs
3. optimiser: SGD, lr = 0.1, HIDDEN_DIM = 16, tolerance=5
  weighted avg f1: 0.9497, 11 epochs
4. optimiser: Adam, lr = 0.1, HIDDEN_DIM = 16, tolerance=5
  weighted avg f1: 0.64, 10 epochs
5. optimiser: Adam, lr = 0.1, HIDDEN_DIM = 64, tolerance=5
  weighted avg f1: 0.5843, 11 epochs
6. optimiser: SGD, lr = 1, HIDDEN_DIM = 64, tolerance=5
  weighted avg f1: 0.5168, 6 epochs
7. optimiser: SGD, lr = 0.1, HIDDEN_DIM = 32, tolerance=5
  weighted avg f1: 0.9494, 12 epochs
8. optimiser: SGD, lr = 0.1, HIDDEN_DIM = 128, tolerance=5
  weighted avg f1: 0.9531, 26 epochs