### Read data, load packages and utility functions

In [None]:
!pip -q install mead-baseline gensim
!pip -q install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git#egg=sklearn_crfsuite
from gensim.models.keyedvectors import KeyedVectors
from baseline.utils import to_chunks
from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score
import os
from tqdm import tqdm
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.optimizers
from sklearn.model_selection import train_test_split 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.models import Sequential

In [None]:
def read_file(f):
    """Read a tab-separated file with a header row and tokenize its second column.

    The first line is treated as a header and skipped. For every remaining
    line, the first tab-separated field is the row id and the second field is
    split on single spaces into tokens.

    Args:
        f: Path of the file to read.

    Returns:
        A ``(row_id, data)`` tuple. ``row_id`` is a list of stripped id strings
        and ``data`` is a list of token lists, in file order.

    Raises:
        IndexError: If a non-blank line contains no tab character.
    """
    row_id = []
    data = []
    # The context manager closes the handle; the previous version leaked it.
    with open(f, 'r') as handle:
        next(handle, None)  # skip the header; an empty file yields two empty lists
        for line in handle:
            fields = line.split('\t')
            if len(fields) == 1 and not line.strip():
                # Blank line (typically a trailing newline at end of file).
                continue
            row_id.append(fields[0].strip())
            data.append(fields[1].strip().split(' '))
    return row_id, data

# Util function
def read_conll_file(_file):
    """Parse a CoNLL-style file into sentence ids, token lists and label lists.

    Line handling:
      * a line starting with ``#`` contributes its second whitespace-separated
        field as a sentence id;
      * a blank line closes the current sentence;
      * any other line contributes its first field as a token and its second
        field as that token's label.

    A final sentence that is not followed by a blank line is still returned.
    Note that every blank line closes a sentence, so consecutive blank lines
    produce empty sentences.

    Args:
        _file: Path of the file to read.

    Returns:
        ``(all_sentence_ids, all_sentences, all_labels)``, three lists.

    Raises:
        IndexError: If a ``#`` line or a token line has fewer than two fields.
    """
    all_sentences = []
    all_sentence_ids = []
    all_labels = []
    sentence = []
    labels = []
    # Open through a context manager so the handle is closed even on error.
    with open(_file) as handle:
        for line in tqdm(handle, desc=f"reading {_file}"):
            if line.startswith("#"):
                all_sentence_ids.append(re.split(r"\s+", line.strip())[1])
                continue
            stripped = line.strip()
            if not stripped:
                all_sentences.append(sentence)
                all_labels.append(labels)
                sentence = []
                labels = []
                continue
            # Split once and reuse the fields for both token and label.
            fields = re.split(r"\s+", stripped)
            sentence.append(fields[0])
            labels.append(fields[1])
    if sentence and labels:
        all_sentences.append(sentence)
        all_labels.append(labels)
    return all_sentence_ids, all_sentences, all_labels


def shorten_sentence_label(sentence_tokens, true_labels, pred_labels, maxlen):
    """Cut tokens and both label sequences to one common length.

    Args:
        sentence_tokens: Tokens of one sentence.
        true_labels: Gold labels (may be longer than the sentence, e.g. padded).
        pred_labels: Predicted labels (may be longer than the sentence).
        maxlen: Upper bound on the kept length; ``-1`` means "no bound, use the
            sentence length".

    Returns:
        A 3-tuple ``(tokens, true_labels, pred_labels)``, each sliced to the
        same cut-off.
    """
    n_tokens = len(sentence_tokens)
    # Without a cap the sentence length decides; with a cap the shorter of the
    # sentence length and the cap wins.
    cutoff = n_tokens if maxlen == -1 else min(n_tokens, maxlen)
    return tuple(seq[:cutoff] for seq in (sentence_tokens, true_labels, pred_labels))



def generate_conll(sentence_ids, all_sentence_tokens, all_sentence_true_labels, all_sentence_pred_labels, output_base,
                   maxlen=-1):
    """Write tokens with gold and predicted labels to ``{output_base}.conll``.

    Each token becomes one ``token true_label pred_label`` line, and each
    sentence is terminated by a single blank line.

    Args:
        sentence_ids: Sentence ids; only their count is checked, they are not
            written to the file.
        all_sentence_tokens: One token list per sentence.
        all_sentence_true_labels: One gold label list per sentence.
        all_sentence_pred_labels: One predicted label list per sentence.
        output_base: Output path without the ``.conll`` suffix.
        maxlen: Passed to ``shorten_sentence_label``; ``-1`` keeps the full
            sentence length.

    Raises:
        AssertionError: If the four inputs differ in length, or if a sentence's
            tokens and labels still differ in length after shortening.
    """
    assert len(sentence_ids) == len(all_sentence_tokens) == len(all_sentence_true_labels) == len(all_sentence_pred_labels)
    with open(f"{output_base}.conll", "w") as wf:
        for sentence_tokens, sentence_true_labels, sentence_pred_labels in zip(
                all_sentence_tokens, all_sentence_true_labels, all_sentence_pred_labels):
            sentence_tokens, sentence_true_labels, sentence_pred_labels = shorten_sentence_label(
                sentence_tokens, sentence_true_labels, sentence_pred_labels, maxlen)
            assert len(sentence_tokens) == len(sentence_true_labels) == len(sentence_pred_labels), \
                f"{len(sentence_tokens)}, {len(sentence_true_labels)}, {len(sentence_pred_labels)}"
            for token, true_label, pred_label in zip(sentence_tokens, sentence_true_labels, sentence_pred_labels):
                wf.write(f"{token} {true_label} {pred_label}\n")
            # The blank separator belongs after the sentence. It used to be
            # written after every token, which turned each token into its own
            # one-token "sentence" for any CoNLL reader.
            wf.write("\n")
    print(f"generated conll file {output_base}.conll")

def generate_labelseq(sentence_ids, all_sentence_tokens, all_sentence_pred_labels, output_base, maxlen=-1):
    """Write one predicted tag sequence per sentence to ``{output_base}_labelseq.txt``.

    The file starts with the header ``ID<TAB>TAGSEQ``; every following line is
    a sentence id, a tab, and the space-joined predicted labels.

    Args:
        sentence_ids: One id per sentence.
        all_sentence_tokens: One token list per sentence; used to trim the
            label sequence to the sentence length.
        all_sentence_pred_labels: One predicted label list per sentence.
        output_base: Output path without the ``_labelseq.txt`` suffix.
        maxlen: Passed to ``shorten_sentence_label``; ``-1`` keeps the full
            sentence length.

    Raises:
        AssertionError: If the inputs differ in length, or if a sentence has
            fewer labels than tokens after shortening.
    """
    assert len(sentence_ids) == len(all_sentence_tokens) == len(all_sentence_pred_labels)
    output_path = f"{output_base}_labelseq.txt"
    with open(output_path, "w") as wf:
        wf.write("ID\tTAGSEQ\n")
        for sentence_id, sentence_tokens, sentence_labels in zip(sentence_ids, all_sentence_tokens, all_sentence_pred_labels):
            # The predicted labels are passed twice because there are no gold
            # labels here; only the shortened tokens and labels are kept.
            sentence_tokens, _, sentence_labels = shorten_sentence_label(sentence_tokens, sentence_labels, sentence_labels, maxlen)
            assert len(sentence_tokens) == len(sentence_labels)
            wf.write(f'{sentence_id}\t{" ".join(sentence_labels)}\n')
    # Report the path that was actually written (the old message named a
    # non-existent "<output_base>.labelseq").
    print(f"generated labelseq file {output_path}")

def generate_human_readable(sentence_ids, all_sentence_tokens, all_sentence_pred_labels, output_base, maxlen=-1):
    """Write the text of each predicted chunk to ``{output_base}_human.txt``.

    The file starts with the header line ``Term``. Each sentence then produces
    one line holding the text of every predicted chunk, each followed by
    ``", "`` (so non-empty lines end with a trailing ``", "``). Sentence ids
    are not written.

    Args:
        sentence_ids: One id per sentence; only the count is checked.
        all_sentence_tokens: One token list per sentence.
        all_sentence_pred_labels: One predicted label list per sentence.
        output_base: Output path without the ``_human.txt`` suffix.
        maxlen: Passed to ``shorten_sentence_label``; ``-1`` keeps the full
            sentence length.

    Raises:
        AssertionError: If the inputs differ in length, or if a sentence has
            fewer labels than tokens after shortening.
    """
    def create_chunk(tokens, chunk_def):
        # chunk_def is parsed as "<type>@<idx>@<idx>...". Presumably these are
        # the token positions of one chunk as emitted by baseline's to_chunks;
        # verify against the installed mead-baseline version.
        _chunk_type, *raw_indices = chunk_def.split("@")
        chunk_indices = [int(x) for x in raw_indices]
        # Slice from the first index through the last index, inclusive.
        return " ".join(tokens[chunk_indices[0]: chunk_indices[-1] + 1])

    assert len(sentence_ids) == len(all_sentence_tokens) == len(all_sentence_pred_labels)
    output_path = f"{output_base}_human.txt"
    with open(output_path, "w") as wf:
        wf.write("Term\n")
        for _sentence_id, sentence_tokens, sentence_labels in zip(sentence_ids, all_sentence_tokens, all_sentence_pred_labels):
            # The predicted labels are passed twice because there are no gold
            # labels here; only the shortened tokens and labels are kept.
            sentence_tokens, _, sentence_labels = shorten_sentence_label(sentence_tokens, sentence_labels, sentence_labels, maxlen)
            assert len(sentence_tokens) == len(sentence_labels)
            chunks = to_chunks(sentence_labels, span_type="iob")
            for chunk in chunks:
                wf.write(create_chunk(sentence_tokens, chunk) + ", ")
            wf.write("\n")
    # Report the path that was actually written (the old message said
    # "labelseq" and named a non-existent "<output_base>.human").
    print(f"generated human-readable file {output_path}")


def predict_tags_for_file(_file, model, _word2idx, _label2idx, output_base, output_formats=("human_readable", "labelseq")):
    """Tag every sentence of a text file with ``model`` and write the requested outputs.

    Args:
        _file: Path of a tab-separated ``ID<TAB>TEXT`` file, read with ``read_file``.
        model: Object whose ``predict(X)`` returns per-token class scores; the
            predicted class is the argmax over the last axis.
        _word2idx: Token -> integer index mapping; must contain ``"PAD"`` and
            every token that occurs in ``_file``.
        _label2idx: Label -> integer index mapping; it is inverted here to
            decode the predictions.
        output_base: Path prefix for the generated files.
        output_formats: Any of ``"labelseq"`` and ``"human_readable"``.

    Raises:
        KeyError: If a token of ``_file`` is missing from ``_word2idx``.
    """
    sentence_ids, sen_texts = read_file(_file)
    # Use the mappings passed by the caller. The previous version silently read
    # the notebook globals `word2idx` and `idx2label`, so the `_word2idx` and
    # `_label2idx` arguments had no effect on encoding or decoding.
    X = [[_word2idx[tok] for tok in sent] for sent in sen_texts]
    X = pad_sequences(maxlen=max(len(x) for x in sen_texts), sequences=X, padding="post", value=_word2idx["PAD"])

    Y_pred = np.argmax(model.predict(X), axis=-1)

    _idx2label = {idx: label for label, idx in _label2idx.items()}
    Y_pred_labels = [[_idx2label[i] for i in row] for row in Y_pred]
    # NOTE(review): MAXLEN is still read from the notebook's global scope; it
    # caps how many tokens and labels per sentence are written out.
    if "labelseq" in output_formats:
        generate_labelseq(
            sentence_ids=sentence_ids,
            all_sentence_tokens=sen_texts,
            all_sentence_pred_labels=Y_pred_labels,
            output_base=output_base,
            maxlen=MAXLEN
        )
    if "human_readable" in output_formats:
        generate_human_readable(
            sentence_ids=sentence_ids,
            all_sentence_tokens=sen_texts,
            all_sentence_pred_labels=Y_pred_labels,
            output_base=output_base,
            maxlen=MAXLEN
        )

In [None]:
# Download and unpack the review dataset, but only when the review_data
# directory is not already present.
# The `!` lines are notebook shell escapes, so this cell needs an
# IPython/Jupyter/Colab kernel with wget and unzip available.
if not os.path.exists("review_data"):
    !wget https://www.dropbox.com/s/yqgff7de73iwosr/review_data.zip?dl=1 -O review_data.zip
    !unzip review_data.zip
    !ls review_data 

In [None]:
# Load the tokenized review text, the matching label sequences, and the
# unlabeled test text. read_file returns (row_ids, token_lists).
# NOTE(review): `id` shadows the Python builtin and is overwritten by each
# call, so only the test-file ids survive; consider a different name.
id, X = read_file("review_data/REVIEW_TEXT.txt")
id, y = read_file("review_data/REVIEW_LABELSEQ.txt") 
id, test_text = read_file("review_data/TEST_REVIEW_TEXT.txt") 

# Hold out 20% of the labeled sentences for validation.
# NOTE(review): no random_state is passed, so the split differs between runs.
train_text, valid_text, train_labels, valid_labels = train_test_split(X, y, test_size = 0.2)

### Data preprocessing, Sequence padding

In [None]:
# Vocabulary over all three splits, so every token seen at prediction time has
# an index.
unique_words = set([j for i in train_text + valid_text + test_text for j in i])
# Index 0 is reserved for padding. Sorting makes the token -> index assignment
# reproducible across runs (plain set iteration order depends on string hash
# randomization). A literal "PAD" token is left out of the enumeration so the
# indices stay contiguous in range(len(word2idx)).
word2idx = {j: i + 1 for i, j in enumerate(sorted(unique_words - {"PAD"}))}
word2idx["PAD"] = 0
print(f"{len(word2idx)} tokens in vocab")

unique_labels = set([j for i in train_labels for j in i])
# Compare against the validation labels; this set used to be rebuilt from
# train_labels, which made the assertion below always pass.
unique_labels_valid = set([j for i in valid_labels for j in i])
# NOTE(review): no gold labels are loaded for the test split in this notebook
# (only TEST_REVIEW_TEXT.txt is read), so this set is still derived from
# train_labels and the corresponding assertion is vacuous.
unique_labels_test = set([j for i in train_labels for j in i])

# make sure there are no labels in valid/test that are not in train.
assert not unique_labels_valid - unique_labels, unique_labels_valid - unique_labels
assert not unique_labels_test - unique_labels, unique_labels_test - unique_labels

# Label index 0 is the padding label; real labels get 1..N in sorted order.
label2idx = {'PAD': 0}
for i,j in enumerate(sorted(unique_labels - {'PAD'})):
    label2idx[j] = i+1
idx2label = {j:i for i,j in label2idx.items()}
print(idx2label)

# Longest labeled sentence (train and validation); used as the padded length.
MAXLEN = max(len(x) for x in X)

def get_padded_x_y(text, labels, _maxlen, _word2idx, _label2idx):
    """Encode and pad sentences and their labels for the sequence tagger.

    Args:
        text: One token list per sentence.
        labels: One label list per sentence, aligned with ``text``.
        _maxlen: Length every sequence is padded to (padding is appended).
        _word2idx: Token -> index mapping; must contain ``"PAD"``.
        _label2idx: Label -> index mapping; must contain ``"PAD"``.

    Returns:
        ``(X, Y)``: ``X`` is the padded index array returned by
        ``pad_sequences``; ``Y`` is a list with one one-hot encoded label array
        per sentence, using ``len(_label2idx)`` classes.

    Raises:
        KeyError: If a token or label is missing from its mapping.
    """
    # Use the mappings passed in. The previous version read the notebook
    # globals `word2idx` and `label2idx` for the lookups and the class count,
    # so the function ignored its own arguments except for the PAD value.
    X = [[_word2idx[tok] for tok in sent] for sent in text]
    X = pad_sequences(maxlen=_maxlen, sequences=X, padding="post", value=_word2idx["PAD"])
    Y = [[_label2idx[tag] for tag in sent] for sent in labels]
    Y = pad_sequences(maxlen=_maxlen, sequences=Y, padding="post", value=_label2idx["PAD"])
    Y = [to_categorical(row, num_classes=len(_label2idx)) for row in Y]
    assert len(X) == len(Y), "X and Y should be of the same shape"
    return X, Y

# Encode and pad the train and validation splits to MAXLEN positions.
X_train, Y_train = get_padded_x_y(train_text, train_labels, _maxlen=MAXLEN, _word2idx=word2idx, _label2idx=label2idx)
X_valid, Y_valid = get_padded_x_y(valid_text, valid_labels, _maxlen=MAXLEN, _word2idx=word2idx, _label2idx=label2idx)
# The test split is not encoded here: no `test_labels` variable is loaded in this notebook.
# X_test, Y_test = get_padded_x_y(test_text, test_labels, _maxlen=MAXLEN, _word2idx=word2idx, _label2idx=label2idx)

### LSTM model (without pretrained embeddings)

In [None]:
EMBED_DIM = 300
RNN_HIDDEN_DIM = 300

# Tagger: an embedding layer learned from scratch, four stacked bidirectional
# LSTMs with increasing dropout, and a per-token softmax over all label
# indices (including PAD).
model = Sequential(
    [
        Embedding(input_dim=len(word2idx), output_dim=EMBED_DIM, input_length=MAXLEN),
    ]
    + [
        Bidirectional(
            LSTM(units=RNN_HIDDEN_DIM, return_sequences=True, dropout=rate),
            merge_mode='concat',
        )
        for rate in (0.1, 0.13, 0.16, 0.2)
    ]
    + [
        Dense(len(label2idx), activation="softmax"),
    ]
)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

In [None]:
import numpy as np
# Alternative kept for reference: validate on the explicit X_valid/Y_valid split.
# history = model.fit(X_train,np.array(Y_train),batch_size=16,epochs=3,validation_data=(X_valid, np.array(Y_valid)))
# Train for 10 epochs; validation_split=0.2 asks Keras to hold out part of
# X_train for validation, so X_valid/Y_valid are not used by this call.
# Y_train is a list of per-sentence arrays and is stacked with np.array first.
history = model.fit(X_train, np.array(Y_train), batch_size=16, epochs=10, validation_split=0.2)

In [None]:
# Predict on the held-out validation split and decode class indices to labels.
Y_valid_pred = model.predict(X_valid)
# argmax over the last axis picks the highest-scoring class per position.
Y_valid_pred = np.argmax(Y_valid_pred, axis=-1)
# Y_valid is one-hot encoded, so argmax recovers the gold class indices.
Y_valid_true = np.argmax(Y_valid, -1)
# NOTE(review): rows cover every padded position of X_valid, so the decoded
# sequences include PAD positions as well as real tokens.
Y_valid_pred_labels = [[idx2label[i] for i in row] for row in Y_valid_pred]
Y_valid_true_labels = [[idx2label[i] for i in row] for row in Y_valid_true]

In [None]:
# Labels to report on; 'PAD' is deliberately not listed.
labels = ['B-AE','B-SSI','I-AE', 'I-SSI', 'O']

# Per-label report and macro-averaged F1 over the flattened validation sequences.
report = flat_classification_report(y_pred=Y_valid_pred_labels, y_true=Y_valid_true_labels,
                                    labels = labels)
f1_score = flat_f1_score(y_pred=Y_valid_pred_labels, y_true=Y_valid_true_labels, average = 'macro', labels = labels)
print(f'Model 7\nF1_Score (macro): {f1_score:.3f}\nClassification report:\n{report}')

In [None]:
# NOTE(review): batchsz is assigned but not passed to the call below.
batchsz = 16

# Tag the test file with the first model; predict_tags_for_file writes
# test_output_labelseq.txt and test_output_human.txt.
predict_tags_for_file("review_data/TEST_REVIEW_TEXT.txt", model=model, _label2idx=label2idx, _word2idx=word2idx, output_base="test_output", 
                      output_formats=["human_readable", "labelseq"])


### Try pretrained word embeddings (GloVe 840B, 300d)

In [None]:
!gdown 1-2HyX0Ak1rOKa7UG6_mfnZY0Lvk-NTwz
path_to_glove_file = 'glove.840B.300d.txt'


embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
num_tokens = len(word2idx)
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix: row i holds the pretrained vector of the token with
# index i; tokens without a usable pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    # Test for a missing or wrongly sized vector explicitly. The previous
    # try/except never fired for missing words: assigning None into a float
    # array stores NaN instead of raising, so those words were counted as hits
    # and left NaN rows behind.
    if embedding_vector is not None and np.shape(embedding_vector) == (embedding_dim,):
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

# Guard against NaN values inside the pretrained vectors themselves:
# check for nan values and fill with 0
check_nan = np.isnan(embedding_matrix)
embedding_matrix[check_nan] = 0

In [None]:
EMBED_DIM = 300
RNN_HIDDEN_DIM = 300

# Tagger with a frozen embedding layer initialized from embedding_matrix,
# three stacked bidirectional LSTMs with decreasing dropout, and a per-token
# softmax over all label indices (including PAD).
model_emb = Sequential(
    [
        Embedding(
            input_dim=len(word2idx),
            output_dim=EMBED_DIM,
            input_length=MAXLEN,
            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
            trainable=False,
        ),
    ]
    + [
        Bidirectional(
            LSTM(units=RNN_HIDDEN_DIM, return_sequences=True, dropout=rate),
            merge_mode='concat',
        )
        for rate in (0.2, 0.16, 0.13)
    ]
    + [
        Dense(len(label2idx), activation="softmax"),
    ]
)

model_emb.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['acc'])
model_emb.summary()

In [None]:
# Train the pretrained-embedding model for 10 epochs; validation_split=0.2 asks
# Keras to hold out part of X_train, so X_valid/Y_valid are not used here.
history = model_emb.fit(X_train, np.array(Y_train), batch_size=16, epochs=10, validation_split=0.2)

In [None]:
# Predict on the held-out validation split with model_emb and decode class
# indices to labels.
Y_valid_pred = model_emb.predict(X_valid)
# argmax over the last axis picks the highest-scoring class per position.
Y_valid_pred = np.argmax(Y_valid_pred, axis=-1)
# Y_valid is one-hot encoded, so argmax recovers the gold class indices.
Y_valid_true = np.argmax(Y_valid, -1)
# NOTE(review): rows cover every padded position of X_valid, so the decoded
# sequences include PAD positions as well as real tokens.
Y_valid_pred_labels = [[idx2label[i] for i in row] for row in Y_valid_pred]
Y_valid_true_labels = [[idx2label[i] for i in row] for row in Y_valid_true]

# Labels to report on; 'PAD' is deliberately not listed.
labels = ['B-AE','B-SSI','I-AE', 'I-SSI', 'O']

# Per-label report and macro-averaged F1 over the flattened validation sequences.
report = flat_classification_report(y_pred=Y_valid_pred_labels, y_true=Y_valid_true_labels,
                                    labels = labels)
f1_score = flat_f1_score(y_pred=Y_valid_pred_labels, y_true=Y_valid_true_labels, average = 'macro', labels = labels)
print(f'Model 9\nF1_Score (macro): {f1_score:.3f}\nClassification report:\n{report}')

### Try pretrained word embeddings (BioWordVec, 200d)

In [None]:
# Download the binary BioWordVec vectors into the current working directory
# (notebook shell escape; requires wget).
!wget https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin

In [None]:
# convert binary word embedding file to txt file (this cell may take ~30 mins to run in colab environment)

from gensim.models.keyedvectors import KeyedVectors

# NOTE(review): absolute Colab path; the wget cell saves the file into the
# current working directory, which is /content only on Colab.
path = "/content/BioWordVec_PubMed_MIMICIII_d200.vec.bin"

# Keep the vectors under their own name. Binding them to `model` replaced the
# first Keras tagger held in that variable, so re-running any earlier cell that
# uses `model` would have received a KeyedVectors object instead.
bio_word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)
bio_word_vectors.save_word2vec_format("BioWordVec.txt", binary=False)

In [None]:
# NOTE(review): this guard checks for BioWordVec.txt, but the body downloads
# and unpacks review_data.zip (the same commands as the dataset cell). It looks
# like a copy-paste leftover and does not create BioWordVec.txt; confirm what
# was meant to be fetched here.
if not os.path.exists("BioWordVec.txt"):
    !wget https://www.dropbox.com/s/yqgff7de73iwosr/review_data.zip?dl=1 -O review_data.zip
    !unzip review_data.zip
    !ls review_data 

In [None]:
path = 'BioWordVec.txt'

# Map each BioWordVec token to its float32 vector.
embeddings_index = {}
with open(path, encoding="utf-8") as f:
    for line_no, line in enumerate(f):
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        if line_no == 0 and coefs.size == 1:
            # A word2vec text file starts with a "<vocab_size> <dim>" header;
            # skip it instead of storing it as a one-element "word vector".
            continue
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
num_tokens = len(word2idx)
embedding_dim = 200 #biowordvec is 200d
hits = 0
misses = 0

# Prepare embedding matrix: row i holds the pretrained vector of the token with
# index i; tokens without a usable pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    # Test for a missing or wrongly sized vector explicitly. The previous
    # try/except never fired for missing words: assigning None into a float
    # array stores NaN instead of raising, so those words were counted as hits
    # and left NaN rows behind.
    if embedding_vector is not None and np.shape(embedding_vector) == (embedding_dim,):
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

# Guard against NaN values inside the pretrained vectors themselves:
# check for nan values and fill with 0
check_nan = np.isnan(embedding_matrix)
embedding_matrix[check_nan] = 0

In [None]:
EMBED_DIM = 200
RNN_HIDDEN_DIM = 300

# Tagger with a trainable embedding layer initialized from embedding_matrix,
# three stacked bidirectional LSTMs with decreasing dropout, and a per-token
# softmax over all label indices (including PAD).
model_emb = Sequential(
    [
        Embedding(
            input_dim=len(word2idx),
            output_dim=EMBED_DIM,
            input_length=MAXLEN,
            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
            trainable=True,
        ),
    ]
    + [
        Bidirectional(
            LSTM(units=RNN_HIDDEN_DIM, return_sequences=True, dropout=rate),
            merge_mode='concat',
        )
        for rate in (0.2, 0.16, 0.13)
    ]
    + [
        Dense(len(label2idx), activation="softmax"),
    ]
)

model_emb.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['acc'])
model_emb.summary()

# Train for 10 epochs, letting Keras hold out part of X_train for validation.
history = model_emb.fit(
    X_train,
    np.array(Y_train),
    batch_size=16,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# Predict on the held-out validation split with model_emb and decode class
# indices to labels.
Y_valid_pred = model_emb.predict(X_valid)
# argmax over the last axis picks the highest-scoring class per position.
Y_valid_pred = np.argmax(Y_valid_pred, axis=-1)
# Y_valid is one-hot encoded, so argmax recovers the gold class indices.
Y_valid_true = np.argmax(Y_valid, -1)
# NOTE(review): rows cover every padded position of X_valid, so the decoded
# sequences include PAD positions as well as real tokens.
Y_valid_pred_labels = [[idx2label[i] for i in row] for row in Y_valid_pred]
Y_valid_true_labels = [[idx2label[i] for i in row] for row in Y_valid_true]

# Labels to report on; 'PAD' is deliberately not listed.
labels = ['B-AE','B-SSI','I-AE', 'I-SSI', 'O']

# Per-label report and macro-averaged F1 over the flattened validation sequences.
report = flat_classification_report(y_pred=Y_valid_pred_labels, y_true=Y_valid_true_labels,
                                    labels = labels)
f1_score = flat_f1_score(y_pred=Y_valid_pred_labels, y_true=Y_valid_true_labels, average = 'macro', labels = labels)
print(f'Model 11\nF1_Score (macro): {f1_score:.3f}\nClassification report:\n{report}')

### generate final prediction (model used: Glove embedding) 

In [None]:
# NOTE(review): batchsz is assigned but not passed to the call below.
batchsz = 16

# NOTE(review): when the notebook is run top to bottom, model_emb is the model
# built in the BioWordVec section (it reuses the name of the GloVe model), so
# confirm which model is meant to produce the final predictions.
# This call uses the same output_base as the earlier prediction cell, so it
# overwrites test_output_labelseq.txt and test_output_human.txt.
# human readable has been modified to include only texts from the prediction made
predict_tags_for_file("review_data/TEST_REVIEW_TEXT.txt", model=model_emb, _label2idx=label2idx, _word2idx=word2idx, output_base="test_output", 
                      output_formats=["human_readable", "labelseq"])