# **Kaggle Dataset**
[GMB v2.2.0 dataset on Kaggle](https://www.kaggle.com/bradbolliger/gmb-v220)

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

In [None]:
!ls ../input/gmb-v220/gmb-2.2.0/data

In [None]:
import os

# Root folder of the GMB corpus inside the Kaggle input mount.
data_root = "../input/gmb-v220/gmb-2.2.0/data"

# Recursively gather the path of every ``*.tags`` annotation file, in the
# order in which os.walk yields them.
fnames = [
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk(data_root)
    for entry in entries
    if entry.endswith(".tags")
]
fnames[:2]

In [None]:
!mkdir ner

In [None]:
import csv
import collections

# Tally of the raw NER tags (subcategory included) as read from the corpus files.
ner_tags = collections.Counter()
# Tally of the IOB-formatted tags emitted by iob_format().
iob_tags = collections.Counter()

def strip_ner_subcat(tag):
    """Return the top-level category of an NER tag.

    Tags have the form ``{cat}-{subcat}`` (for example ``tim-dow``); everything
    from the first hyphen onward is dropped. A tag without a hyphen is
    returned unchanged.
    """
    category, _sep, _subcat = tag.partition("-")
    return category

def iob_format(ners):
    """Convert one sentence's IO-style NER tags into IOB tags.

    Example: ``O, PERSON, PERSON, O, O, LOCATION, O`` becomes
    ``O, B-PERSON, I-PERSON, O, O, B-LOCATION, O``.

    A non-``O`` tag gets the ``I-`` prefix when it repeats the tag directly
    before it, and ``B-`` otherwise (including at the start of the sentence).
    ``O`` tags are passed through untouched.

    Side effect: every emitted tag is counted in the module-level
    ``iob_tags`` counter.

    Returns the list of IOB tags, in input order.
    """
    iob_tokens = []
    previous = None  # raw tag of the preceding token; None at sentence start
    for current in ners:
        if current == 'O':
            tagged = current
        else:
            prefix = "I-" if current == previous else "B-"
            tagged = prefix + current
        iob_tokens.append(tagged)
        iob_tags[tagged] += 1
        previous = current
    return iob_tokens

# Convert every GMB ``.tags`` file into a CSV under ./ner with one row per
# sentence: space-joined words, space-joined IOB NER tags, space-joined POS tags.
total_sentences = 0
outfiles = []
# Create the output folder here so the cell does not depend on a separate
# `mkdir` step having been run (and can be re-run safely).
os.makedirs("./ner", exist_ok=True)
for idx, file in enumerate(fnames):
    with open(file, "rb") as content:
        data = content.read().decode("utf-8").strip()
    # Sentences are separated by blank lines. Drop empty chunks so that an
    # empty input file yields zero sentences instead of one malformed one.
    sentences = [chunk for chunk in data.split("\n\n") if chunk.strip()]
    print(idx, file, len(sentences))
    total_sentences += len(sentences)

    # Prefix with the index so files sharing a basename do not overwrite each other.
    out_path = "./ner/" + str(idx) + "-" + os.path.basename(file)
    # newline="" is what the csv module expects of its file object; UTF-8 matches
    # the encoding the input was decoded with and the default pandas reads with.
    with open(out_path, "w", newline="", encoding="utf-8") as outfile:
        outfiles.append(out_path)
        writer = csv.writer(outfile)

        for sentence in sentences:
            words, pos, ner = [], [], []
            for tok in sentence.split("\n"):
                # Tab-separated columns: [0] is the word, [1] the POS tag,
                # [3] the NER tag (with subcategory).
                t = tok.split("\t")
                words.append(t[0])
                pos.append(t[1])
                ner_tags[t[3]] += 1
                ner.append(strip_ner_subcat(t[3]))
            writer.writerow([" ".join(words), " ".join(iob_format(ner)), " ".join(pos)])

In [None]:
print("total number of sentences: ", total_sentences)

In [None]:
print(ner_tags)
print(iob_tags)

# **Normalizing and vectorizing**

In [None]:
import glob
import pandas as pd
# could use `outfiles` param as well
# glob returns matches in arbitrary order; sort them so the row order of the
# combined frame (and the positional train/test split made from it later) is
# the same on every run.
files = sorted(glob.glob("./ner/*.tags"))
data_pd = pd.concat(
    [pd.read_csv(f, header=None, names=["text", "label", "pos"]) for f in files],
    ignore_index=True,
)

In [None]:
data_pd.info()

In [None]:
data_pd.head()

In [None]:
### Keras tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

# Settings shared by the three tokenizers: keep the original casing, split on
# single spaces, and reserve '<OOV>' as the out-of-vocabulary token.
shared_tok_args = dict(lower=False, split=' ', oov_token='<OOV>')

# Word tokenizer: filters out the characters [ \ ] ^ as well as tab and newline.
text_tok = Tokenizer(filters='[\\]^\t\n', **shared_tok_args)

# POS and NER tag tokenizers: filter out tab and newline only.
pos_tok = Tokenizer(filters='\t\n', **shared_tok_args)

ner_tok = Tokenizer(filters='\t\n', **shared_tok_args)

In [None]:
text_tok.fit_on_texts(data_pd['text'])
pos_tok.fit_on_texts(data_pd['pos'])
ner_tok.fit_on_texts(data_pd['label'])

In [None]:
ner_config = ner_tok.get_config()
text_config = text_tok.get_config()
print("ner_config:\n", ner_config)
# print("text_config:\n", text_config)
# print("pos_config:\n", pos_config)

In [None]:
# index to word
import json

# get_config() stores the index -> word table as a JSON string. Parse it with
# json.loads instead of eval: nothing gets executed, and JSON escape sequences
# are decoded as JSON rather than as Python string literals.
text_vocab = json.loads(text_config['index_word'])
ner_vocab = json.loads(ner_config['index_word'])

print("Unique words in vocab:", len(text_vocab))
print("Unique NER tags in vocab:", len(ner_vocab))

In [None]:
x_tok = text_tok.texts_to_sequences(data_pd['text'])
y_tok = ner_tok.texts_to_sequences(data_pd['label'])

In [None]:
from tensorflow.keras.preprocessing import sequence
max_len = 50
x_pad = sequence.pad_sequences(x_tok, padding='post', maxlen=max_len)
y_pad = sequence.pad_sequences(y_tok, padding='post', maxlen=max_len)
print(x_pad.shape, y_pad.shape)

In [None]:
num_classes = len(ner_vocab) + 1

# One-Hot
Y = tf.keras.utils.to_categorical(y_pad, num_classes=num_classes)
Y.shape

# **BiLSTM**

In [None]:
# Length of the vocabulary
vocab_size = len(text_vocab) + 1

# The embedding dimension
embedding_dim = 64

# Number of RNN units
rnn_units = 100

#batch size
BATCH_SIZE=90

# num of NER classes
num_classes = len(ner_vocab)+1

In [None]:
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense

def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size, classes):
    """Build the (uncompiled) BiLSTM tagging model.

    Args:
        vocab_size: size of the embedding's input vocabulary.
        embedding_dim: width of the token embeddings.
        rnn_units: units per LSTM direction; also the width of the hidden
            per-token dense layer.
        batch_size: fixed batch dimension of the model input.
        classes: number of output classes of the final softmax.

    Returns:
        A ``tf.keras.Sequential`` model: Embedding -> BiLSTM -> per-token
        Dense(relu) -> Dense(softmax).
    """
    model = tf.keras.Sequential([
        # mask_zero=True treats token id 0 as padding.
        Embedding(vocab_size, embedding_dim, mask_zero=True,
                  batch_input_shape=[batch_size, None]),
        Bidirectional(LSTM(units=rnn_units,
                           return_sequences=True,
                           dropout=0.5,
                           kernel_initializer=tf.keras.initializers.he_normal())),
        TimeDistributed(Dense(rnn_units, activation="relu")),
        # Size the output layer from the `classes` argument rather than from
        # whatever the global `num_classes` happens to be.
        Dense(classes, activation="softmax"),
    ])

    return model

In [None]:
model = build_model_bilstm(
                        vocab_size = vocab_size,
                        embedding_dim=embedding_dim,
                        rnn_units=rnn_units,
                        batch_size=BATCH_SIZE,
                        classes=num_classes)
model.summary()
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# to enable TensorFlow to process sentences properly
X = x_pad
# create training and testing splits
# Take the sentence count from the data itself instead of a hard-coded number,
# so the split stays correct if the corpus changes.
total_sentences = len(X)
# Hold out roughly 20% of the sentences, rounded to a whole number of batches.
test_size = round(total_sentences / BATCH_SIZE * 0.2)
X_train = X[BATCH_SIZE*test_size:]
Y_train = Y[BATCH_SIZE*test_size:]

# The first test_size batches' worth of rows form the test set.
X_test = X[0:BATCH_SIZE*test_size]
Y_test = Y[0:BATCH_SIZE*test_size]

In [None]:
model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=15)

In [None]:
model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)

In [None]:
y_pred = model.predict(X_test, batch_size=BATCH_SIZE)

In [None]:
text_tok.sequences_to_texts([X_test[1]])

In [None]:
ner_tok.sequences_to_texts([y_pad[1]])

In [None]:
y_pred = tf.argmax(y_pred, -1)
y_pred.shape

In [None]:
y_pnp = y_pred.numpy()
ner_tok.sequences_to_texts([y_pnp[1]])

# **BiLSTM-CRF**

In [None]:
import tensorflow_addons as tfa
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

In [None]:
class CRFLayer(Layer):
    """
    CRF output layer for sequence tagging.

    During training ``call`` returns the logits unchanged; the CRF negative
    log-likelihood is computed separately by ``loss``.
    At prediction time ``call`` Viterbi-decodes each sequence into tag ids,
    padded with ``mask_id`` up to the input's sequence length.
    """
    def __init__(self,
                 label_size,
                 mask_id=0,
                 trans_params=None,
                 name='crf',
                 **kwargs):
        """
        Args:
            label_size: number of tags; the transition matrix is
                (label_size, label_size).
            mask_id: id that marks padding in label matrices and that is used
                to pad decoded paths.
            trans_params: optional existing transition matrix to reuse; when
                None a new matrix is created.
            name: layer name.
        """
        super(CRFLayer, self).__init__(name=name, **kwargs)
        self.label_size = label_size
        self.mask_id = mask_id

        if trans_params is None:  # not reloading pretrained params
            # Trainable, so the transitions show up in the layer's trainable
            # weights and are learned together with the rest of the model.
            self.transition_params = tf.Variable(
                tf.random.uniform(shape=(label_size, label_size)),
                trainable=True, name="transitions")
        else:
            self.transition_params = trans_params

    def get_seq_lengths(self, matrix):
        """Return, per row, the number of entries that differ from ``mask_id``."""
        # matrix is of shape (batch_size, max_seq_len)
        mask = tf.not_equal(matrix, self.mask_id)
        seq_lengths = tf.math.reduce_sum(
                                        tf.cast(mask, dtype=tf.int32),
                                        axis=-1)
        return seq_lengths

    def call(self, inputs, seq_lengths, training=None):
        """Return the logits while training, Viterbi-decoded tag ids otherwise.

        Args:
            inputs: per-token logits; unpacked as a rank-3 shape below.
            seq_lengths: per-sequence count of real (non-padding) tokens.
            training: training flag; falls back to the Keras learning phase
                when None.
        """
        if training is None:
            training = K.learning_phase()

        # during training, this layer just returns the logits
        if training:
            return inputs

        # viterbi decode logic to return proper
        # results at inference
        _, max_seq_len, _ = inputs.shape
        paths = []
        # NOTE(review): iterating over the batch tensor in Python presumably
        # needs eager execution (the model is built with dynamic=True) - confirm.
        for logit, text_len in zip(inputs, seq_lengths):
            # Decode only the real tokens, then pad back to the common length.
            viterbi_path, _ = tfa.text.viterbi_decode(logit[:text_len],
                                                      self.transition_params)
            paths.append(self.pad_viterbi(viterbi_path, max_seq_len))

        return tf.convert_to_tensor(paths)

    def pad_viterbi(self, viterbi, max_seq_len):
        """Right-pad a decoded path (a list) with ``mask_id`` to ``max_seq_len``."""
        if len(viterbi) < max_seq_len:
            viterbi = viterbi + [self.mask_id] * (max_seq_len - len(viterbi))
        return viterbi

    def get_proper_labels(self, y_true):
        """Return tag ids; labels of rank > 2 (one-hot) are argmax-ed first."""
        shape = y_true.shape
        if len(shape) > 2:
            return tf.argmax(y_true, -1, output_type=tf.int32)
        return y_true

    def loss(self, y_true, y_pred):
        """Return the mean CRF negative log-likelihood of the batch.

        Args:
            y_true: tag ids (batch, seq) or one-hot labels (batch, seq, tags);
                entries equal to ``mask_id`` are treated as padding.
            y_pred: the logits returned by ``call`` in training mode.
        """
        y_pred = tf.convert_to_tensor(y_pred)
        # Tag indices are integers; do not cast them to the logits' float dtype.
        y_true = tf.cast(self.get_proper_labels(y_true), tf.int32)

        seq_lengths = self.get_seq_lengths(y_true)
        # Pass the layer's own transition matrix. Without it,
        # crf_log_likelihood creates a brand-new random matrix on every call,
        # so the transitions would never be learned.
        log_likelihoods, _ = tfa.text.crf_log_likelihood(
            y_pred, y_true, seq_lengths, self.transition_params)
        # calc loss
        loss = - tf.reduce_mean(log_likelihoods)
        return loss

In [None]:
from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.layers import Dropout, Bidirectional
from tensorflow.keras import backend as K

class NerModel(tf.keras.Model):
    """BiLSTM-CRF tagger: embedding -> BiLSTM -> per-token dense -> CRF layer."""

    def __init__(self, hidden_num, vocab_size, label_size, embedding_size,
                 name='BilstmCrfModel', **kwargs):
        """Create the sub-layers.

        Args:
            hidden_num: LSTM units per direction.
            vocab_size: size of the embedding's input vocabulary.
            label_size: number of tags (output width of the dense layer and
                size handed to the CRF layer).
            embedding_size: width of the token embeddings.
            name: model name.
        """
        super().__init__(name=name, **kwargs)
        self.num_hidden = hidden_num
        self.vocab_size = vocab_size
        self.label_size = label_size

        self.embedding = Embedding(vocab_size, embedding_size,
                                   mask_zero=True, name="embedding")
        self.biLSTM = Bidirectional(LSTM(hidden_num, return_sequences=True),
                                    name="bilstm")
        self.dense = TimeDistributed(tf.keras.layers.Dense(label_size),
                                     name="dense")
        self.crf = CRFLayer(self.label_size, name="crf")

    def call(self, text, labels=None, training=None):
        """Run the tagger on a batch of padded token-id sequences.

        ``labels`` is accepted but not used. The result is whatever the CRF
        layer returns for the given ``training`` flag.
        """
        if training is None:
            training = K.learning_phase()

        # Token id 0 is padding: the number of non-zero ids in a row is that
        # sequence's real length.
        is_token = tf.math.not_equal(text, 0)
        seq_lengths = tf.math.reduce_sum(tf.cast(is_token, dtype=tf.int32),
                                         axis=-1)

        embedded = self.embedding(text)
        encoded = self.biLSTM(embedded)
        logits = self.dense(encoded)
        return self.crf(logits, seq_lengths, training)

In [None]:
# Length of the vocabulary in chars
vocab_size = len(text_vocab)+1 # len(chars)

# The embedding dimension
embedding_dim = 64

# Number of RNN units
rnn_units = 100

#batch size
BATCH_SIZE=90

# num of NER classes
num_classes = len(ner_vocab)+1

blc_model = NerModel(rnn_units, vocab_size, num_classes, embedding_dim, dynamic=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [None]:
# create training and testing splits
# Take the sentence count from the data itself instead of a hard-coded number,
# so the split stays correct if the corpus changes.
total_sentences = len(x_pad)
# Hold out roughly 20% of the sentences, rounded to a whole number of batches.
test_size = round(total_sentences / BATCH_SIZE * 0.2)
X_train = x_pad[BATCH_SIZE*test_size:]
Y_train = Y[BATCH_SIZE*test_size:]

# The first test_size batches' worth of rows form the test set.
X_test = x_pad[0:BATCH_SIZE*test_size]
Y_test = Y[0:BATCH_SIZE*test_size]
Y_train_int = tf.cast(Y_train, dtype=tf.int32)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train_int))
# drop_remainder=True discards a trailing partial batch, so every batch has
# exactly BATCH_SIZE rows.
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Running mean of the batch losses. It is never reset, so the value printed
# below is the mean over all batches seen so far, across epochs.
loss_metric = tf.keras.metrics.Mean()

epochs = 5

# Iterate over epochs.
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))

    # Iterate over the batches of the dataset.
    for step, (text_batch, labels_batch) in enumerate(train_dataset):
        # One-hot labels -> integer tag ids.
        labels_max = tf.argmax(labels_batch, -1, output_type=tf.int32)
        # Only the forward pass and the loss need to be recorded by the tape.
        with tf.GradientTape() as tape:
            logits = blc_model(text_batch, training=True)
            # the custom loss
            loss = blc_model.crf.loss(labels_max, logits)

        # Compute and apply the gradients after leaving the tape context, so
        # the backward pass and optimizer update are not recorded as well.
        grads = tape.gradient(loss, blc_model.trainable_weights)
        optimizer.apply_gradients(zip(grads, blc_model.trainable_weights))

        loss_metric(loss)
        if step % 50 == 0:
            print('step %s: mean loss = %s' % (step, loss_metric.result()))

In [None]:
Y_test_int = tf.cast(Y_test, dtype=tf.int32)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test, Y_test_int))
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)

out = blc_model.predict(test_dataset.take(1))

In [None]:
text_tok.sequences_to_texts([X_test[2]])

In [None]:
print("Ground Truth: ",
ner_tok.sequences_to_texts([tf.argmax(Y_test[2],-1).numpy()]))
print("Prediction: ", ner_tok.sequences_to_texts([out[2]]))

In [None]:
def np_precision(pred, true):
    """Return the fraction of matching tags, ignoring zero (padding) entries.

    Both arguments must be 2-D numpy arrays of the same shape. A position is
    scored only when it is non-zero in BOTH ``pred`` and ``true``; the result
    is the mean of the element-wise equality over those positions.
    """
    # expect numpy arrays
    assert pred.shape == true.shape
    assert len(pred.shape) == 2
    both_labelled = (pred != 0) & (true != 0)
    hits = pred[both_labelled] == true[both_labelled]
    return np.mean(hits.astype(int))

In [None]:
np_precision(out, tf.argmax(Y_test[:BATCH_SIZE], -1).numpy())