# Installing Dependencies

In [None]:
!pip install seqeval
!pip install -q -U tensorflow-text
!pip install -q tf-models-official

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 6.6MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp37-none-any.whl size=16184 sha256=9b13671f3759471a85c7c992da2c4fddbb449fb22657b191105d2e08ae2102f8
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
[K     |████████████████████████████████| 4.3MB 25.0MB/s 
[K     |████████████████████████████████| 1.6MB 26.8MB/s 
[K     |████████████████████████████████| 686kB 35.4MB/s 
[K     |████████████████████████████████| 102kB 10.2MB/s 
[K     |████████████████████████████████| 1.2MB 31.5MB/s 
[

# Importing and Mounting Drive

In [None]:
import os
from google.colab import drive
import numpy as np
import tensorflow as tf
from fastprogress.fastprogress import master_bar, progress_bar
import math
from seqeval.metrics import classification_report
import json
# Mount Google Drive at /content/drive; the project directory used by the
# following cells lives inside it.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
cd drive/My Drive/Mestrado Unicamp/MO444/P4/

/content/drive/My Drive/Mestrado Unicamp/MO444/P4


# Build Dataset

In [None]:
from bert_preprocessor import NerProcessor, convert_examples_to_features, build_dataset
from bert_modeling import BertConfig, BertModel
from tokenization import FullTokenizer
# ---- Run configuration ----------------------------------------------------
continue_training = True  # when True, previously saved weights are reloaded after the model is built

# Batching and schedule length
train_batch_size = 8  # larger values may cause Out of Memory
eval_batch_size = 64
num_train_epochs = 1

# Tokenisation / evaluation split
max_seq_length = 128
do_lower_case = False
eval_on = "dev"

# Locations, relative to the current working directory
data_dir = "conll2003/"
output_dir = "Fabio"
bert_model = "bert-base-cased"

# Optimiser hyper-parameters
warmup_proportion = 0.1
learning_rate = 5e-5
weight_decay = 0.01
adam_epsilon = 1e-8

# ---- Labels ---------------------------------------------------------------
processor = NerProcessor()
label_list = processor.get_labels()
# Label ids start at 1, so id 0 has no entry in label_map; the class count
# includes that extra slot (presumably the padding id -- confirm in bert_preprocessor).
label_map = dict(enumerate(label_list, start=1))
num_labels = len(label_list) + 1

# ---- Examples -> features -> batched datasets -----------------------------
train_examples, valid_examples, test_examples = (
    load_split(data_dir)
    for load_split in (
        processor.get_train_examples,
        processor.get_valid_examples,
        processor.get_test_examples,
    )
)

tokenizer = FullTokenizer(os.path.join(bert_model, "vocab.txt"), do_lower_case)

train_features, valid_features, test_features = (
    convert_examples_to_features(split_examples, label_list, max_seq_length, tokenizer)
    for split_examples in (train_examples, valid_examples, test_examples)
)

batched_train_data, batched_valid_data, batched_test_data = (
    build_dataset(split_features, split_batch_size)
    for split_features, split_batch_size in (
        (train_features, train_batch_size),
        (valid_features, eval_batch_size),
        (test_features, eval_batch_size),
    )
)

# ---- Summary --------------------------------------------------------------
print("  Num examples =", len(train_examples))
print("  Batch size =", train_batch_size)

# Whole batches per epoch (fractional part discarded) times the number of epochs.
num_train_optimization_steps = int(len(train_examples) / train_batch_size) * num_train_epochs
print("  Optimization will require", num_train_optimization_steps, "steps.")

07/14/2021 12:46:31 - INFO - bert_preprocessor -   *** Example ***
07/14/2021 12:46:31 - INFO - bert_preprocessor -   guid: train-0
07/14/2021 12:46:31 - INFO - bert_preprocessor -   tokens: EU rejects German call to boycott British la ##mb .
07/14/2021 12:46:31 - INFO - bert_preprocessor -   input_ids: 101 7270 22961 1528 1840 1106 21423 1418 2495 12913 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/14/2021 12:46:31 - INFO - bert_preprocessor -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/14/2021 12:46:31 - INFO - bert_preprocessor -   segment_ids: 0 0 0 0 0 0 0 0 0 0 

  Num examples = 14041
  Batch size = 8
  Optimization will require 1755 steps.


In [None]:
# Show the first example of the first training batch, then stop iterating.
for batch in batched_train_data:
    input_ids, input_mask, segment_ids, valid_ids, label_ids, label_mask = batch
    preview = (
        ("input_ids =", input_ids),      # token ids
        ("input_mask =", input_mask),
        ("valid_ids =", valid_ids),
        ("segment_ids =", segment_ids),  # for this task, always 0
        ("label_ids =", label_ids),      # NER labels
        ("label_mask =", label_mask),
    )
    for caption, tensor in preview:
        print(caption, tensor[0])
    break

input_ids = tf.Tensor(
[  101  7438 10532 26728  1855  1117  6880  1313  1197  1111  8850   119
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0], shape=(128,), dtype=int64)
input_mask = tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

# Model

In [None]:
class BertNer(tf.keras.Model):
    """Token classifier: pretrained BERT encoder -> dropout -> softmax Dense layer."""

    def __init__(self, bert_model, float_type, num_labels, max_seq_length):
        '''
        bert_model : str, directory of the pretrained BERT model; it must contain
                     bert_config.json and bert_model.ckpt
        float_type : dtype handed to the BERT layer and the classifier (e.g. tf.float32)
        num_labels : number of output classes of the token classifier
        max_seq_length : fixed length of the token sequences fed to the model
        '''
        super(BertNer, self).__init__()
        assert isinstance(bert_model, str)

        # The three symbolic inputs of the encoder, each of length max_seq_length.
        input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
        input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
        input_segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_segment_ids')

        bert_config = BertConfig.from_json_file(os.path.join(bert_model, "bert_config.json"))
        bert_layer = BertModel(config=bert_config, float_type=float_type)

        # The layer returns two outputs; only the second (per-token) one is used.
        _, sequence_output = bert_layer(input_word_ids, input_mask, input_segment_ids)

        self.bert = tf.keras.Model(inputs=[input_word_ids, input_mask, input_segment_ids], outputs=[sequence_output])

        # Load the pretrained encoder weights and fail loudly if objects that
        # already exist in the model are not matched by the checkpoint.
        init_checkpoint = os.path.join(bert_model, "bert_model.ckpt")
        checkpoint = tf.train.Checkpoint(model=self.bert)
        checkpoint.restore(init_checkpoint).assert_existing_objects_matched()

        self.dropout = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)

        initializer = tf.keras.initializers.TruncatedNormal(stddev=bert_config.initializer_range)
        # NOTE: the classifier applies softmax itself, so the model outputs
        # probabilities rather than raw logits.
        self.classifier = tf.keras.layers.Dense(
            num_labels, kernel_initializer=initializer, activation='softmax', name='output', dtype=float_type)

    @staticmethod
    def _left_align_valid(sequence_output, valid_ids):
        """Pack the vectors whose valid_ids entry is 1 to the front of each row.

        Relative order of the kept vectors is preserved and the remaining
        trailing positions are filled with zeros. This is a vectorised
        equivalent of looping over every (row, position) pair in Python.
        """
        is_valid = tf.cast(tf.equal(valid_ids, 1), tf.int32)
        # A stable descending sort of the 0/1 flags puts valid positions first
        # without reordering them among themselves.
        order = tf.argsort(is_valid, axis=-1, direction='DESCENDING', stable=True)
        packed = tf.gather(sequence_output, order, batch_dims=1)
        # Zero out everything after the last valid vector of each row.
        num_valid = tf.reduce_sum(is_valid, axis=-1)
        keep = tf.sequence_mask(num_valid, maxlen=tf.shape(sequence_output)[1])
        return tf.where(keep[..., tf.newaxis], packed, tf.zeros_like(packed))

    def call(self, input_word_ids, input_mask=None, input_segment_ids=None, valid_ids=None, training=False, **kwargs):
        """Return per-token class probabilities.

        input_word_ids, input_mask, input_segment_ids : encoder inputs, one row per
            sequence (the encoder is built for rows of length max_seq_length).
        valid_ids : same layout; positions equal to 1 are kept and packed to the
            front of each row, everything else is dropped. When None, the encoder
            output is used as is.
        training : enables dropout; also forwarded to the encoder.
        **kwargs : forwarded to the encoder call.
        """
        # Forward `training` explicitly so the encoder mode never depends on
        # implicit propagation from the outer call.
        sequence_output = self.bert([input_word_ids, input_mask, input_segment_ids], training=training, **kwargs)
        if valid_ids is not None:
            sequence_output = self._left_align_valid(sequence_output, valid_ids)
        sequence_output = self.dropout(sequence_output, training=training)
        logits = self.classifier(sequence_output)
        return logits

ner = BertNer(bert_model, tf.float32, num_labels, max_seq_length)
if continue_training:
    # Call the model once on dummy input before loading the saved weights.
    # The dummy batch follows max_seq_length instead of a hard-coded 128, so
    # changing the configured sequence length does not break this cell.
    ids = tf.ones((1, max_seq_length), dtype=tf.int64)
    _ = ner(ids, ids, ids, ids, training=False)
    ner.load_weights(os.path.join(output_dir, "model.h5"))

# Training

In [None]:
from optimization import AdamWeightDecay, WarmUp

# Number of initial steps spent warming up the learning rate.
warmup_steps = int(warmup_proportion * num_train_optimization_steps)

# Decay the learning rate from `learning_rate` to 0.0 over the whole run ...
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=learning_rate,
    decay_steps=num_train_optimization_steps,
    end_learning_rate=0.0,
)
# ... and wrap it in a warm-up phase unless the warm-up length is zero.
if warmup_steps != 0:
    learning_rate_fn = WarmUp(
        initial_learning_rate=learning_rate,
        decay_schedule_fn=learning_rate_fn,
        warmup_steps=warmup_steps,
    )

optimizer = AdamWeightDecay(
    learning_rate=learning_rate_fn,
    weight_decay_rate=weight_decay,
    epsilon=adam_epsilon,
    exclude_from_weight_decay=['layer_norm', 'bias'],
)

# Reduction.NONE: the loss object returns one value per selected token
# instead of a single scalar.
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)

# Running mean of the loss, shown in the progress bar.
loss_metric = tf.keras.metrics.Mean()
epoch_bar = master_bar(range(num_train_epochs))

In [None]:
# Number of batches per epoch, used as the progress-bar total.
pb_max_len = math.ceil(len(train_features) / train_batch_size)


def train_step(input_ids, input_mask, segment_ids, valid_ids, label_ids, label_mask):
    """Run one optimisation step on a single batch.

    The forward pass runs in training mode; only the positions selected by
    label_mask contribute to the loss. Returns the un-reduced loss tensor
    produced by loss_fct (which is configured with Reduction.NONE).
    """
    with tf.GradientTape() as tape:
        probs = ner(input_ids, input_mask, segment_ids, valid_ids, training=True)
        # Apply the same mask to targets and predictions so they stay aligned.
        targets = tf.boolean_mask(label_ids, label_mask)
        selected_probs = tf.boolean_mask(probs, label_mask)
        loss = loss_fct(targets, selected_probs)

    # Back-propagate and apply the update.
    variables = ner.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(list(zip(grads, variables)))
    return loss

for epoch in epoch_bar:
    batches = progress_bar(batched_train_data, total=pb_max_len, parent=epoch_bar)
    for input_ids, input_mask, segment_ids, valid_ids, label_ids, label_mask in batches:
        loss = train_step(input_ids, input_mask, segment_ids, valid_ids, label_ids, label_mask)
        # Fold this batch into the running mean and show it next to the bar.
        loss_metric.update_state(loss)
        epoch_bar.child.comment = f'loss : {loss_metric.result()}'
    # Start every epoch with a fresh running mean.
    loss_metric.reset_states()

(8, 128)


KeyboardInterrupt: ignored

## Saving

In [None]:
# Make sure the target directory exists before writing into it.
os.makedirs(output_dir, exist_ok=True)
ner.save_weights(os.path.join(output_dir, "model.h5"))

# Everything needed to rebuild the model and decode its predictions.
# NOTE: JSON object keys are strings, so the integer keys of label_map are
# written as strings and must be converted back to int after loading.
model_config = {"bert_model": bert_model, "do_lower": do_lower_case,
                "max_seq_length": max_seq_length, "num_labels": num_labels,
                "label_map": label_map}
# Context manager so the file is flushed and closed deterministically.
with open(os.path.join(output_dir, "model_config.json"), "w") as config_file:
    json.dump(model_config, config_file, indent=4)

# Loading

In [None]:
ner = BertNer(bert_model, tf.float32, num_labels, max_seq_length)
# Call the model once on dummy input before loading the saved weights. The
# dummy batch follows max_seq_length instead of a hard-coded 128.
ids = tf.ones((1, max_seq_length), dtype=tf.int64)
_ = ner(ids, ids, ids, ids, training=False)
ner.load_weights(os.path.join(output_dir, "model.h5"))

# Evaluate

In [None]:
# Pick the split to evaluate on and the matching progress-bar length.
if eval_on == "dev":
    batched_eval_data = batched_valid_data
    pb_max_len = math.ceil(float(len(valid_features))/float(eval_batch_size))
elif eval_on == "test":
    batched_eval_data = batched_test_data
    pb_max_len = math.ceil(float(len(test_features))/float(eval_batch_size))
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f'eval_on must be "dev" or "test", got {eval_on!r}')
print("eval_on =", eval_on)

# NOTE: this metric is re-created here but never updated in this cell.
loss_metric = tf.keras.metrics.Mean()
epoch_bar = master_bar(range(1))

# The label with the highest id marks the end of a sentence in label_ids.
end_label_id = len(label_map)

y_true = []
y_pred = []
for epoch in epoch_bar:
    for (input_ids, input_mask, segment_ids, valid_ids, label_ids, label_mask) in progress_bar(batched_eval_data, total=pb_max_len, parent=epoch_bar):
        probs = ner(input_ids, input_mask, segment_ids, valid_ids, training=False)
        # Convert to NumPy once per batch instead of once per element.
        pred_ids = tf.argmax(probs, axis=2).numpy()
        true_ids = label_ids.numpy()
        for true_row, pred_row in zip(true_ids, pred_ids):
            true_tags = []
            pred_tags = []
            # Position 0 is skipped; tags are collected up to (not including)
            # the end-of-sentence label. A row without that label is not recorded.
            for true_id, pred_id in zip(true_row[1:], pred_row[1:]):
                if true_id == end_label_id:
                    y_true.append(true_tags)
                    y_pred.append(pred_tags)
                    break
                true_tags.append(label_map[int(true_id)])
                # label_map is 1-based, but the classifier can still output
                # id 0; count such a prediction as "O" instead of raising KeyError.
                pred_tags.append(label_map.get(int(pred_id), "O"))
report = classification_report(y_true, y_pred, digits=4)
print(report)

eval_on = dev


KeyboardInterrupt: ignored

# Testing on random phrases

In [None]:
def test_Preprocessor(list_of_texts):
    result = []
    for text in list_of_texts:
        tokens = []
        valid_ids = []
        textlist = text.split(' ')
        for i, word in enumerate(textlist):
            token = tokenizer.tokenize(word)
            tokens.extend(token)
            for m in range(len(token)):
                if m == 0:
                    valid_ids.append(1)
                else:
                    valid_ids.append(0)
            if len(tokens) >= max_seq_length - 1:
                tokens = tokens[0:(max_seq_length - 2)]
                valid_ids = valid_ids[0:(max_seq_length - 2)]
        ntokens = []
        segment_ids = []
        label_ids = []
        ntokens.append("[CLS]")
        segment_ids.append(0)
        valid_ids.insert(0, 1)
        for i, token in enumerate(tokens):
            ntokens.append(token)
            segment_ids.append(0)
        ntokens.append("[SEP]")
        segment_ids.append(0)
        valid_ids.append(1)
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            valid_ids.append(1)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(valid_ids) == max_seq_length
        assert len(segment_ids) == max_seq_length

        result.append([tokens, input_ids, input_mask, segment_ids, valid_ids])
    return result

def anonymizer(predicted_labels, input_ids, valid_ids, input_mask, 
               anonymize_list = ['B-ORG', 'I-ORG', 'B-PER', 'I-PER']):
    """Replace the tokens of words tagged with a sensitive label by token id 0.

    predicted_labels : one label per word-start position (positions whose
        valid_ids entry is non-zero), in order.
    input_ids : token ids of one sequence; each element must expose `.numpy()`.
    valid_ids : 0 marks a continuation word-piece, anything else a word start.
    input_mask : 0 marks padding; processing stops at the first padded position.
    anonymize_list : labels whose words are hidden (the list is only read).

    Every piece of a hidden word is replaced, including its continuation
    pieces, so no fragment of the word is left in the output. All positions
    up to the first padded one are processed, including the very last
    position of a sequence without padding.

    Returns the result of `tokenizer.convert_ids_to_tokens` on the rewritten ids.
    """
    no_more_labels = object()
    labels = iter(predicted_labels)
    hide_current_word = False
    sentence = []
    for i in range(len(input_mask)):
        if input_mask[i] == 0:
            break
        is_continuation = valid_ids[i] == 0
        if not is_continuation:
            # A new word starts here: consume its label.
            label = next(labels, no_more_labels)
            if label is no_more_labels:
                break
            hide_current_word = label in anonymize_list
        # Continuation pieces inherit the decision taken for the word start.
        sentence.append(0 if hide_current_word else input_ids[i].numpy())
    return tokenizer.convert_ids_to_tokens(sentence)

In [None]:
test_phrases = ["In 1975, Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975, to develop and sell BASIC interpreters for the Altair 8800.",]
for tokens, input_ids, input_mask, segment_ids, valid_ids in test_Preprocessor(test_phrases):
    # Show the pre-processed inputs as plain Python lists.
    print(tokens)
    print(input_ids)
    print(input_mask)
    print(segment_ids)
    print(valid_ids)
    # Wrap each list in a batch of size 1.
    input_ids = tf.constant([input_ids], dtype="int64")
    input_mask = tf.constant([input_mask], dtype="int64")
    segment_ids = tf.constant([segment_ids], dtype="int64")
    valid_ids = tf.constant([valid_ids], dtype="int64")
    probs = ner(input_word_ids=input_ids, input_mask=input_mask, input_segment_ids=segment_ids, valid_ids=valid_ids, training=False)
    pred_ids = tf.argmax(probs, axis=2)
    # label_map is 1-based, but the classifier can output id 0 (for example on
    # padded positions); map it to "O" instead of raising KeyError.
    predicted_labels = [label_map.get(i, "O") for i in pred_ids.numpy().tolist()[0]]
    print(predicted_labels)
    print(anonymizer(predicted_labels, input_ids[0], valid_ids[0], input_mask[0]))

['In', '1975', ',', 'Microsoft', 'was', 'founded', 'by', 'Bill', 'Gates', 'and', 'Paul', 'Allen', 'on', 'April', '4', ',', '1975', ',', 'to', 'develop', 'and', 'sell', 'BA', '##SI', '##C', 'interpreter', '##s', 'for', 'the', 'Alt', '##air', '88', '##00', '.']
[101, 1130, 2429, 117, 6998, 1108, 1771, 1118, 2617, 12702, 1105, 1795, 4522, 1113, 1364, 125, 117, 2429, 117, 1106, 3689, 1105, 4582, 12465, 13882, 1658, 23102, 1116, 1111, 1103, 14983, 8341, 5385, 7629, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,