In [1]:
# Start from a clean checkout: delete any existing `bert` directory, then clone
# the google-research/bert repository into it. A later cell imports
# `FullTokenizer` from `bert.tokenization`.
!rm -rf bert
!git clone https://github.com/google-research/bert bert

Cloning into 'bert'...
remote: Enumerating objects: 336, done.[K
remote: Total 336 (delta 0), reused 0 (delta 0), pack-reused 336[K
Receiving objects: 100% (336/336), 297.11 KiB | 95.00 KiB/s, done.
Resolving deltas: 100% (183/183), done.


In [1]:
from keras.utils import to_categorical
import copy 

class PaddingInputExample(object):
    """Fake example so the number of input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it
    means the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """

class InputExample(object):
    """Container for one sequence-labelling example (text plus optional labels)."""

    def __init__(self, guid, text_a, text_b=None, labels=None):
        """Store the fields of one example unchanged.

        Args:
          guid: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For
            single-sequence tasks this is the only text that must be given.
          text_b: (Optional) string. The untokenized text of the second
            sequence; only needed for sequence-pair tasks.
          labels: (Optional) the labels of the example. Should be given for
            train and dev examples, but not for test examples.
        """
        self.guid, self.text_a, self.text_b = guid, text_a, text_b
        self.labels = labels

def create_tokenizer_from_hub_module():
    """Build a `FullTokenizer` from the tokenization info of the BERT hub module.

    Loads the module at the notebook-level `bert_path`, evaluates its
    "tokenization_info" signature in the notebook-level session `sess`, and
    feeds the resulting vocab file and lower-casing flag to `FullTokenizer`.
    """
    # TF1-style hub API; `hub.load(bert_path)` with
    # `.signatures["tokenization_info"]` was noted as the alternative.
    module = hub.Module(bert_path)
    info = module(signature="tokenization_info", as_dict=True)

    # Both values are graph tensors, so they are evaluated in one run call.
    fetches = [info["vocab_file"], info["do_lower_case"]]
    vocab_path, lower_case = sess.run(fetches)

    return FullTokenizer(vocab_file=vocab_path, do_lower_case=lower_case)

def _wordpiece_tokens_with_labels(tokenizer, text, word_labels, pad_label):
    """Tokenize `text` word by word so every word-piece inherits its word's label."""
    if word_labels is None:
        # Unlabelled (e.g. test-time) example: every token gets the padding label.
        tokens = tokenizer.tokenize(text)
        return tokens, [pad_label] * len(tokens)

    # `text_a` is built with " ".join(words), so splitting on a single space
    # recovers the original words; an empty text means an empty sentence.
    words = text.split(" ") if text else []
    if len(words) != len(word_labels):
        raise ValueError(
            "Cannot align labels with text: {} space-separated words but {} labels".format(
                len(words), len(word_labels)
            )
        )

    tokens, labels = [], []
    for word, label in zip(words, word_labels):
        # A word may yield several pieces ("##" continuations or pieces split
        # off at punctuation); all of them share the word's label.
        pieces = tokenizer.tokenize(word)
        tokens.extend(pieces)
        labels.extend([label] * len(pieces))
    return tokens, labels


def convert_single_example(tokenizer, example, max_seq_length=256):
    """Convert one example into fixed-length BERT inputs plus per-token labels.

    Args:
      tokenizer: object providing `tokenize(text)` and
        `convert_tokens_to_ids(tokens)`.
      example: an `InputExample` (reads `text_a` and `labels`) or a
        `PaddingInputExample`.
      max_seq_length: length every returned list is padded or truncated to,
        including the [CLS] and [SEP] positions.

    Returns:
      A tuple `(input_ids, input_mask, segment_ids, labels)` of four lists,
      each of length `max_seq_length`. The [CLS], [SEP] and padding positions
      carry the `tag2idx["-PAD-"]` label.

    Raises:
      ValueError: if the number of space-separated words in `example.text_a`
        differs from the number of labels.
    """
    pad_label = tag2idx["-PAD-"]

    if isinstance(example, PaddingInputExample):
        # Padding examples use the same padding label index as real examples.
        return (
            [0] * max_seq_length,
            [0] * max_seq_length,
            [0] * max_seq_length,
            [pad_label] * max_seq_length,
        )

    tokens_a, token_labels = _wordpiece_tokens_with_labels(
        tokenizer, example.text_a, example.labels, pad_label
    )

    # Leave room for the [CLS] and [SEP] markers.
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]
        token_labels = token_labels[0 : (max_seq_length - 2)]

    tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
    labels = [pad_label] + list(token_labels) + [pad_label]
    segment_ids = [0] * len(tokens)

    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length (a non-positive count adds nothing).
    padding = max_seq_length - len(input_ids)
    input_ids += [0] * padding
    input_mask += [0] * padding
    segment_ids += [0] * padding
    labels += [pad_label] * (max_seq_length - len(labels))

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(labels) == max_seq_length

    return input_ids, input_mask, segment_ids, labels

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a collection of examples into stacked model input arrays.

    Each example is passed through `convert_single_example`; the per-example
    lists are then stacked with numpy.

    Args:
      tokenizer: tokenizer forwarded to `convert_single_example`.
      examples: iterable of examples to convert.
      max_seq_length: sequence length forwarded to `convert_single_example`.

    Returns:
      A tuple `(input_ids, input_masks, segment_ids, labels)` of numpy arrays.
      The labels are one-hot encoded per position with `to_categorical`, using
      the notebook-level `n_tags` as the number of classes.
    """
    input_ids, input_masks, segment_ids, labels_arr = [], [], [], []
    for example in tqdm_notebook(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, labels = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels_arr.append(labels)

    # One-hot encode each example separately, then stack the results.
    one_hot_labels = [to_categorical(i, num_classes=n_tags) for i in labels_arr]
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(one_hot_labels),
    )

def convert_text_to_examples(texts, labels_arr):
    """Pair each word list with its labels and wrap the pair in an `InputExample`.

    The words of every text are joined with single spaces to form `text_a`;
    `guid` and `text_b` are left as None. Pairing stops at the shorter of the
    two inputs, as `zip` does.
    """
    return [
        InputExample(guid=None, text_a=" ".join(words), text_b=None, labels=word_labels)
        for words, word_labels in zip(texts, labels_arr)
    ]


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import deeppavlov
from deeppavlov.core.data.utils import download_decompress
# Download the CoNLL-2003 archive and extract it into data/ (the dataset
# reader in the next cell reads from that directory).
download_decompress('http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz', 'data/')

2020-03-02 15:59:17.228 INFO in 'deeppavlov.core.data.utils'['utils'] at line 80: Downloading from http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz to data/conll2003_v2.tar.gz
100%|██████████| 957k/957k [00:03<00:00, 274kB/s] 
2020-03-02 15:59:20.724 INFO in 'deeppavlov.core.data.utils'['utils'] at line 237: Extracting data/conll2003_v2.tar.gz archive into data


In [2]:
from deeppavlov.dataset_readers.conll2003_reader import Conll2003DatasetReader
# Read the files extracted into data/. The result is indexed by split name
# below (`dataset['train']`), each sample being indexed as (words, tags).
dataset = Conll2003DatasetReader().read('data/')

In [3]:
# Separate each training sample into its word list (index 0) and tag list
# (index 1), keeping the two result lists parallel.
train_words = []
train_tags = []
for sample in dataset['train']:
    sentence_words, sentence_tags = sample[0], sample[1]
    train_words.append(sentence_words)
    train_tags.append(sentence_tags)

In [4]:
# Distinct tag strings seen in the training data. They are sorted so the
# tag -> index mapping is identical on every kernel run; iterating a set of
# strings directly gives an order that can change between Python processes.
tags = sorted({tag for sentence_tags in train_tags for tag in sentence_tags})

# Real tags are numbered from 1; index 0 is reserved for the padding label.
tag2idx = {tag: idx for idx, tag in enumerate(tags, start=1)}
tag2idx["-PAD-"] = 0 # for the mask zero
n_tags = len(tag2idx)

In [5]:
import tensorflow as tf
# import tensorflow.compat.v1 as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from keras import backend as K

# Initialize session
# `sess` is the session create_tokenizer_from_hub_module() uses to evaluate
# the hub module's tokenization info.
sess = tf.Session()
# sess = tf.compat.v1.Session()

# Params for bert model and tokenization
# `bert_path` is the TF-Hub handle read by create_tokenizer_from_hub_module()
# and BertLayer.build(); `max_seq_length` is the padded length of every
# example, including the [CLS] and [SEP] positions.
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
max_seq_length = 256

In [6]:
# Replace every tag string with its integer index, sentence by sentence.
train_tag_ids = [[tag2idx[tag] for tag in sample] for sample in train_tags]

In [7]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_words, train_tag_ids)

# Convert to features
# NOTE(review): this rebinds `train_tag_ids`. Before this cell it holds
# per-sentence lists of tag indices; afterwards it holds the one-hot label
# array returned by convert_examples_to_features. Re-running this cell without
# first re-running the cell that builds the index lists would feed the one-hot
# array back in as labels.
(train_input_ids, train_input_masks, train_segment_ids, train_tag_ids 
) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore






Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Converting examples to features', max=14041.0, style=Prog…




In [8]:
from keras.layers import Layer

class BertLayer(Layer):
    """Keras layer wrapping a TF-Hub BERT module and returning per-token outputs.

    The layer takes three inputs (`input_ids`, `input_mask`, `segment_ids`)
    and returns the module's "sequence_output" with shape
    `(batch, sequence length, output_size)`.

    Args:
      n_fine_tune_layers: how many of the module's trailing variables
        (excluding the "/cls/" and "/pooler/" ones) are registered as
        trainable. Note that this counts variables, not transformer layers.
        0 registers none.
      mask_zero: when True, `compute_mask` masks positions whose input id is 0.
      trainable: forwarded to the Keras base layer and to `hub.Module`.
      **kwargs: remaining keyword arguments for the Keras base layer.
    """

    def __init__(self, n_fine_tune_layers=10, mask_zero=False, trainable=True, **kwargs):
        # Pass `trainable` to the base constructor: it sets `self.trainable`
        # from its own keyword arguments, so assigning the attribute before
        # calling it (as was done previously) gets overwritten.
        super(BertLayer, self).__init__(trainable=trainable, **kwargs)
        self.n_fine_tune_layers = n_fine_tune_layers
        self.output_size = 768
        self.mask_zero = mask_zero

    def build(self, input_shape):
        """Instantiate the hub module and register its variables with Keras."""
        self.bert = hub.Module(
            bert_path,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )

        # The "/cls/" and "/pooler/" variables are not used by the
        # "sequence_output" this layer returns, so they are never fine-tuned.
        candidates = [var for var in self.bert.variables
                      if "/cls/" not in var.name and "/pooler/" not in var.name]

        # `candidates[-0:]` would be the whole list, so zero is handled
        # explicitly to mean "fine-tune nothing".
        if self.n_fine_tune_layers:
            fine_tuned = candidates[-self.n_fine_tune_layers:]
        else:
            fine_tuned = []

        # Register on the underlying lists: the public `trainable_weights` /
        # `non_trainable_weights` properties may return temporary lists
        # (e.g. when the layer is not trainable), so appending to them can be
        # silently lost.
        self._trainable_weights.extend(fine_tuned)
        fine_tuned_ids = {id(var) for var in fine_tuned}
        self._non_trainable_weights.extend(
            var for var in self.bert.variables if id(var) not in fine_tuned_ids
        )

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on `[input_ids, input_mask, segment_ids]` and return token outputs."""
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(input_ids=input_ids, input_mask=input_mask,
                           segment_ids=segment_ids)
        result = self.bert(inputs=bert_inputs, signature="tokens",
                           as_dict=True)["sequence_output"]
        # Restore the static sequence length on the output tensor.
        result = K.reshape(result, (-1, input_ids.shape[1], self.output_size))
        return result

    def compute_output_shape(self, input_shape):
        """Return `(None, sequence length, output_size)`."""
        return (None, input_shape[0][1], self.output_size)

    def compute_mask(self, inputs, mask=None):
        """Mask positions whose input id is 0 when `mask_zero` is set, else no mask."""
        input_ids, input_mask, segment_ids = inputs
        if not self.mask_zero:
            return None
        return K.not_equal(input_ids, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(BertLayer, self).get_config()
        config.update({
            "n_fine_tune_layers": self.n_fine_tune_layers,
            "mask_zero": self.mask_zero,
        })
        return config

In [9]:
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy, crf_viterbi_accuracy
from keras.models import Model, Input
from keras.layers import Embedding, Dense, Bidirectional, Dropout, LSTM, TimeDistributed

# Build model
def build_model(max_seq_length):
    """Assemble and compile the BERT -> BiLSTM -> dropout -> dense -> CRF tagger.

    Args:
      max_seq_length: length of each of the three integer input sequences.

    Returns:
      The compiled Keras `Model`; its summary is printed as a side effect.
    """
    # The three BERT inputs, in the order BertLayer unpacks them.
    input_names = ("input_ids", "input_masks", "segment_ids")
    bert_inputs = [Input(shape=(max_seq_length,), name=input_name)
                   for input_name in input_names]
    embeddings = BertLayer(n_fine_tune_layers=10, mask_zero=True, trainable=True)(bert_inputs)

    # Sequence encoder on top of the BERT token outputs.
    encoded = Bidirectional(LSTM(units=128, return_sequences=True))(embeddings)
    regularised = Dropout(0.4)(encoded)
    projected = TimeDistributed(Dense(128, activation="relu"))(regularised)

    # The CRF layer supplies both the loss and the accuracy metric.
    crf_layer = CRF(n_tags)
    tag_scores = crf_layer(projected)

    model = Model(inputs=bert_inputs, outputs=tag_scores)
    model.compile(loss=crf_layer.loss_function, optimizer='adam', metrics=[crf_layer.accuracy])
    model.summary()

    return model

  
def initialize_vars(sess):
    """Run the local, global and table initializers in the Keras backend session.

    Note: the `sess` argument is accepted but not used; every initializer is
    run in the session returned by `K.get_session()`.
    """
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    for make_initializer in initializer_factories:
        K.get_session().run(make_initializer())

In [10]:
# Build and compile the tagger (this also prints the model summary).
model = build_model(max_seq_length)

# Instantiate variables
# NOTE(review): initialize_vars() runs the initializers in the Keras backend
# session and does not use the `sess` passed here.
initialize_vars(sess)













INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where








Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          (None, 256)          0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        (None, 256)          0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        (None, 256)          0                                            
__________________________________________________________________________________________________
bert_layer_1 (BertLayer)        (None, 256, 768)     110104890   input_ids[0][0]                  
                                                                 input_masks[0][0]                
          




In [None]:
# Train on the three BERT input arrays against the one-hot tag labels
# (`train_tag_ids` as rebound by the feature-conversion cell), holding out
# 20% of the samples for validation. The returned object is kept in `history`.
history = model.fit(
    [train_input_ids, train_input_masks, train_segment_ids], 
    train_tag_ids,
    validation_split=0.2,
    epochs=20,
    batch_size=128,
)

Train on 11232 samples, validate on 2809 samples
Epoch 1/20
 1152/11232 [==>...........................] - ETA: 1:21:53 - loss: 48.1310 - crf_viterbi_accuracy: 0.6890