In [1]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab VM, then switch the working directory to
# the project's `main` folder so that the relative '../data' and '../vocab'
# paths used further down resolve correctly.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/multi_turn_rewrite/chinese_tagging/main')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Select the TensorFlow 2.x runtime (notebook magic; presumably Colab-specific).
%tensorflow_version 2.x
# NOTE(review): both installs are unpinned, so a fresh run may pull library
# versions different from the ones that produced the outputs recorded in this
# notebook -- consider pinning once the working versions are confirmed.
!pip install transformers
!pip install tensorflow_addons



In [3]:
from pathlib import Path
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, TFBertModel

import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
import pprint
import logging
import time

# Report the runtime environment up front so logs can be tied to a TF version.
print("TensorFlow Version", tf.__version__)
# `tf.test.is_gpu_available()` is deprecated; TensorFlow itself recommends
# `tf.config.list_physical_devices('GPU')`. A non-empty list means a GPU is visible.
print('GPU Enabled:', bool(tf.config.list_physical_devices('GPU')))

TensorFlow Version 2.2.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True


In [4]:
def get_vocab(f_path):
  """Build a token -> index mapping from a vocabulary file.

  The file is expected to hold one token per line; the zero-based line number
  becomes the token's index. Only the trailing newline is stripped, so any
  other whitespace is kept as part of the token. If a token appears more than
  once, the index of its last occurrence wins.

  Args:
    f_path: path to the vocabulary file (read as UTF-8).

  Returns:
    dict mapping each token string to its integer index.
  """
  # Explicit UTF-8: the vocabulary holds Chinese characters, so relying on the
  # platform's default encoding would make the result locale-dependent.
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip('\n'): i for i, line in enumerate(f)}

In [5]:
# Hyper-parameters and file locations shared by the data pipeline, the model
# and the training loop. All paths are relative to the working directory.
params = dict(
  pretrain_path='bert-base-chinese',   # pretrained checkpoint for tokenizer and encoder
  train_paths=['../data/train_pos_tag.txt', '../data/train_neg_tag.txt'],
  test_paths_1=['../data/test_pos_tag.txt'],                               # positive examples only
  test_paths_2=['../data/test_pos_tag.txt', '../data/test_neg_tag.txt'],   # positive + negative
  vocab_path='../vocab/char.txt',
  batch_size=16,
  clip_norm=5.0,                       # global-norm gradient clipping threshold
  buffer_size=18986 * 3,               # shuffle buffer; also drives the LR cycle length
  init_lr=1e-5,                        # lower bound of the cyclical learning rate
  max_lr=4e-5,                         # upper bound of the cyclical learning rate
  n_epochs=16,
)

# NOTE(review): BertTokenizer's documented casing option is `do_lower_case`, and
# `add_special_tokens` is normally an encode-time argument; these two keyword
# arguments may be ignored by `from_pretrained` -- confirm before relying on them.
tokenizer = BertTokenizer.from_pretrained(
  params['pretrain_path'],
  lowercase=True,
  add_special_tokens=True,
)

In [6]:
def data_generator(f_paths, params):
  """Yield model-ready examples from tab-separated tagging files.

  Each non-empty line must contain six tab-separated fields:
  `h1`, `h2`, `q`, `intent`, `t1`, `t2`, where `t1`/`t2` are strings of digits
  (one digit per character) that are converted to per-character integer tags.
  The three texts are split into characters and joined as
  `[CLS] h1 [SEP] h2 [SEP] q [SEP]`.

  Args:
    f_paths: iterable of file paths, read in order as UTF-8.
    params: unused here; kept so the call signature stays unchanged.

  Yields:
    `((token_ids, segment_ids), label)` where
      * `token_ids` are the vocabulary ids of the joined token sequence,
      * `segment_ids` are 0 for `[CLS] h1 [SEP] h2 [SEP]` and 1 for `q [SEP]`,
      * `label` holds the integer intent at position 0 (the `[CLS]` slot),
        followed by the `t1` tags, a 0, the `t2` tags, and zeros for the
        remaining positions.

  Raises:
    ValueError: if a non-empty line does not have exactly six fields or a tag
      character is not a digit.
  """
  for f_path in f_paths:
    # Explicit UTF-8 so Chinese text is decoded the same way on every platform.
    with open(f_path, encoding='utf-8') as f:
      print('Reading', f_path)
      for line in f:
        line = line.rstrip()
        if not line:
          # Skip blank lines (e.g. a trailing newline at end of file) instead
          # of failing on the six-way unpack below.
          continue
        h1, h2, q, intent, t1, t2 = line.split('\t')
        h1, h2, q = list(h1), list(h2), list(q)
        text = ['[CLS]'] + h1 + ['[SEP]'] + h2 + ['[SEP]'] + q + ['[SEP]']
        seg = [0] + [0] * len(h1) + [0] + [0] * len(h2) + [0] + [1] * len(q) + [1]
        t1 = [int(t) for t in t1]
        t2 = [int(t) for t in t2]
        # Position 0 ([CLS]) carries the intent class; separators and the
        # query segment are always tagged 0.
        label = [int(intent)] + t1 + [0] + t2 + [0] + [0] * len(q) + [0]
        text = tokenizer.convert_tokens_to_ids(text)
        yield (text, seg), label


def dataset(paths, is_training, params):
  """Wrap `data_generator` in a padded, batched, prefetched `tf.data.Dataset`.

  Args:
    paths: file paths forwarded to `data_generator`.
    is_training: when true, examples are shuffled with a buffer of
      `params['buffer_size']` before batching; otherwise file order is kept.
    params: configuration dict; reads `buffer_size` and `batch_size`.

  Returns:
    Dataset of `((token_ids, segment_ids), labels)` int32 batches, each
    variable-length sequence padded with 0 to the longest one in its batch.
  """
  element_shapes = (([None], [None]), [None])
  element_types = ((tf.int32, tf.int32), tf.int32)
  pad_values = ((0, 0), 0)

  ds = tf.data.Dataset.from_generator(
    lambda: data_generator(paths, params),
    output_shapes=element_shapes,
    output_types=element_types)

  # Shuffling is the only difference between the training and evaluation pipelines.
  if is_training:
    ds = ds.shuffle(params['buffer_size'])

  ds = ds.padded_batch(params['batch_size'], element_shapes, pad_values)
  return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [7]:
# Sanity check: pull the first training example and show its token ids,
# segment ids and label sequence, one per line.
(text, seg), label = next(
  data_generator(f_paths=params['train_paths'], params=params))
print(text, seg, label, sep='\n')

Reading ../data/train_pos_tag.txt
[101, 5543, 5314, 2769, 5041, 1399, 1408, 102, 1139, 683, 6782, 1086, 6379, 102, 2769, 4385, 1762, 2218, 6206, 102]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [8]:
class BertFinetune(tf.keras.Model):
  """BERT encoder with two heads: an intent classifier and a per-token tagger.

  Both heads share the same shape: dropout -> Dense(300, swish) -> dropout ->
  output Dense. The intent head emits 3 logits per example; the tagging head
  emits one logit per token position after the first.
  """

  def __init__(self, params):
    """Create the encoder and both heads.

    Args:
      params: configuration dict; `params['pretrain_path']` names the
        pretrained BERT checkpoint to load.
    """
    super().__init__()
    self.bert = TFBertModel.from_pretrained(
      params['pretrain_path'], trainable=True)

    # Layers are created in a fixed order (labels before intent at each stage).
    self.drop_1_labels = tf.keras.layers.Dropout(rate=0.1)
    self.drop_1_intent = tf.keras.layers.Dropout(rate=0.1)

    self.fc_labels = tf.keras.layers.Dense(
      units=300, activation=tf.nn.swish, name='down_stream/fc_labels')
    self.fc_intent = tf.keras.layers.Dense(
      units=300, activation=tf.nn.swish, name='down_stream/fc_intent')

    self.drop_2_labels = tf.keras.layers.Dropout(rate=0.1)
    self.drop_2_intent = tf.keras.layers.Dropout(rate=0.1)

    self.out_labels = tf.keras.layers.Dense(units=1, name='down_stream/out_labels')
    self.out_intent = tf.keras.layers.Dense(units=3, name='down_stream/out_intent')

  def call(self, bert_inputs, training):
    """Run the encoder and both heads.

    Args:
      bert_inputs: sequence of tensors passed to the BERT model after being
        cast to int32; callers in this notebook pass
        `[token_ids, attention_mask, segment_ids]`.
      training: forwarded to BERT and to every dropout layer.

    Returns:
      `(intent_logits, label_logits)`: intent logits with a last dimension of
      3, and tagging logits with the trailing size-1 dimension squeezed away,
      covering every position except the first.
    """
    int_inputs = [tf.cast(tensor, tf.int32) for tensor in bert_inputs]
    encoder_out = self.bert(int_inputs, training=training)

    # Output 1 feeds the intent head; output 0, minus its first position,
    # feeds the tagging head.
    intent_feat = encoder_out[1]
    token_feat = encoder_out[0][:, 1:, :]

    intent_feat = self.drop_1_intent(intent_feat, training=training)
    token_feat = self.drop_1_labels(token_feat, training=training)

    intent_hidden = self.fc_intent(intent_feat)
    token_hidden = self.fc_labels(token_feat)

    intent_hidden = self.drop_2_intent(intent_hidden, training=training)
    token_hidden = self.drop_2_labels(token_hidden, training=training)

    intent_logits = self.out_intent(intent_hidden)
    token_logits = self.out_labels(token_hidden)

    return intent_logits, tf.squeeze(token_logits, -1)

In [9]:
def unit_test(model):
  """Run the model on one hard-coded dialogue and print its predictions.

  Prints the tokenised query, the predicted intent class for `[CLS]`, and the
  tokens whose tagging score is at least 0.5.

  Args:
    model: callable taking `[token_ids, attention_mask, segment_ids]` and
      `training=False`, returning `(intent_logits, label_logits)` where
      `label_logits` covers every position except the first.
  """
  h1 = '成都房价是多少'
  h2 = '不买就后悔了成都房价还有上涨空间'
  q = '买不起'
  text_ = ['[CLS]'] + list(h1) + ['[SEP]'] + list(h2) + ['[SEP]'] + list(q) + ['[SEP]']
  text = tf.convert_to_tensor([tokenizer.convert_tokens_to_ids(text_)])
  seg = tf.convert_to_tensor([[0] + [0] * len(h1) + [0] + [0] * len(h2) + [0] + [1] * len(q) + [1]])

  logits_intent, logits_labels = model([text, tf.sign(text), seg], training=False)
  # Zero out the query segment (seg == 1), as the evaluation loop does, so only
  # history tokens can be retrieved.
  scores_labels = tf.sigmoid(logits_labels) * tf.cast(1 - seg[:, 1:], tf.float32)
  scores_labels = tf.cast(tf.math.greater_equal(scores_labels, .5), tf.int32)
  scores_labels = scores_labels[0].numpy()
  scores_intent = tf.argmax(logits_intent, -1)[0].numpy()
  print('-'*12)
  print('unit test')
  print('Query:')
  print(' '.join(text_))
  print('[CLS]:', scores_intent)
  print('Retrieved:')
  # The model drops position 0 ([CLS]) from its tagging output, so score i
  # belongs to token i + 1; pair the scores with text_[1:] accordingly.
  str_out = [tok for tok, val in zip(text_[1:], scores_labels) if val == 1]
  print(' '.join(str_out))
  print('-'*12)

In [None]:
# Build the model with three [batch, length] inputs (token ids, attention mask,
# segment ids) and list every trainable variable with its shape.
model = BertFinetune(params)
model.build([[None, None] for _ in range(3)])
pprint.pprint([(var.name, var.shape) for var in model.trainable_variables])

# Cyclical learning rate between init_lr and max_lr. The schedule is evaluated
# manually at every step and written into the optimizer's `lr` variable.
step_size = (2 * params['buffer_size']) // params['batch_size']
decay_lr = tfa.optimizers.Triangular2CyclicalLearningRate(
  initial_learning_rate=params['init_lr'],
  maximal_learning_rate=params['max_lr'],
  step_size=step_size)
optim = tf.optimizers.Adam(params['init_lr'])
global_step = 0

# Best exact-match score seen so far on the positive test set.
best_em = .0

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

# Baseline predictions before any fine-tuning.
unit_test(model)

# One iteration per epoch: train over all training files, then evaluate tagging
# and intent quality on the positive test set and intent accuracy on the
# positive + negative test set.
for _ in range(params['n_epochs']):
  # TRAINING
  for ((text, seg), labels) in dataset(params['train_paths'], is_training=True, params=params):
    with tf.GradientTape() as tape:
      # Token id 0 is the padding value, so sign(text) is the attention mask.
      masks = tf.sign(text)
      logits_intent, logits_labels = model([text, masks, seg], training=True)

      # Column 0 of `labels` is the intent class; the rest are per-token tags.
      intents = labels[:, 0]
      labels = tf.cast(labels[:, 1:], tf.float32)
      # Tagging loss only counts real (non-pad) tokens outside the query
      # segment: masks - seg is 1 exactly there.
      masks = tf.cast((masks - seg)[:, 1:], tf.float32)

      loss_intent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=intents, logits=logits_intent)
      loss_intent = tf.reduce_mean(loss_intent)
      loss_labels = tf.nn.weighted_cross_entropy_with_logits(labels=labels, logits=logits_labels, pos_weight=2.)
      loss_labels = tf.reduce_sum(loss_labels * masks) / tf.reduce_sum(masks)
      loss = loss_intent + loss_labels
      
    # Apply the cyclical schedule by hand before the update.
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
    optim.apply_gradients(zip(grads, model.trainable_variables))
    
    if global_step % 100 == 0:
      logger.info("Step {} | Loss: {:.4f} | Loss_intent: {:.4f} | Loss_labels: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
          global_step, loss.numpy().item(), loss_intent.numpy().item(), loss_labels.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      t0 = time.time()
    global_step += 1
  
  # EVALUATION
  unit_test(model)

  logger.info("Evaluation on Positive Testing Examples")
  m = tf.keras.metrics.Accuracy()
  l = []   # flattened gold tags
  p = []   # flattened predicted tags
  em = []  # per-example exact-match flags

  for ((text, seg), labels) in dataset(params['test_paths_1'], is_training=False, params=params):
    logits_intent, logits_labels = model([text, tf.sign(text), seg], training=False)
    # Use the same mask as the training loss. `1 - seg` alone would leave
    # padded positions unmasked (padding has seg == 0), letting untrained
    # pad-position scores count as false positives in precision and EM.
    valid = tf.cast((tf.sign(text) - seg)[:, 1:], tf.float32)
    scores_labels = tf.sigmoid(logits_labels) * valid
    scores_labels = tf.cast(tf.math.greater_equal(scores_labels, .5), tf.int32).numpy()
    labels = labels.numpy()
    intents = labels[:, 0]
    labels = labels[:, 1:]
    l += labels.flatten().tolist()
    p += scores_labels.flatten().tolist()
    em += [np.array_equal(score, label) for score, label in zip(scores_labels, labels)]
    scores_intent = tf.argmax(logits_intent, -1, output_type=tf.dtypes.int32)
    m.update_state(y_true=intents, y_pred=scores_intent)
  
  assert len(l) == len(p)
  # Weighting accuracy by the gold tags gives recall; weighting by the
  # predictions gives precision. Guard against all-zero weights, which
  # accuracy_score cannot normalise (e.g. nothing predicted positive yet).
  recall = accuracy_score(l, p, sample_weight=l) if any(l) else 0.
  precision = accuracy_score(l, p, sample_weight=p) if any(p) else 0.
  em = np.asarray(em).mean()

  logger.info("Labels:: Recall: {:.3f} | Precision: {:.3f} | EM: {:.3f}".format(recall, precision, em))
  logger.info("Intents: Accuracy: {:.3f}".format(m.result().numpy()))

  if em > best_em:
    best_em = em
  logger.info("Best EM: {:.3f}".format(best_em))

  # Intent accuracy only, on the combined positive + negative test set.
  m = tf.keras.metrics.Accuracy()
  logger.info("Evaluation on Positive + Negative Testing Examples")
  for ((text, seg), labels) in dataset(params['test_paths_2'], is_training=False, params=params):
    logits_intent, logits_labels = model([text, tf.sign(text), seg], training=False)
    scores_intent = tf.argmax(logits_intent, -1, output_type=tf.dtypes.int32)
    labels = labels.numpy()
    intents = labels[:, 0]
    m.update_state(y_true=intents, y_pred=scores_intent)
  logger.info("Intent:: Accuracy: {:.3f}".format(m.result().numpy()))

Some weights of the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


[('tf_bert_model/bert/embeddings/word_embeddings/weight:0',
  TensorShape([21128, 768])),
 ('tf_bert_model/bert/embeddings/position_embeddings/embeddings:0',
  TensorShape([512, 768])),
 ('tf_bert_model/bert/embeddings/token_type_embeddings/embeddings:0',
  TensorShape([2, 768])),
 ('tf_bert_model/bert/embeddings/LayerNorm/gamma:0', TensorShape([768])),
 ('tf_bert_model/bert/embeddings/LayerNorm/beta:0', TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/value/bias:0',
  TensorShape([768]