In [1]:
# Colab-only setup: mount Google Drive and switch into the project's `main`
# directory so the relative '../data' and '../vocab' paths in `params` resolve.
import os
from google.colab import drive

drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/spoken_language_understanding/atis/main')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Colab line magic; presumably selects the TensorFlow 2.x runtime. It runs
# before TensorFlow is imported in the next cell.
%tensorflow_version 2.x
# tensorflow-addons provides the cyclical learning-rate schedule imported below.
!pip install tensorflow-addons



In [3]:
from tensorflow_addons.optimizers.cyclical_learning_rate import Triangular2CyclicalLearningRate
from sklearn.metrics import classification_report, f1_score, accuracy_score

import tensorflow as tf
import pprint
import logging
import time
import numpy as np

# Report the runtime this notebook was executed with.
print("TensorFlow Version", tf.__version__)
# tf.test.is_gpu_available() is deprecated; TensorFlow's own warning recommends
# tf.config.list_physical_devices('GPU'). A non-empty list means a GPU is visible.
print('GPU Enabled:', bool(tf.config.list_physical_devices('GPU')))

TensorFlow Version 2.3.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True


In [4]:
def get_vocab(vocab_path):
  """Build a token -> index lookup from a one-token-per-line text file.

  Args:
    vocab_path: path of the vocabulary file. Each line, with trailing
      whitespace removed, is one token.

  Returns:
    dict mapping each token to its zero-based line number. When a token
    appears on several lines, the index of its last occurrence is kept.
  """
  with open(vocab_path) as vocab_file:
    return {token.rstrip(): index for index, token in enumerate(vocab_file)}

In [5]:
def data_generator(f_path, params):
  """Yield one index-encoded example per non-blank line of `f_path`.

  Every data line holds two tab-separated fields:
    * the utterance tokens, whose first and last token are discarded
      (presumably BOS/EOS markers -- confirm against the data files);
    * the slot tags followed by the intent label. The first tag is discarded,
      the last token is the intent, and the tags in between align one-to-one
      with the kept utterance tokens.

  Blank lines (for example a trailing newline at the end of the file) are
  skipped instead of failing on the tab split.

  Args:
    f_path: path of the data file.
    params: dict providing the 'word2idx', 'intent2idx' and 'slot2idx'
      lookups. An entry missing from a lookup is encoded as len(lookup).

  Yields:
    (word_ids, (intent_id, slot_ids)) where word_ids and slot_ids are
    equally long lists of ints and intent_id is an int.

  Raises:
    AssertionError: if a line has a different number of words and slot tags.
    ValueError: if a non-blank line does not contain exactly one tab.
  """
  print('Reading', f_path)

  # Resolve the lookups and their out-of-vocabulary ids once, not per token.
  word2idx = params['word2idx']
  intent2idx = params['intent2idx']
  slot2idx = params['slot2idx']
  oov_word, oov_intent, oov_slot = len(word2idx), len(intent2idx), len(slot2idx)

  with open(f_path) as f:
    for line in f:
      line = line.rstrip()
      if not line:
        continue  # tolerate empty / trailing blank lines
      text, slot_intent = line.split('\t')
      words = text.split()[1:-1]
      slot_intent = slot_intent.split()
      slots, intent = slot_intent[1:-1], slot_intent[-1]
      assert len(words) == len(slots), (
          'word/slot count mismatch in {}: {!r}'.format(f_path, line))

      word_ids = [word2idx.get(w, oov_word) for w in words]
      intent_id = intent2idx.get(intent, oov_intent)
      slot_ids = [slot2idx.get(s, oov_slot) for s in slots]

      yield (word_ids, (intent_id, slot_ids))

In [6]:
def dataset(is_training, params):
  """Build the batched tf.data pipeline for the train or the test split.

  Elements come from `data_generator` as (words, (intent, slots)): `words`
  and `slots` are variable-length int32 vectors and `intent` is an int32
  scalar. Batches are padded to the longest sequence they contain, using 0
  for both words and slots.

  Args:
    is_training: truthy -> read params['train_path'] and shuffle with a
      buffer of params['num_samples'] examples; falsy -> read
      params['test_path'] in file order.
    params: configuration dict; also needs 'batch_size' plus the lookups
      used by `data_generator`.

  Returns:
    A prefetched tf.data.Dataset of padded batches.
  """
  element_shapes = ([None], ((), [None]))
  element_types = (tf.int32, (tf.int32, tf.int32))
  # A pad value must be given for every component, including the scalar intent.
  pad_values = (0, (-1, 0))

  path_key = 'train_path' if is_training else 'test_path'
  ds = tf.data.Dataset.from_generator(
    lambda: data_generator(params[path_key], params),
    output_shapes = element_shapes,
    output_types = element_types,)
  if is_training:
    ds = ds.shuffle(params['num_samples'])
  ds = ds.padded_batch(params['batch_size'], element_shapes, pad_values)
  return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [7]:
def get_timing_signal_1d(length,
                         channels,
                         min_timescale=1.0,
                         max_timescale=1.0e4,
                         start_index=0):
  """Return a sinusoidal position signal shaped [1, length, channels].

  Half of the channels hold sines and the other half cosines of the position
  multiplied by a geometric series of inverse timescales that starts at
  `min_timescale` and decays by the log ratio of `max_timescale` to
  `min_timescale`. If `channels` is odd, one zero channel is appended.

  Args:
    length: number of positions.
    channels: size of the feature axis of the returned signal.
    min_timescale: first timescale of the geometric series.
    max_timescale: last timescale of the geometric series.
    start_index: offset added to every position.

  Returns:
    float32 tensor of shape [1, length, channels].
  """
  half_channels = channels // 2
  positions = tf.cast(tf.range(length) + start_index, tf.float32)

  # Log-spaced step between neighbouring timescales; the denominator is
  # clamped to 1 so a single timescale does not divide by zero.
  log_step = (
      tf.math.log(float(max_timescale) / float(min_timescale)) /
      tf.maximum(tf.cast(half_channels, tf.float32) - 1, 1))
  inverse_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(half_channels), tf.float32) * -log_step)

  # Outer product: [length, 1] * [1, half_channels].
  angles = positions[:, tf.newaxis] * inverse_timescales[tf.newaxis, :]
  sin_cos = tf.concat([tf.sin(angles), tf.cos(angles)], axis=1)
  # Pad one zero channel when `channels` is odd.
  sin_cos = tf.pad(sin_cos, [[0, 0], [0, tf.compat.v1.mod(channels, 2)]])
  return tf.reshape(sin_cos, [1, length, channels])

In [8]:
class LayerNorm(tf.keras.layers.Layer):
  """Layer normalisation over the last axis with a learned scale and bias.

  Reads params['global_units'] (size of the scale/bias vectors) and
  params['epsilon'] (added to the variance before the reciprocal sqrt).
  """

  def __init__(self, params):
    super().__init__()
    self._hidden_units = params['global_units']
    self._epsilon = params['epsilon']

  def build(self, input_shape):
    """Create the per-feature scale (ones) and bias (zeros) weights."""
    self.scale = self.add_weight(
        name='scale',
        shape=[self._hidden_units],
        initializer=tf.ones_initializer(),
        trainable=True)
    self.bias = self.add_weight(
        name='bias',
        shape=[self._hidden_units],
        initializer=tf.zeros_initializer(),
        trainable=True)
    super().build(input_shape)

  def call(self, inputs):
    """Normalise `inputs` along the last axis, then apply scale and bias."""
    mean, variance = tf.nn.moments(inputs, [-1], keepdims=True)
    inv_std = tf.math.rsqrt(variance + self._epsilon)
    return (inputs - mean) * inv_std * self.scale + self.bias

  def compute_output_shape(self, input_shape):
    """The output has the same shape as the input."""
    return input_shape


class EncoderBlock(tf.keras.Model):
  """Residual wrapper: dropout(sub_model(layer_norm(x))) + x.

  Args (constructor):
    SubModel: class that is instantiated as SubModel(params) and called with
      a ((x, masks), training=...) signature.
    params: configuration dict; 'dropout_rate' is read here, the rest is
      passed on to LayerNorm and SubModel.
    name: Keras name of this block.
  """

  def __init__(self, SubModel, params, name):
    super().__init__(name = name)
    self.layer_norm = LayerNorm(params)
    self.sub_model = SubModel(params)
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])

  def call(self, inputs, training):
    """Apply the wrapped sub-model to (x, masks) and add the residual."""
    residual, masks = inputs
    normed = self.layer_norm(residual)
    transformed = self.sub_model((normed, masks), training=training)
    return self.dropout(transformed, training=training) + residual


class MultiheadSelfAttention(tf.keras.Model):
  """Multi-head scaled dot-product self-attention over a neighbour-mixed input.

  Before the Q/K/V projection the feature axis is cut into three parts: the
  first third is taken from the previous timestep, the middle third from the
  current timestep and the remainder from the next timestep (zeros are used
  beyond either end of the sequence).

  Every size is taken from the `params` dict handed to the constructor, so an
  instance never depends on a module-level `params` variable.

  Constructor reads: 'hidden_units' (width of each of Q, K and V; must be
  divisible by 'num_heads'), 'global_units' (input/output width),
  'activation' (applied by the output projection), 'dropout_rate',
  'num_heads' and 'is_bidirectional'.

  Call args:
    inputs: tuple (x, masks). `x` is indexed as [batch, time, feature];
      `masks` is None or a [batch, time] tensor in which 0 marks padding.
    training: forwarded to the attention dropout.

  Returns:
    The output of `out_linear`, i.e. 'global_units' features per position.
  """

  def __init__(self, params):
    super().__init__()
    self.qkv_linear = tf.keras.layers.Dense(3*params['hidden_units'], name='qkv_linear')
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.out_linear = tf.keras.layers.Dense(params['global_units'], params['activation'], name='out_linear')
    self.num_heads = params['num_heads']
    self.is_bidirectional = params['is_bidirectional']
    # Kept on the instance so `call` does not reach for a global `params`.
    self.global_units = params['global_units']

  def call(self, inputs, training):
    x, masks = inputs
    batch_sz = tf.shape(x)[0]
    timesteps = tf.shape(x)[1]

    # x_left[t] = x[t-1] and x_right[t] = x[t+1], zero-filled at the borders.
    edge = tf.zeros((batch_sz, 1, self.global_units))
    x_left = tf.concat((edge, x[:, :-1, :]), axis=1)
    x_right = tf.concat((x[:, 1:, :], edge), axis=1)
    # Thirds of the feature axis (100 / 200 when global_units == 300); the
    # last slice absorbs the remainder when the width is not divisible by 3.
    third = self.global_units // 3
    x = tf.concat((x_left[:, :, :third],
                   x[:, :, third:2*third],
                   x_right[:, :, 2*third:]), axis=-1)

    q_k_v = self.qkv_linear(x)
    q, k, v = tf.split(q_k_v, 3, axis=-1)

    if self.num_heads > 1:
      # Fold the heads into the batch axis: [heads * batch, time, depth].
      q = tf.concat(tf.split(q, self.num_heads, axis=2), axis=0)
      k = tf.concat(tf.split(k, self.num_heads, axis=2), axis=0)
      v = tf.concat(tf.split(v, self.num_heads, axis=2), axis=0)

    align = tf.matmul(q, k, transpose_b=True)
    align *= tf.math.rsqrt(tf.cast(k.shape[-1], tf.float32))

    if (masks is not None) or (not self.is_bidirectional):
      paddings = tf.fill(tf.shape(align), float('-inf'))

    if masks is not None:
      # Key mask: padded key positions get -inf before the softmax.
      c_masks = tf.tile(masks, [self.num_heads, 1])
      c_masks = tf.tile(tf.expand_dims(c_masks, 1), [1, timesteps, 1])
      align = tf.where(tf.equal(c_masks, 0), paddings, align)

    if not self.is_bidirectional:
      # Causal mask: each position may only attend to itself and earlier ones.
      lower_tri = tf.ones((timesteps, timesteps))
      lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense()
      t_masks = tf.tile(tf.expand_dims(lower_tri, 0), [tf.shape(align)[0], 1, 1])
      align = tf.where(tf.equal(t_masks, 0), paddings, align)

    align = tf.nn.softmax(align)
    align = self.dropout(align, training=training)

    if masks is not None:
      # Query mask: zero the attention rows that belong to padded positions.
      q_masks = tf.tile(masks, [self.num_heads, 1])
      q_masks = tf.tile(tf.expand_dims(q_masks, 2), [1, 1, timesteps])
      align *= tf.cast(q_masks, tf.float32)

    x = tf.matmul(align, v)
    if self.num_heads > 1:
      # Undo the head folding: back to [batch, time, hidden_units].
      x = tf.concat(tf.split(x, self.num_heads, axis=0), axis=2)
    x = self.out_linear(x)

    return x
  

class PointwiseFFN(tf.keras.Model):
  """Two-layer feed-forward network applied to the last axis.

  The inner layer is params['multiplier'] * params['global_units'] wide and
  uses params['activation']; the output layer is linear and
  params['global_units'] wide. Dropout sits between the two layers.
  """

  def __init__(self, params):
    super().__init__()
    units = params['global_units']
    self.dense_1 = tf.keras.layers.Dense(params['multiplier']*units, params['activation'], name='fc')
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.dense_2 = tf.keras.layers.Dense(units, name='linear')

  def call(self, inputs, training):
    """`inputs` is an (x, masks) pair; the masks are accepted but not used."""
    x, _ = inputs
    hidden = self.dense_1(x)
    hidden = self.dropout(hidden, training=training)
    return self.dense_2(hidden)

In [9]:
class Model(tf.keras.Model):
  """Joint intent-classification and slot-tagging encoder.

  Word ids are embedded with a pretrained matrix, a sinusoidal position
  signal is added, and the result runs through params['num_layers'] pairs of
  (self-attention, feed-forward) EncoderBlocks. Intent logits are computed
  from a max over the time axis; slot logits are computed per position.

  The configuration needed at call time is copied onto the instance in
  `__init__`, so `call` does not depend on a module-level `params` variable.

  Args:
    params: configuration dict. Note that params['vocab_path'] is loaded
      with np.load and used as the initial embedding matrix.
  """

  def __init__(self, params: dict):
    super().__init__()
    self.embedding = tf.Variable(np.load(params['vocab_path']),
                                 dtype=tf.float32,
                                 name='pretrained_embedding')
    self.input_dropout = tf.keras.layers.Dropout(params['dropout_rate'])

    self.blocks = []
    for i in range(params['num_layers']):
      self.blocks.append(EncoderBlock(
          MultiheadSelfAttention, params, name='layer{}.1'.format(i+1)))
      self.blocks.append(EncoderBlock(
          PointwiseFFN, params, name='layer{}.2'.format(i+1)))

    self.intent_dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.fc_intent = tf.keras.layers.Dense(params['global_units'], params['activation'], name='fc_intent')
    self.out_linear_intent = tf.keras.layers.Dense(params['intent_size'], name='output_intent')
    self.out_linear_slot = tf.keras.layers.Dense(params['slot_size'], name='output_slot')

    # Scalars read by `call`; stored here instead of reading a global dict.
    self.global_units = params['global_units']
    self.is_embedding_scaled = params['is_embedding_scaled']

  def call(self, inputs, training):
    """Return (intent_logits, slot_logits) for a batch of word ids.

    Args:
      inputs: integer tensor of word ids indexed as [batch, time]; id 0 is
        treated as padding (the mask is the sign of the ids).
      training: enables dropout when true.
    """
    if inputs.dtype != tf.int32:
      inputs = tf.cast(inputs, tf.int32)
    # 1 for real tokens, 0 for padding (ids are non-negative).
    masks = tf.sign(inputs)

    x = tf.nn.embedding_lookup(self.embedding, inputs)
    if self.is_embedding_scaled:
      x *= tf.sqrt(tf.cast(self.global_units, tf.float32))
    x += get_timing_signal_1d(tf.shape(x)[1], self.global_units)
    x = self.input_dropout(x, training=training)

    for block in self.blocks:
      x = block((x, masks), training=training)

    # Intent head: max-pool over time, then a two-layer classifier.
    x_intent = tf.reduce_max(x, 1)
    x_intent = self.intent_dropout(x_intent, training=training)
    x_intent = self.out_linear_intent(self.fc_intent(x_intent))
    # Slot head: one linear projection per position.
    x_slot = self.out_linear_slot(x)
    return (x_intent, x_slot)

In [10]:
# Single configuration dict shared by the data pipeline, the model and the
# training loop.
params = {
  # --- files (relative to the working directory set in the first cell) ---
  'train_path': '../data/atis.train.w-intent.iob',
  'test_path': '../data/atis.test.w-intent.iob',
  'word_path': '../vocab/word.txt',      # one word per line
  'vocab_path': '../vocab/word.npy',     # loaded with np.load as the embedding matrix
  'intent_path': '../vocab/intent.txt',  # one intent label per line
  'slot_path': '../vocab/slot.txt',      # one slot tag per line

  # --- input pipeline ---
  'batch_size': 16,
  'num_samples': 4978,                   # shuffle buffer size; also sets the LR cycle length

  # --- architecture ---
  'num_layers': 2,                       # (attention, feed-forward) block pairs
  'global_units': 300,                   # model width: embeddings, residual stream, LayerNorm
  'hidden_units': 512,                   # width of each of Q, K and V
  'activation': tf.nn.elu,
  'num_heads': 8,
  'multiplier': 2,                       # feed-forward inner width = multiplier * global_units
  'dropout_rate': 0.1,
  'epsilon': 1e-6,                       # LayerNorm variance offset
  'is_bidirectional': True,              # False adds a causal attention mask
  'is_embedding_scaled': False,          # True multiplies embeddings by sqrt(global_units)

  # --- optimisation ---
  'clip_norm': 5.0,                      # global-norm gradient clipping threshold
}

In [11]:
# Load the three lookup tables (token -> line index) into the shared config.
params.update(
  word2idx=get_vocab(params['word_path']),
  intent2idx=get_vocab(params['intent_path']),
  slot2idx=get_vocab(params['slot_path']),
)

# Each size is one larger than its table: `data_generator` encodes entries
# missing from a table as len(table), so that extra index must exist.
params.update(
  word_size=len(params['word2idx']) + 1,
  intent_size=len(params['intent2idx']) + 1,
  slot_size=len(params['slot2idx']) + 1,
)

In [12]:
# Build the model, then run 64 epochs of joint intent/slot training. After
# every epoch the test split is scored (slot micro-F1 and intent accuracy)
# and the best slot F1 seen so far is tracked.
model = Model(params)
model.build(input_shape=(None, None))
pprint.pprint([(v.name, v.shape) for v in model.trainable_variables])

# Cyclical learning rate between 1e-4 and 8e-4; step_size is expressed in
# optimizer steps and equals 8 * num_samples // batch_size.
decay_lr = Triangular2CyclicalLearningRate(
  initial_learning_rate = 1e-4,
  maximal_learning_rate = 8e-4,
  step_size = 8 * params['num_samples'] // params['batch_size'],
)
# The constructor learning rate is only a placeholder: it is overwritten from
# `decay_lr` before every optimizer step.
optim = tf.optimizers.Adam(1e-4)
global_step = 0

# Best slot F1 so far and the intent accuracy measured in that same epoch.
slot_best_f1 = .0
intent_acc_with_that = .0

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

for n_epoch in range(1, 64+1):
  # TRAINING
  for (words, (intent, slots)) in dataset(is_training=True, params=params):
    with tf.GradientTape() as tape:
      y_intent, y_slots = model(words, training=True)
      loss_intent = tf.compat.v1.losses.softmax_cross_entropy(
        onehot_labels = tf.one_hot(intent, len(params['intent2idx'])+1),
        logits = y_intent,
        label_smoothing = .2)
      # weight of 'O' is set to be small
      # Positions whose slot id is 0 (which is also the batch padding value)
      # get weight 1e-2; every other position gets weight 1.
      weights = tf.cast(tf.sign(slots), tf.float32)
      padding = tf.constant(1e-2, tf.float32, weights.shape)
      weights = tf.where(tf.equal(weights, 0.), padding, weights)

      loss_slots = tf.compat.v1.losses.softmax_cross_entropy(
        onehot_labels = tf.one_hot(slots, len(params['slot2idx'])+1),
        logits = y_slots,
        weights = tf.cast(weights, tf.float32),
        label_smoothing = .2)
      # joint loss
      loss = loss_intent + loss_slots
    
    # Set this step's learning rate, then clip by global norm and apply.
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
    optim.apply_gradients(zip(grads, model.trainable_variables))

    # Log every 50 steps. "Spent" is the wall time since the previous log
    # line, so it also includes any evaluation that ran in between.
    if global_step % 50 == 0:
      logger.info("Step {} | Loss: {:.4f} | Loss_intent: {:.4f} | Loss_slots: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
          global_step, loss.numpy().item(), loss_intent.numpy().item(), loss_slots.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      t0 = time.time()
    global_step += 1
    
  # EVALUATION
  # Flat per-example (intent) and per-position (slot) label lists, collected
  # over the whole test split.
  intent_true = []
  intent_pred = []
  slot_true = []
  slot_pred = []

  for (words, (intent, slots)) in dataset(is_training=False, params=params):
    y_intent, y_slots = model(words, training=False)
    y_intent = tf.argmax(y_intent, -1)
    y_slots = tf.argmax(y_slots, -1)
    
    # Flattening keeps the padded positions; they carry slot id 0.
    intent_true += intent.numpy().flatten().tolist()
    intent_pred += y_intent.numpy().flatten().tolist()
    slot_true += slots.numpy().flatten().tolist()
    slot_pred += y_slots.numpy().flatten().tolist()
    
  # sample_weight = sign(true id) gives zero weight to every position whose
  # true slot id is 0, so those positions do not contribute to the score.
  f1_slots = f1_score(y_true = slot_true,
                      y_pred = slot_pred,
                      labels = list(params['slot2idx'].values()),
                      sample_weight = np.sign(slot_true),
                      average='micro',)
  
  acc_intent = accuracy_score(intent_true, intent_pred)

  logger.info("Slot F1: {:.3f}, Intent Acc: {:.3f}".format(f1_slots, acc_intent))

  # Full per-class reports every 8th epoch.
  # NOTE(review): `n_epoch != 1` is redundant because 1 % 8 != 0.
  if n_epoch != 1 and n_epoch % 8 == 0:
    logger.info('\n'+classification_report(y_true = intent_true,
                                          y_pred = intent_pred,
                                          labels = list(params['intent2idx'].values()),
                                          target_names = list(params['intent2idx'].keys()),
                                          digits=3))
    logger.info('\n'+classification_report(y_true = slot_true,
                                          y_pred = slot_pred,
                                          labels = list(params['slot2idx'].values()),
                                          target_names = list(params['slot2idx'].keys()),
                                          sample_weight = np.sign(slot_true),
                                          digits=3))
  
  # Model selection is driven by slot F1 only; the intent accuracy of that
  # same epoch is recorded alongside it.
  if f1_slots > slot_best_f1:
    slot_best_f1 = f1_slots
    intent_acc_with_that = acc_intent
    # you can save model here
  logger.info("Best Slot F1: {:.3f}, Intent Acc: {:.3f}".format(slot_best_f1, intent_acc_with_that))

[('layer1.1/layer_norm/scale:0', TensorShape([300])),
 ('layer1.1/layer_norm/bias:0', TensorShape([300])),
 ('layer1.1/multihead_self_attention/qkv_linear/kernel:0',
  TensorShape([300, 1536])),
 ('layer1.1/multihead_self_attention/qkv_linear/bias:0', TensorShape([1536])),
 ('layer1.1/multihead_self_attention/out_linear/kernel:0',
  TensorShape([512, 300])),
 ('layer1.1/multihead_self_attention/out_linear/bias:0', TensorShape([300])),
 ('layer1.2/layer_norm_1/scale:0', TensorShape([300])),
 ('layer1.2/layer_norm_1/bias:0', TensorShape([300])),
 ('layer1.2/pointwise_ffn/fc/kernel:0', TensorShape([300, 600])),
 ('layer1.2/pointwise_ffn/fc/bias:0', TensorShape([600])),
 ('layer1.2/pointwise_ffn/linear/kernel:0', TensorShape([600, 300])),
 ('layer1.2/pointwise_ffn/linear/bias:0', TensorShape([300])),
 ('layer2.1/layer_norm_2/scale:0', TensorShape([300])),
 ('layer2.1/layer_norm_2/bias:0', TensorShape([300])),
 ('layer2.1/multihead_self_attention_1/qkv_linear/kernel:0',
  TensorShape([300, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


INFO:tensorflow:Step 2500 | Loss: 1.4113 | Loss_intent: 1.1038 | Loss_slots: 0.3075 | Spent: 5.7 secs | LR: 0.000797
INFO:tensorflow:Step 2550 | Loss: 1.5920 | Loss_intent: 1.0999 | Loss_slots: 0.4920 | Spent: 3.0 secs | LR: 0.000783
INFO:tensorflow:Step 2600 | Loss: 1.5342 | Loss_intent: 1.1589 | Loss_slots: 0.3753 | Spent: 3.0 secs | LR: 0.000769
INFO:tensorflow:Step 2650 | Loss: 1.5055 | Loss_intent: 1.1447 | Loss_slots: 0.3608 | Spent: 3.0 secs | LR: 0.000755
INFO:tensorflow:Step 2700 | Loss: 1.4146 | Loss_intent: 1.0957 | Loss_slots: 0.3189 | Spent: 2.9 secs | LR: 0.000741
INFO:tensorflow:Step 2750 | Loss: 1.4632 | Loss_intent: 1.1117 | Loss_slots: 0.3515 | Spent: 3.0 secs | LR: 0.000727
INFO:tensorflow:Step 2800 | Loss: 1.5773 | Loss_intent: 1.1003 | Loss_slots: 0.4770 | Spent: 3.1 secs | LR: 0.000713
Reading ../data/atis.test.w-intent.iob
INFO:tensorflow:Slot F1: 0.917, Intent Acc: 0.969
INFO:tensorflow:Best Slot F1: 0.922, Intent Acc: 0.965
Reading ../data/atis.train.w-intent.i