In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import time
import numpy as np
import matplotlib.pyplot as plt

BUFFER_SIZE = 20000
BATCH_SIZE = 64
NUM_EPOCHS = 20
HEAD_NUM = 8
D_MODEL = 512
DFF = 2048
NUM_LAYERS = 6
DROPOUT_RATE = 0.1
VOCAB_SIZE_EN = 2**13
VOCAB_SIZE_PT = 2**13
MAX_LENGTH=40
tf.random.set_seed(1234)


In [2]:
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                              as_supervised=True)

train_dataset = examples['train']
val_dataset = examples['validation']


In [3]:
def encode(lang1, lang2):
  lang1 = tf.strings.unicode_split(lang1, 'UTF-8')
  lang2 = tf.strings.unicode_split(lang2, 'UTF-8')

  lang1 = vocab_pt.lookup(lang1)
  lang2 = vocab_en.lookup(lang2)

  return lang1, lang2

def tf_encode(pt, en):
    return tf.py_function(encode, [pt, en], [tf.int64, tf.int64])

vocab_pt = tf.keras.layers.experimental.preprocessing.StringLookup(
    max_tokens=VOCAB_SIZE_PT, mask_token=None)
vocab_en = tf.keras.layers.experimental.preprocessing.StringLookup(
    max_tokens=VOCAB_SIZE_EN, mask_token=None)

train_dataset = train_dataset.map(tf_encode)
train_dataset = train_dataset.filter(lambda x, y: tf.logical_and(tf.size(x) <= MAX_LENGTH, tf.size(y) <= MAX_LENGTH))
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_dataset.map(tf_encode)
val_dataset = val_dataset.filter(lambda x, y: tf.logical_and(tf.size(x) <= MAX_LENGTH, tf.size(y) <= MAX_LENGTH))


val_dataset = val_dataset.padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))


In [4]:
def positional_encoding(position, d_model):
    angle_rates = 1 / np.power(10000, (2 * (np.arange(d_model) // 2)) / np.float32(d_model))

    angle_rads = np.arange(position)[:, np.newaxis] * angle_rates[np.newaxis, :]
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2]) # 2i
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2]) # 2i+1

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


In [5]:
def create_model(vocab_size, head_num, d_model, dff, num_layers, dropout_rate):
    inputs = tf.keras.layers.Input(shape=(None,))
    padding_mask = tf.keras.layers.Lambda(lambda x: tf.cast(tf.math.equal(x, 0), dtype=tf.float32))(inputs)

    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))

    positional_encoding = positional_encoding(embeddings.shape[1], d_model)
    embeddings += positional_encoding

    outputs = embeddings
    for i in range(num_layers):
        outputs = encoder_layer(outputs, head_num, d_model, dff, dropout_rate, padding_mask)

    outputs = tf.keras.layers.Dense(vocab_size)(outputs)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

def encoder_layer(inputs, head_num, d_model, dff, dropout_rate, padding_mask):
    attention = MultiHeadAttention(head_num, d_model)([inputs, inputs, inputs, padding_mask])
    attention = tf.keras.layers.Dropout(dropout_rate)(attention)
    attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attention)

    outputs = tf.keras.layers.Dense(dff, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(d_model)(outputs)
    outputs = tf.keras.layers.Dropout(dropout_rate)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention + outputs)

    return outputs

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, head_num, d_model):
        super(MultiHeadAttention, self).__init__()
        self.head_num = head_num
        self.d_model = d_model

        assert d_model % head_num == 0

        self.depth = d_model // head_num

        self.dense_q = tf.keras.layers.Dense(d_model)
        self.dense_k = tf.keras.layers.Dense(d_model)
        self.dense_v = tf.keras.layers.Dense(d_model)

        self.dense_output = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.head_num, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        q, k, v, padding_mask = inputs
        batch_size = tf.shape(q)[0]

        q = self.dense_q(q)
        k = self.dense_k(k)
        v = self.dense_v(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, padding_mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        outputs = self.dense_output(concat_attention)

        return outputs


在模型实例化后，我们可以使用Adam优化器和SparseCategoricalCrossentropy损失函数对模型进行编译。


In [6]:
model = create_model(VOCAB_SIZE_PT, HEAD_NUM, D_MODEL, DFF, NUM_LAYERS, DROPOUT_RATE)
optimizer = tf.keras.optimizers.Adam(beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
model.compile(optimizer=optimizer, loss=loss_function)


UnboundLocalError: ignored

In [7]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

for epoch in range(NUM_EPOCHS):
  print('Epoch {}/{}'.format(epoch+1, NUM_EPOCHS))
  start = time.time()

  train_loss.reset_states()
  train_accuracy.reset_states()
  val_loss.reset_states()
  val_accuracy.reset_states()

  for (batch, (inputs, targets)) in enumerate(train_dataset):
      with tf.GradientTape() as tape:
          predictions = model(inputs, training=True)
          loss = loss_function(targets, predictions)

      gradients = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(zip(gradients, model.trainable_variables))

      train_loss(loss)
      train_accuracy(targets, predictions)

      if batch % 50 == 0:
          print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch+1, batch, train_loss.result(), train_accuracy.result()))

  for (batch, (inputs, targets)) in enumerate(val_dataset):
      predictions = model(inputs, training=False)
      loss = loss_function(targets, predictions)

      val_loss(loss)
      val_accuracy(targets, predictions)

      if batch % 50 == 0:
        print('Epoch {} Validation Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch+1, batch, val_loss.result(), val_accuracy.result()))

  print('Epoch {} Loss {:.4f} Accuracy {:.4f} Validation Loss {:.4f} Validation Accuracy {:.4f}'.format(epoch+1, train_loss.result(), train_accuracy.result(), val_loss.result(), val_accuracy.result()))
  print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


Epoch 1/20


UnknownError: ignored

In [None]:
def evaluate(inp_sentence):
    start_token = [vocab_pt.vocab_size]
    end_token = [vocab_pt.vocab_size + 1]

    inp_sentence = start_token + vocab_pt.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)

    decoder_input = [vocab_en.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(MAX_LENGTH):
        predictions = model(inputs=[encoder_input, output], training=False)

        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if predicted_id == vocab_en.vocab_size+1:
            return tf.squeeze(output, axis=0)

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)

def translate(sentence):
    result = evaluate(sentence).numpy()

    predicted_sentence = vocab_en.decode([i for i in result if i < VOCAB_SIZE_EN])

    print('Input: {}'.format(sentence))
    print('Output: {}'.format(predicted_sentence))
