In [None]:
import tensorflow as tf
import numpy as np
import os
import re
import string
import random
import time

# (mini) BERT

(Without the next sentence prediction task, and without the [CLS] and [SEP] tokens)

Keras provides a MultiHeadAttention layer (at the time of writing, only in [tf-nightly](https://www.tensorflow.org/api_docs/python/tf/keras/layers/MultiHeadAttention)). Let's try it in this code just for fun.

In [None]:
!pip install tf-nightly

You must restart the runtime after the installation finishes, then continue from the next cell.

In [None]:
import tensorflow as tf
import numpy as np
import os
import re
import string
import random
import time

## Preprocessing

In [None]:
# Model and training hyperparameters used by the cells below.
embedding_dims = 128  # size of each token embedding vector
num_heads = 8  # number of attention heads in each BertLayer
expanded_dims = 256  # hidden width of the pointwise feed-forward network
num_layers = 1  # number of stacked BertLayer blocks
batch_size = 32  # examples per training batch

In [None]:
import tensorflow_datasets as tfds

# Load the train and test splits of the IMDB reviews together. batch_size=-1
# returns the whole dataset as in-memory tensors instead of a tf.data.Dataset.
imdb_data = tfds.load(
    name="imdb_reviews", 
    split='train+test', batch_size=-1)
# Only the review text is kept for the masked-language-model task.
text_tf = imdb_data["text"]

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteN4RY9K/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteN4RY9K/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteN4RY9K/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


Remember how to use TextVectorization layer? Same as Lab3

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

def custom_standardization(input_string):
    """Lowercase the text, replace HTML line breaks and strip punctuation.

    Square brackets are deliberately kept: stripping them would turn the
    special "[mask]" token into the ordinary word "mask", so the mask token
    would collide with a real vocabulary entry.

    Args:
        input_string: string tensor of raw text.

    Returns:
        String tensor with the standardized text.
    """
    lowercase = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    # Every punctuation character except "[" and "]".
    punctuation = string.punctuation.replace("[", "").replace("]", "")
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(punctuation), '')


vocabulary_size = 30000  # Only consider the top 30k words
max_len = 100 # Max sequence size

# Turns each review into a sequence of max_len integer token ids.
vectorize_layer = TextVectorization(standardize=custom_standardization,
                                    max_tokens=vocabulary_size,
                                    output_mode="int",
                                    output_sequence_length=max_len,
)
# Build the vocabulary from the raw review text.
vectorize_layer.adapt(text_tf)
vocab = vectorize_layer.get_vocabulary()

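Let's add the [mask] token. We strip the first two entries of the vocabulary list (the empty padding token and [UNK]) because the layer manages those itself, and we drop the last word to make room for [mask].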

In [None]:
# Show the first two vocabulary entries (the output below is an empty string
# followed by [UNK]).
print(vocab[0])
print(vocab[1])


[UNK]


In [None]:
# Rebuild the list from the layer's current vocabulary rather than from the
# `vocab` variable, so re-running this cell yields the same result instead of
# slicing off two more words every time.
# [2:-1] drops the two leading entries ('' and [UNK]) and the last entry,
# which leaves room for the "[mask]" token appended at the end.
vocab = vectorize_layer.get_vocabulary()[2:-1] + ["[mask]"]
vectorize_layer.set_vocabulary(vocab)
# Look the id up through the layer so it always matches the actual encoding.
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]

If we want to load a dataset as an array of Tensors (in memory) use batch_size=-1

To transform the tensor to a numpy array simply use tfds.as_numpy() function

In [None]:
# Convert the in-memory tensor of reviews to a NumPy array.
text = tfds.as_numpy(text_tf)

With numpy you can directly use the vectorize layer. It returns a tensor

In [None]:
# Tokenize every review into integer ids.
# NOTE(review): this rebinds text_tf from raw strings to token ids, so earlier
# cells that read text_tf will see token ids if they are re-run afterwards.
text_tf = vectorize_layer(text)

Let's turn the tensor back to numpy. This time it is **not a dataset** but a simple tensor, so you can use the numpy() function

In [None]:
# Materialize the token-id tensor as a NumPy array for the masking step.
text_tokens = text_tf.numpy()

Why keep switching between tensors, datasets and NumPy? The best approach is to stay in one representation from the beginning. However, complex preprocessing inside a dataset pipeline must be written entirely with tf.xxx functions, so it is easier to work with NumPy arrays here:

In [None]:
def get_masked_input_and_labels(encoded_texts):
    """Build masked-language-model inputs, targets and loss weights.

    About 15% of the positions (never those holding a token id <= 2) are
    selected for prediction. Of the selected positions roughly 80% are
    replaced by the mask token, 10% by a random token id and 10% are left
    unchanged.

    Args:
        encoded_texts: 2-D integer array of token ids, one row per text.

    Returns:
        Tuple (encoded_texts_masked, y_labels, sample_weights): the corrupted
        inputs, an untouched copy of the original ids, and a float array that
        is 1 at the selected positions and 0 everywhere else.
    """
    num_rows, num_cols = encoded_texts.shape[0], encoded_texts.shape[1]

    # Pick ~15% of all positions, then exclude the low (special) token ids.
    selected = np.random.rand(num_rows, num_cols) < 0.15
    selected[encoded_texts <= 2] = False

    # Targets default to -1 ("ignore"); selected positions carry the true id.
    targets = np.full(encoded_texts.shape, -1, dtype=int)
    targets[selected] = encoded_texts[selected]

    # 90% of the selected positions are overwritten with the mask token,
    # which leaves the remaining 10% unchanged.
    corrupted = np.copy(encoded_texts)
    to_mask = selected & (np.random.rand(num_rows, num_cols) < 0.90)
    corrupted[to_mask] = mask_token_id

    # One ninth of those (10% of the selected positions overall) get a random
    # token id drawn from [3, mask_token_id) instead of the mask token.
    to_random = to_mask & (np.random.rand(num_rows, num_cols) < 1 / 9)
    corrupted[to_random] = np.random.randint(3, mask_token_id, to_random.sum())

    # Weight 1 where a prediction is required, 0 where the target is ignored.
    loss_weights = np.where(targets == -1, 0.0, 1.0)

    # The labels handed to the loss are simply the original token ids.
    return corrupted, np.copy(encoded_texts), loss_weights

# Apply the masking once, up front, to the whole tokenized corpus.
x_masked_train, y_masked_labels, sample_weights = get_masked_input_and_labels(text_tokens)

Let's bring it back to a Dataset! A nice (but complex) task would be to stay in a Dataset from the beginning and rewrite get_masked_input_and_labels as a pure TensorFlow function (without NumPy operations). Otherwise you can keep the function as it is, BUT you need to add a decorator and wrap it with an [extra function](https://www.tensorflow.org/api_docs/python/tf/numpy_function).

If the dataset is bigger than memory, you NEED to use only dataset operations.

In [None]:
# Each dataset element is (masked input ids, original ids, per-token weights).
mlm_ds = tf.data.Dataset.from_tensor_slices(
    (x_masked_train, y_masked_labels, sample_weights)
)
# Shuffle with a 1000-element buffer, then group into batches.
mlm_ds = mlm_ds.shuffle(1000).batch(batch_size)

## Modeling

Same functions as last lab!

In [None]:
def pointwise_ffn(embedding_dims, expanded_dims):
  """Return the two-layer feed-forward network used inside a BertLayer.

  Args:
    embedding_dims: output width of the second Dense layer.
    expanded_dims: width of the first (ReLU) Dense layer.

  Returns:
    A tf.keras.Sequential of Dense(expanded_dims, relu) -> Dense(embedding_dims).
  """
  expand = tf.keras.layers.Dense(expanded_dims, activation='relu')
  project = tf.keras.layers.Dense(embedding_dims)
  return tf.keras.Sequential([expand, project])

In [None]:
def get_angles(pos, i, d_model):
  """Compute the sinusoidal positional-encoding angles.

  Args:
    pos: position indices.
    i: embedding-dimension indices (broadcast against pos).
    d_model: embedding size used to scale the exponent.

  Returns:
    pos / 10000 ** (2 * (i // 2) / d_model), broadcast over pos and i.
  """
  # Dimensions 2k and 2k+1 share the same exponent.
  exponent = (2 * (i // 2)) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

In [None]:
def positional_encoding(position, dimensions):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to encode.
    dimensions: embedding size (columns).

  Returns:
    float32 tensor of shape (1, position, dimensions); even columns hold the
    sine of the angle, odd columns the cosine.
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(dimensions)[np.newaxis, :]
  table = get_angles(positions, dims, dimensions)

  # Even columns (2i) take the sine, odd columns (2i+1) the cosine.
  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # Leading axis of size 1 so the table broadcasts over the batch.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

Introduce the BertLayer (Equivalent to the EncoderLayer from the Transformer)

In [None]:
class BertLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention followed by a feed-forward network.

  Each of the two sub-layers is followed by dropout, a residual connection
  and layer normalization.

  Args:
    embedding_dims: size of the token embeddings.
    num_heads: number of attention heads.
    expanding_dims: hidden width of the pointwise feed-forward network.
    rate: dropout rate applied after each sub-layer.
  """

  def __init__(self, embedding_dims, num_heads, expanding_dims, rate=0.1):
    super().__init__()

    self.mha = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim= embedding_dims // num_heads, # we called it depth in our tutorial
    )
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.ffn = pointwise_ffn(embedding_dims, expanding_dims)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x, training=None):
    """Run the block on x.

    Args:
      x: input embeddings.
      training: whether dropout is active. Defaults to None so the layer can
        also be called as `layer(x)`; Keras then decides from the context.

    Returns:
      Tensor with the same shape as x.
    """
    # Self-attention: query, value and key are all the same sequence.
    attn_output = self.mha(x, x, x)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)

    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)   # (batch_size, input_seq_len, embedding_dims)

    return out2

Almost identical to the Encoder in the Transformer from last week. Only difference is that the output is not the encoder embedding, but it goes through a Dense final layer

In [None]:
class Bert(tf.keras.Model):
  """Minimal BERT-style encoder with a masked-language-model output head.

  Token embeddings plus sinusoidal positional encodings go through a stack of
  BertLayer blocks; a final softmax Dense layer predicts a distribution over
  the vocabulary at every position.

  Args:
    num_layers: number of BertLayer blocks.
    embedding_dims: size of the token embeddings.
    num_heads: number of attention heads per block.
    expanded_dims: hidden width of each block's feed-forward network.
    vocabulary_size: number of token ids (embedding rows and output classes).
    rate: dropout rate used inside each block.
  """

  def __init__(self, num_layers, embedding_dims, num_heads, expanded_dims, vocabulary_size, rate=0.1):
    super().__init__()
    self.embedding_dims = embedding_dims
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(vocabulary_size, embedding_dims)
    # The table covers max_len positions (module-level constant).
    self.pos_encoding = positional_encoding(max_len, embedding_dims)
    self.bert_layers = [BertLayer(embedding_dims, num_heads, expanded_dims, rate)
                        for _ in range(num_layers)]
    self.mlm_output = tf.keras.layers.Dense(vocabulary_size, activation="softmax")

  def call(self, input, training=None):
    """Return per-position vocabulary probabilities.

    Args:
      input: integer token ids of shape (batch, seq_len), seq_len <= max_len.
        (The parameter name is kept for backward compatibility.)
      training: whether dropout is active. Defaults to None so the model can
        also be called as `model(x)`.

    Returns:
      Tensor of shape (batch, seq_len, vocabulary_size) with softmax
      probabilities.
    """
    # Slice the positional table to the actual sequence length so inputs
    # shorter than max_len work as well.
    seq_len = tf.shape(input)[1]
    encoder_output = self.embedding(input) + self.pos_encoding[:, :seq_len, :]

    for bert_layer in self.bert_layers:
      encoder_output = bert_layer(encoder_output, training)

    return self.mlm_output(encoder_output)

bert = Bert(num_layers, embedding_dims, num_heads, expanded_dims, vocabulary_size)

In [None]:
# The model's output layer already applies softmax, so the loss is fed
# probabilities (the default from_logits=False) rather than logits.
loss_function = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(0.001)

Using the sample weights in the loss is way simpler than what we did in labs 5 and 7, where we created the mask ourselves with zeros and ones. However, it means more work in the preprocessing step. Your choice!

In [None]:
EPOCHS = 4
# take() stops after this many batches, so any trailing partial batch is skipped.
steps_per_epoch = len(text) // batch_size

for epoch in range(EPOCHS):
  start = time.time()
  epoch_loss = 0

  # The batch of token ids is named `token_batch` rather than `input`, so the
  # built-in input() is not overwritten in the notebook's global namespace.
  for (batch, (token_batch, labels, weights)) in enumerate(mlm_ds.take(steps_per_epoch)):

    with tf.GradientTape() as tape:
      predictions = bert(token_batch, training=True)
      # weights is 0 at positions that were not selected for masking, so only
      # the selected positions contribute to the loss.
      batch_loss = loss_function(labels, predictions, sample_weight=weights)

    gradients = tape.gradient(batch_loss, bert.trainable_variables)
    optimizer.apply_gradients(zip(gradients, bert.trainable_variables))
    epoch_loss += batch_loss

    if batch % 30 == 0:
      print (f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy()}')

  print ('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                       epoch_loss / steps_per_epoch))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.3119986057281494
Epoch 1 Batch 30 Loss 1.0304683446884155
Epoch 1 Batch 60 Loss 0.893552839756012
Epoch 1 Batch 90 Loss 0.964191198348999
Epoch 1 Batch 120 Loss 0.9507972598075867
Epoch 1 Batch 150 Loss 1.0428802967071533
Epoch 1 Batch 180 Loss 0.9928719401359558
Epoch 1 Batch 210 Loss 1.0061718225479126
Epoch 1 Batch 240 Loss 0.9237513542175293
Epoch 1 Batch 270 Loss 0.9374967813491821
Epoch 1 Batch 300 Loss 1.0467067956924438
Epoch 1 Batch 330 Loss 1.0524653196334839
Epoch 1 Batch 360 Loss 0.9583702087402344
Epoch 1 Batch 390 Loss 1.003765344619751
Epoch 1 Batch 420 Loss 0.8757596015930176
Epoch 1 Batch 450 Loss 0.9699009656906128
Epoch 1 Batch 480 Loss 0.9271793961524963
Epoch 1 Batch 510 Loss 1.000502109527588
Epoch 1 Batch 540 Loss 0.9517737030982971
Epoch 1 Batch 570 Loss 0.9355146884918213
Epoch 1 Batch 600 Loss 0.8446342349052429
Epoch 1 Batch 630 Loss 0.9135710000991821
Epoch 1 Batch 660 Loss 0.8232740759849548
Epoch 1 Batch 690 Loss 0.9487752318382263
E

In [None]:
# Example sentence with one masked position for the model to fill in.
sentence = "I have watched this [mask] and it was awesome"
sample_tokens = vectorize_layer([sentence])

In [None]:
# Softmax output of the model for every position of the sample sentence.
prediction = bert.predict(sample_tokens)

In [None]:
# np.where returns (row indices, column indices); the column indices are the
# positions of the mask token inside the sentence.
masked_index = np.where(sample_tokens == mask_token_id)
masked_index = masked_index[1]
# Predicted distribution(s) at the masked position(s) of the first sentence.
mask_prediction = prediction[0][masked_index]

In [None]:
# Ids of the 5 highest-scoring tokens for the first masked position, best first.
top_indices = mask_prediction[0].argsort()[-5 :][::-1]

In [None]:
# Lookup table from token id to token string.
id2token = dict(enumerate(vectorize_layer.get_vocabulary()))

In [None]:
# Print the top candidate tokens for the masked word.
for t in top_indices:
    print(id2token[t])

movie
film
one
show
review
