In [1]:
import tensorflow as tf

class BertLikeModel(tf.keras.Model):
    def __init__(self, vocab_size, hidden_size=768, num_layers=12, num_heads=12, intermediate_size=3072, max_position_embeddings=512, num_token_types=2, dropout_rate=0.1):
        super(BertLikeModel, self).__init__()

        self.embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=hidden_size, mask_zero=True)
        self.position_embedding_layer = tf.keras.layers.Embedding(input_dim=max_position_embeddings, output_dim=hidden_size)
        self.token_type_embedding_layer = tf.keras.layers.Embedding(input_dim=num_token_types, output_dim=hidden_size)

        self.encoder_layers = []
        for _ in range(num_layers):
            self_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_size // num_heads, dropout=dropout_rate)
            intermediate_dense = tf.keras.layers.Dense(units=intermediate_size, activation='gelu')
            output_dense = tf.keras.layers.Dense(units=hidden_size)
            layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-12)
            layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-12)
            dropout1 = tf.keras.layers.Dropout(rate=dropout_rate)
            dropout2 = tf.keras.layers.Dropout(rate=dropout_rate)

            encoder_layer = tf.keras.Sequential([
                self_attention,
                dropout1,
                layer_norm1,
                tf.keras.layers.Add(),
                intermediate_dense,
                output_dense,
                dropout2,
                layer_norm2,
                tf.keras.layers.Add()
            ])
            self.encoder_layers.append(encoder_layer)

        self.pooler = tf.keras.layers.Dense(units=hidden_size, activation='tanh')

    def call(self, inputs, token_type_ids=None, training=False):
        mask = tf.math.not_equal(inputs, 0)
        embeddings = self.embedding_layer(inputs)
        position_embeddings = self.position_embedding_layer(tf.range(tf.shape(inputs)[-1]))
        if token_type_ids is not None:
            token_type_embeddings = self.token_type_embedding_layer(token_type_ids)
        else:
            token_type_embeddings = 0
        embeddings = embeddings + position_embeddings + token_type_embeddings

        for encoder_layer in self.encoder_layers:
            embeddings = encoder_layer(embeddings, training=training)

        pooled_output = self.pooler(embeddings[:, 0, :])  # Taking [CLS] token's representation for pooling

        return pooled_output

# Create an instance of the BertLikeModel
vocab_size = 30522  # Example vocabulary size
bert_model = BertLikeModel(vocab_size)

# Example input
input_ids = tf.constant([[101, 123, 456, 789, 102], [101, 234, 567, 890, 102]], dtype=tf.int32)
token_type_ids = tf.constant([[0, 0, 0, 0, 0], [0, 1, 1, 1, 1]], dtype=tf.int32)  # Example token type IDs

# Obtain model outputs
outputs = bert_model(input_ids, token_type_ids)

print("Model Output Shape:", outputs.shape)


TypeError: Exception encountered when calling layer 'sequential' (type Sequential).

MultiHeadAttention.call() missing 1 required positional argument: 'value'

Call arguments received by layer 'sequential' (type Sequential):
  • inputs=tf.Tensor(shape=(2, 5, 768), dtype=float32)
  • training=False
  • mask=None