Build a simplified Transformer Encoder using TensorFlow 2 and apply it to generate contextualized text embeddings for input tokens.

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Toy batch: one tokenized sentence of six token ids
# (the trailing zeros are presumably padding -- confirm against the tokenizer).
sample_input = tf.constant([[3, 4, 7, 9, 0, 0]])

# Hyperparameters consumed by the model-building code in the cells that follow.
vocab_size = 20    # number of distinct token ids (Embedding input_dim)
maxlen = 6         # tokens per sequence; matches sample_input's second dimension
embed_dim = 64     # width of each token embedding
ff_dim = 128       # hidden width of the encoder's feed-forward sub-layer
num_heads = 4      # number of self-attention heads

In [6]:
#Positional encoding layer
class PositionalEncoding(tf.keras.layers.Layer):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    For position ``pos`` and feature index ``j`` the encoding is

        angle(pos, j) = pos / 10000 ** (2 * (j // 2) / embed_dim)
        PE(pos, j)    = sin(angle) if j is even else cos(angle)

    so sine and cosine values are interleaved along the feature axis.
    The layer has no trainable weights.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The layer is a purely element-wise addition, so any mask produced
        # upstream remains valid and can be forwarded unchanged.
        self.supports_masking = True

    def call(self, inputs):
        """Return ``inputs`` plus the positional encoding.

        Args:
            inputs: rank-3 tensor indexed as (batch, position, feature);
                axes 1 and 2 are read as sequence length and embedding width.

        Returns:
            Tensor with the same shape and dtype as ``inputs``.
        """
        seq_len = tf.shape(inputs)[1]
        embed_dim = tf.shape(inputs)[2]

        # Column vector of positions and row vector of feature indices so the
        # angle table broadcasts to (seq_len, embed_dim).
        position = tf.cast(tf.range(seq_len), tf.float32)[:, tf.newaxis]
        feature_idx = tf.range(embed_dim)[tf.newaxis, :]

        # Feature pairs (2k, 2k + 1) share one frequency.
        exponent = tf.cast(2 * (feature_idx // 2), tf.float32) / tf.cast(embed_dim, tf.float32)
        angle_rads = position / tf.pow(10000.0, exponent)

        # Apply sin to even indices and cos to odd indices. Selecting per
        # column keeps the two truly interleaved (also for odd embed_dim),
        # instead of stacking all sines before all cosines.
        is_even = tf.equal(feature_idx % 2, 0)
        pos_encoding = tf.where(is_even, tf.math.sin(angle_rads), tf.math.cos(angle_rads))

        # Add the batch dimension and match the input dtype so the addition
        # also works when inputs are not float32 (e.g. mixed precision).
        pos_encoding = tf.cast(pos_encoding[tf.newaxis, ...], inputs.dtype)

        return inputs + pos_encoding


In [7]:
#Transformer encoder block
class TransformerEncoder(tf.keras.layers.Layer):
    """One post-norm Transformer encoder block.

    Multi-head self-attention followed by a two-layer feed-forward network,
    each wrapped in a residual connection and layer normalization.

    Args:
        embed_dim: width of the incoming (and outgoing) token representations.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, **kwargs):
        super().__init__(**kwargs)
        # Keras' `key_dim` is the size of *each* head, not of the whole model.
        # Splitting embed_dim across the heads keeps the attention projections
        # at roughly embed_dim total width; floor division (min 1) keeps
        # non-divisible configurations working.
        head_dim = max(1, embed_dim // num_heads)
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=head_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()
        # Output keeps the input's sequence axis, so an incoming mask can be
        # passed on to later layers unchanged.
        self.supports_masking = True

    def call(self, x, mask=None):
        """Encode a batch of token representations.

        Args:
            x: tensor of shape (batch, seq_len, embed_dim).
            mask: optional (batch, seq_len) tensor; truthy entries mark real
                tokens and falsy entries mark padding that must not be
                attended to. ``None`` (the default) attends to every position.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attention_mask = None
        if mask is not None:
            # (batch, 1, seq_len): broadcast over query positions so every
            # query ignores the padded key/value positions.
            attention_mask = tf.cast(mask, tf.bool)[:, tf.newaxis, :]

        attn_output = self.att(x, x, attention_mask=attention_mask)  # Self-attention
        out1 = self.norm1(x + attn_output)                           # Add & norm
        ffn_output = self.ffn(out1)                                  # Feed-forward network
        return self.norm2(out1 + ffn_output)                         # Add & norm


In [8]:
# Assemble the encoder: token ids -> embeddings -> (+ positions) -> encoder block.
# Token ids are integers, so declare the input as int32 instead of relying on
# the default float32 input being cast back to integers inside Embedding.
inputs = tf.keras.Input(shape=(maxlen,), dtype="int32")
x = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)   # Word embeddings
x = PositionalEncoding()(x)                                                         # Add positional encoding
x = TransformerEncoder(embed_dim, num_heads, ff_dim)(x)                             # Transformer block
# Maps (batch, maxlen) token ids to (batch, maxlen, embed_dim) contextual embeddings.
model = tf.keras.Model(inputs, x)


In [9]:
# Run the sample batch through the encoder (weights are untrained, so the
# exact numbers differ from run to run).
output_embeddings = model(sample_input)

# Preview: first five features of the first token in the first sequence.
first_token_preview = np.round(output_embeddings[0, 0, :5].numpy(), 3)

print("Output Embedding Shape:", output_embeddings.shape)
print("Token Embedding for first token:", first_token_preview)

Output Embedding Shape: (1, 6, 64)
Token Embedding for first token: [-0.403 -0.525 -0.902  0.09  -0.777]
