In [1]:
import tensorflow as tf
import keras

In [2]:
# a = tf.constant([[1,2,3],[4,5,6]], tf.int32)
# b = tf.constant([2,2], tf.int32)
# tf.tile(a, b)

In [3]:
import tensorflow as tf
from tensorflow import keras

class SelfAttention(keras.Model):
    def __init__(self, embed_size, head):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.head = head
        self.head_dim = embed_size // head

        assert (self.embed_size == self.head * self.head_dim), "Embedding size needs to be divisible by heads"

        self.values = keras.layers.Dense(units=embed_size)
        self.keys = keras.layers.Dense(units=embed_size)
        self.query = keras.layers.Dense(units=embed_size)
        self.fc = keras.layers.Dense(units=embed_size)

    def call(self, values, keys, query, mask):
        N = tf.shape(query)[0]
        value_len, key_len, query_len = tf.shape(values)[1], tf.shape(keys)[1], tf.shape(query)[1]

        # Linear transformations
        values = self.values(values)  # shape: (N, value_len, embed_size)
        keys = self.keys(keys)        # shape: (N, key_len, embed_size)
        queries = self.query(query)   # shape: (N, query_len, embed_size)

        # Reshape for multi-head attention
        values = tf.reshape(values, (N, value_len, self.head, self.head_dim))
        keys = tf.reshape(keys, (N, key_len, self.head, self.head_dim))
        queries = tf.reshape(queries, (N, query_len, self.head, self.head_dim))

        # Compute attention scores
        energy = tf.einsum("nqhd,nkhd->nhqk", queries, keys)

        # Apply the mask if present
        if mask is not None:
            mask = tf.cast(mask, dtype=tf.bool)
            energy = tf.where(mask, energy, tf.fill(tf.shape(energy), -1e9))

        # Attention weights
        attention = tf.nn.softmax(energy / tf.sqrt(tf.cast(self.head_dim, dtype=tf.float32)), axis=-1)

        # Weighted sum of values
        out = tf.einsum("nhqk,nkhd->nqhd", attention, values)
        out = tf.reshape(out, (N, query_len, self.head * self.head_dim))

        # Final linear layer
        out = self.fc(out)
        return out

class TransformerBlock(keras.Model):
    def __init__(self, embed_size, head, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention_layer = SelfAttention(embed_size, head)
        self.norm1 = keras.layers.LayerNormalization(axis=-1)
        self.norm2 = keras.layers.LayerNormalization(axis=-1)
        self.feed_forward = keras.Sequential([
            keras.layers.Dense(units=forward_expansion * embed_size, activation="relu"),
            keras.layers.Dense(units=embed_size)
        ])
        self.dropout = keras.layers.Dropout(dropout)

    def call(self, value, key, query, mask):
        attention = self.attention_layer(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))

        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class Encoder(keras.Model):
    def __init__(self, src_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_length):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.word_embedding = keras.layers.Embedding(input_dim=src_vocab_size, output_dim=embed_size)
        self.position_embedding = keras.layers.Embedding(input_dim=max_length, output_dim=embed_size)
        self.encoder_layers = [TransformerBlock(embed_size, heads, dropout, forward_expansion) for _ in range(num_layers)]
        self.dropout = keras.layers.Dropout(dropout)

    def call(self, x, mask):
        N = tf.shape(x)[0]
        seq_length = tf.shape(x)[1]

        # Generate positions
        positions = tf.range(start=0, limit=seq_length, delta=1)
        positions = tf.expand_dims(positions, axis=0)
        positions = tf.tile(positions, [N, 1])

        # Apply embeddings
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # Pass through transformer layers
        for layer in self.encoder_layers:
            out = layer(out, out, out, mask)

        return out

class DecoderBlock(keras.Model):
    def __init__(self, embed_size, heads, forward_expansion, dropout):
        super(DecoderBlock, self).__init__()
        self.norm1 = keras.layers.LayerNormalization(axis=-1)
        self.attention_layer = SelfAttention(embed_size, heads)
        self.transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
        self.dropout = keras.layers.Dropout(dropout)

    def call(self, x, value, key, src_mask, trg_mask):
        attention = self.attention_layer(x, x, x, trg_mask)
        query = self.dropout(self.norm1(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out

class Decoder(keras.Model):
    def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_length):
        super(Decoder, self).__init__()
        self.word_embedding = keras.layers.Embedding(input_dim=trg_vocab_size, output_dim=embed_size)
        self.position_embedding = keras.layers.Embedding(input_dim=max_length, output_dim=embed_size)
        self.decoder_layers = [DecoderBlock(embed_size, heads, forward_expansion, dropout) for _ in range(num_layers)]
        self.fc_out = keras.layers.Dense(trg_vocab_size)
        self.dropout = keras.layers.Dropout(dropout)

    def call(self, x, enc_out, src_mask, trg_mask):
        N = tf.shape(x)[0]
        seq_length = tf.shape(x)[1]

        # Generate positions
        positions = tf.range(start=0, limit=seq_length)
        positions = tf.expand_dims(positions, axis=0)
        positions = tf.tile(positions, [N, 1])

        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        for layer in self.decoder_layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)
        
        out = self.fc_out(x)
        return out

class Transformer(keras.Model):
    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=128,
                 num_layers=6, forward_expansion=4, heads=8, dropout=0, max_length=500):
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_length)
        self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_length)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def make_src_mask(self, source):
        src_mask = tf.cast(tf.math.not_equal(source, self.src_pad_idx), tf.float32)
        src_mask = tf.expand_dims(tf.expand_dims(src_mask, axis=1), axis=2) # (batch_size, 1, 1, src_len)
        return src_mask
    
    def make_trg_mask(self, target):
        N = tf.shape(target)[0]
        target_len = tf.shape(target)[1]

        mask = tf.linalg.band_part(input = tf.ones((target_len, target_len)), num_lower = -1, num_upper = 0)    ### Lower triangular matrix
        trg_mask = tf.tile(input= tf.expand_dims(input= mask, axis= 0), multiples= [N, 1, 1])       ### (batch_size, trg_len, trg_len)
        return trg_mask

    def call(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

    def train_step(self, data):
        # Unpack the data
        (src, trg_input), trg_real = data
        
        # Forward pass
        with tf.GradientTape() as tape:
            # predictions = self(src, trg_input)  # Model call with src and trg_input
            predictions = self.call(src, trg_input)  # Model call with src and trg_input
            loss = self.compiled_loss(trg_real, predictions)  # Compute loss
        
        # Backward pass and apply gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Return metrics
        return {"loss": loss}

In [4]:
# Constants
src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 10
trg_vocab_size = 10
embed_size = 128
num_layers = 2
heads = 2
forward_expansion = 4
dropout = 0.1
max_length = 9  # Length of your sequences

# Sample training data
x = tf.constant([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]], dtype= tf.float32)
trg = tf.constant([[1, 7, 4, 3, 5, 9, 2, 0, 0], [1, 5, 6, 2, 4, 7, 6, 2, 0]], dtype= tf.float32)

# Initialize the Transformer model
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, 
                    embed_size=embed_size, num_layers=num_layers, heads=heads, 
                    forward_expansion=forward_expansion, dropout=dropout, max_length=max_length)

In [5]:
trg_input = tf.cast(trg[:, :-1], dtype=tf.float32)  # Cast to float32
trg_real = tf.cast(trg[:, 1:], dtype=tf.float32)    # Cast to float32
x = tf.cast(x, dtype=tf.float32)                    # Cast source input to float32

In [6]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # Don't consider pad tokens in the loss
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


In [7]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(embed_size)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)


In [8]:
model.compile(optimizer=optimizer, loss=loss_function)

In [9]:
# Define some basic training hyperparameters
epochs = 10
batch_size = 2

# Dummy masks for training (adjust if necessary for your real task)
def create_masks(src, trg):
    src_mask = model.make_src_mask(src)
    trg_mask = model.make_trg_mask(trg)
    return src_mask, trg_mask

# Create masks and use them in the dataset if necessary
src_mask = model.make_src_mask(x)
trg_mask = model.make_trg_mask(trg)

# # You can add a simple data pipeline if needed (e.g., batching, shuffling)
# train_data = tf.data.Dataset.from_tensor_slices((x, trg)).batch(batch_size)

# Prepare dataset for training
train_data = tf.data.Dataset.from_tensor_slices(((x, trg_input), trg_real)).batch(batch_size)

# Train the model using the fit method
# history = model.fit(src_mask, trg_mask, epochs=epochs)
history = model.fit(train_data, epochs=epochs)


Epoch 1/10




TypeError: Value passed to parameter 'x' has DataType int64 not in list of allowed values: bfloat16, float16, float32, float64, complex64, complex128

In [9]:
# # Training hyperparameters
# epochs = 10
# batch_size = 2

# # Create masks for source and target data
# def create_masks(src, trg):
#     src_mask = model.make_src_mask(src)
#     trg_mask = model.make_trg_mask(trg)
#     return src_mask, trg_mask

# # Training step function
# @tf.function
# def train_step(src, trg):
#     trg_input = trg[:, :-1]
#     trg_real = trg[:, 1:]

#     src_mask, trg_mask = create_masks(src, trg_input)

#     with tf.GradientTape() as tape:
#         predictions = model(src, trg_input)
#         loss = loss_function(trg_real, predictions)

#     gradients = tape.gradient(loss, model.trainable_variables)
#     optimizer.apply_gradients(zip(gradients, model.trainable_variables))

#     return loss

# # Training loop
# for epoch in range(epochs):
#     total_loss = 0

#     # Assume x and trg are your input and target batches
#     batch_count = tf.shape(x)[0] // batch_size
#     for i in range(batch_count):
#         src_batch = x[i:i + batch_size]
#         trg_batch = trg[i:i + batch_size]
#         batch_loss = train_step(src_batch, trg_batch)
#         total_loss += batch_loss

#     print(f'Epoch {epoch + 1}, Loss: {total_loss / batch_count}')
