In [4]:
import tensorflow as tf
from tensorflow import keras

In [7]:
from tensorflow.keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout
from multihead_attention import MultiHeadAttention
from positional_encoding import PositionEmbeddingFixedWeights

# Implementing the Add & Norm Layer
class AddNormalization(Layer):
    def __init__(self, **kwargs):
        super(AddNormalization, self).__init__(**kwargs)
        self.layer_norm = LayerNormalization()  # Layer normalization layer

    def call(self, x, sublayer_x):
        # The sublayer input and output need to be of the same shape to be summed
        add = x + sublayer_x

        # Apply layer normalization to the sum
        return self.layer_norm(add)

# Implementing the Feed-Forward Layer
class FeedForward(Layer):
    def __init__(self, d_ff, d_model, **kwargs):
        super(FeedForward, self).__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff)  # First fully connected layer
        self.fully_connected2 = Dense(d_model)  # Second fully connected layer
        self.activation = ReLU()  # ReLU activation layer

    def call(self, x):
        # The input is passed into the two fully-connected layers, with a ReLU in between
        x_fc1 = self.fully_connected1(x)

        return self.fully_connected2(self.activation(x_fc1))

# Implementing the Encoder Layer
class EncoderLayer(Layer):
    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        self.multihead_attention = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()

    def call(self, x, padding_mask, training):
        # Multi-head attention layer
        multihead_output = self.multihead_attention(x, x, x, padding_mask)
        # Expected output shape = (batch_size, sequence_length, d_model)

        # Add in a dropout layer
        multihead_output = self.dropout1(multihead_output, training=training)

        # Followed by an Add & Norm layer
        addnorm_output = self.add_norm1(x, multihead_output)
        # Expected output shape = (batch_size, sequence_length, d_model)

        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output)
        # Expected output shape = (batch_size, sequence_length, d_model)

        # Add in another dropout layer
        feedforward_output = self.dropout2(feedforward_output, training=training)

        # Followed by another Add & Norm layer
        return self.add_norm2(addnorm_output, feedforward_output)

# Implementing the Encoder
class Encoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
        self.dropout = Dropout(rate)
        self.encoder_layer = [EncoderLayer(h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]

    def call(self, input_sentence, padding_mask, training):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(input_sentence)
        # Expected output shape = (batch_size, sequence_length, d_model)

        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)

        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.encoder_layer):
            x = layer(x, padding_mask, training)

        return x

ModuleNotFoundError: No module named 'multihead_attention'

In [8]:
!pip install multihead_attention

ERROR: Could not find a version that satisfies the requirement multihead_attention (from versions: none)
ERROR: No matching distribution found for multihead_attention


In [11]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Define hyperparameters
input_size = 128 # Number of unique notes in the music
hidden_size = 256 # Dimensionality of the hidden layer
output_size = 128 # Number of possible next notes to choose from
num_layers = 4 # Number of layers in the transformer encoder
num_heads = 8 # Number of attention heads in each layer
dropout = 0.2 # Dropout probability for regularization
batch_size = 64 # Number of examples to process in each training step
num_epochs = 10 # Number of times to iterate over the dataset during training

# Define the transformer model
class TransformerModel(tf.keras.Model):
    def __init__(self, input_size, hidden_size, output_size, num_layers, num_heads, dropout):
        super(TransformerModel, self).__init__()
        self.embedding = layers.Embedding(input_size, hidden_size)
        self.transformer_encoder = layers.TransformerEncoderLayer(
            num_layers=num_layers,
            d_model=hidden_size,
            num_heads=num_heads,
            dropout=dropout,
            activation='relu'
        )
        self.fc = layers.Dense(output_size)
        
    def call(self, x):
        embedded = self.embedding(x)
        output = self.transformer_encoder(embedded)
        output = self.fc(output)
        return output

# Load the music data
data = [] # Replace this with your actual music data

# Convert the music data to sequences of indices
max_sequence_length = 100 # Maximum length of the input sequence
note_to_idx = {} # Mapping from notes to indices
idx_to_note = {} # Mapping from indices to notes
for sequence in data:
    for note in sequence:
        if note not in note_to_idx:
            idx = len(note_to_idx)
            note_to_idx[note] = idx
            idx_to_note[idx] = note
sequences = []
for sequence in data:
    sequence_indices = [note_to_idx[note] for note in sequence]
    if len(sequence_indices) > max_sequence_length:
        sequence_indices = sequence_indices[:max_sequence_length]
    sequences.append(sequence_indices)

# Split the sequences into inputs and targets
inputs = [sequence[:-1] for sequence in sequences]
targets = [sequence[1:] for sequence in sequences]

# Pad the inputs and targets to the same length
max_sequence_length = max(len(sequence) for sequence in sequences)
inputs = pad_sequences(inputs, maxlen=max_sequence_length, padding='post')
targets = pad_sequences(targets, maxlen=max_sequence_length, padding='post')

# Create a TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(batch_size)

# Create the transformer model and optimizer
model = TransformerModel(input_size, hidden_size, output_size, num_layers, num_heads, dropout)
optimizer = tf.keras.optimizers.Adam()

# Define the loss function
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Train the model
for epoch in range(num_epochs):
    for step, (x, y) in enumerate(dataset):
        with tf.GradientTape() as tape:
            logits = model(x)
            loss = loss_fn(y, logits)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
        if step % 100 == 0:
            print('Epoch: %d, Step: %d, Loss: %.4f' % (epoch, step, loss.numpy()))

# Generate new music using the


ValueError: max() arg is an empty sequence

In [None]:
# Generate new music using the transformer model
import numpy as np

# Start with a seed sequence of notes
seed_sequence = [0, 2, 4, 5, 7, 9, 11, 12] # Replace this with your own seed sequence
sequence = np.array(seed_sequence)

# Generate new notes by repeatedly predicting the next note and adding it to the sequence
num_notes_to_generate = 100 # Number of notes to generate
for i in range(num_notes_to_generate):
    # Reshape the sequence to have a batch size of 1 and a sequence length of its current length
    input_sequence = sequence.reshape(1, -1)
    
    # Predict the logits for the next note
    logits = model(input_sequence)
    logits = logits[:, -1, :] # Select the logits for the last note in the sequence
    
    # Sample the index of the next note from the logits using a temperature of 1.0
    next_note_index = tf.random.categorical(logits / 1.0, num_samples=1)
    next_note_index = int(next_note_index.numpy())
    
    # Add the next note to the sequence
    sequence = np.append(sequence, next_note_index)

# Convert the sequence of note indices back to a sequence of notes
generated_notes = [idx_to_note[idx] for idx in sequence]


In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, emded_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=emded_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=emded_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


class TransformerModel(keras.Model):
    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, ff_dim, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()

        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.attention_layers = [layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) for _ in range(num_layers)]
        self.dropout_layers = [layers.Dropout(dropout) for _ in range(num_layers)]
        self.dense_layers = [layers.Dense(ff_dim, activation='relu') for _ in range(num_layers)]
        self.encoder = layers.Sequential([
            layer for i in range(num_layers)
            for layer in [self.attention_layers[i], self.dropout_layers[i], self.dense_layers[i], self.dropout_layers[i]]
        ])
        self.output_layer = layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        x = self.embedding_layer(inputs)
        x = self.encoder(x)
        x = self.output_layer(x)
        return x


# Load your dataset and prepare it for training

# Define the parameters for the model
maxlen = 100  # Maximum sequence length
vocab_size = 5000  # Vocabulary size
embed_dim = 256  # Embedding dimension
num_heads = 8  # Number of attention heads
ff_dim = 512  # Hidden layer size in feedforward network
num_layers = 4  # Number of encoder layers in the transformer model
dropout = 0.1  # Dropout rate

# Create an instance of the model and compile it
model = TransformerModel(maxlen, vocab_size, embed_dim, num_heads, ff_dim, num_layers, dropout)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_val, y_val))


AttributeError: module 'tensorflow.keras.layers' has no attribute 'Sequential'

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class MusicDataset(Dataset):
    def __init__(self, notes, sequence_length):
        self.sequence_length = sequence_length
        self.notes = notes
        self.note_to_int = {note: i for i, note in enumerate(set(notes))}
        self.int_to_note = {i: note for note, i in self.note_to_int.items()}
        self.inputs, self.targets = self.create_sequences()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

    def create_sequences(self):
        inputs = []
        targets = []
        for i in range(len(self.notes) - self.sequence_length):
            inputs.append([self.note_to_int[note] for note in self.notes[i:i+self.sequence_length]])
            targets.append(self.note_to_int[self.notes[i+self.sequence_length]])
        return torch.tensor(inputs), torch.tensor(targets)

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout),
            num_layers)
        self.decoder = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x)
        return x

notes = ['C', 'D', 'E', 'F', 'G', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'A', 'B', 'C']
sequence_length = 4
batch_size = 2
epochs = 100
lr = 0.001
d_model = 128
nhead = 4
num_layers = 4
dim_feedforward = 256
dropout = 0.2

dataset = MusicDataset(notes, sequence_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = TransformerModel(len(dataset.note_to_int), d_model, nhead, num_layers, dim_feedforward, dropout)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)



In [18]:
from torch.utils.data import DataLoader

batch_size = 2
dataset = YourDataset()  # replace with your dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for input_batch, target_batch in dataloader:
    # train your model on the input_batch and target_batch
    for epoch in range(epochs):
        running_loss = 0.0
        for i, data in enumerate(dataloader, 0):
            inputs, targets = data
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs.view(-1, len(dataset.note_to_int)), targets.view(-1))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch {epoch + 1} loss: {running_loss / len(dataloader)}")

NameError: name 'YourDataset' is not defined

In [15]:
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        inputs, targets = data
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs.view(-1, len(dataset.note_to_int)), targets.view(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f"Epoch {epoch + 1} loss: {running_loss / len(dataloader)}")

# Generate a sequence of notes
model.eval()
start_sequence = ['C', 'D', 'E', 'F']
input_sequence = torch.tensor([[dataset.note_to_int[note] for note in start_sequence]])
for i in range(10):
    output = model(input_sequence)
    predicted_note_index = torch.argmax(output[:, -1, :], dim=1)
    predicted_note = dataset.int_to_note[predicted_note_index.item()]
    print(predicted_note)
    input_sequence = torch.cat([input_sequence[:, 1:], predicted_note_index.unsqueeze(0)], dim=1)


ValueError: Expected input batch_size (8) to match target batch_size (2).

In [20]:
import tensorflow as tf
import numpy as np

# Constants
NUM_NOTES = 128
MAX_SEQ_LEN = 100
EMB_DIM = 128
HIDDEN_DIM = 256
NUM_HEADS = 8
NUM_BLOCKS = 4
DROPOUT_RATE = 0.1
LEARNING_RATE = 0.001

# Load the data
#x_train = np.load("x_train.npy")
#y_train = np.load("y_train.npy")

# Define the transformer model
inputs = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32)
emb = tf.keras.layers.Embedding(input_dim=NUM_NOTES, output_dim=EMB_DIM)(inputs)
pos_enc = tf.keras.layers.Lambda(lambda x: x * tf.math.sqrt(tf.cast(EMB_DIM, tf.float32)))(emb)
pos_enc = tf.keras.layers.Lambda(lambda x: x + tf.expand_dims(tf.range(MAX_SEQ_LEN, dtype=tf.float32), axis=0))(pos_enc)
attention = tf.keras.layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMB_DIM)(pos_enc, pos_enc)
attention = tf.keras.layers.Dropout(DROPOUT_RATE)(attention)
attention = tf.keras.layers.LayerNormalization()(emb + attention)
ff = tf.keras.layers.Dense(units=HIDDEN_DIM, activation="relu")(attention)
ff = tf.keras.layers.Dense(units=EMB_DIM)(ff)
ff = tf.keras.layers.Dropout(DROPOUT_RATE)(ff)
ff = tf.keras.layers.LayerNormalization()(attention + ff)

for i in range(NUM_BLOCKS):
    attention = tf.keras.layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMB_DIM)(ff, ff)
    attention = tf.keras.layers.Dropout(DROPOUT_RATE)(attention)
    attention = tf.keras.layers.LayerNormalization()(ff + attention)
    ff = tf.keras.layers.Dense(units=HIDDEN_DIM, activation="relu")(attention)
    ff = tf.keras.layers.Dense(units=EMB_DIM)(ff)
    ff = tf.keras.layers.Dropout(DROPOUT_RATE)(ff)
    ff = tf.keras.layers.LayerNormalization()(attention + ff)

outputs = tf.keras.layers.Dense(units=NUM_NOTES, activation="softmax")(ff)
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Train the model
model.fit(x_train, y_train, batch_size=32, epochs=10)


ValueError: Exception encountered when calling layer "lambda_1" (type Lambda).

Dimensions must be equal, but are 128 and 100 for '{{node lambda_1/add}} = AddV2[T=DT_FLOAT](Placeholder, lambda_1/ExpandDims)' with input shapes: [?,100,128], [1,100].

Call arguments received by layer "lambda_1" (type Lambda):
  • inputs=tf.Tensor(shape=(None, 100, 128), dtype=float32)
  • mask=None
  • training=None

In [27]:
from tensorflow.keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout
from tensorflow.keras.layers import MultiHeadAttention, Embedding
#from positional_encoding import PositionEmbeddingFixedWeights

# Implementing the Add & Norm Layer
class AddNormalization(Layer):
    def __init__(self, **kwargs):
        super(AddNormalization, self).__init__(**kwargs)
        self.layer_norm = LayerNormalization()  # Layer normalization layer

    def call(self, x, sublayer_x):
        # The sublayer input and output need to be of the same shape to be summed
        add = x + sublayer_x

        # Apply layer normalization to the sum
        return self.layer_norm(add)

# Implementing the Feed-Forward Layer
class FeedForward(Layer):
    def __init__(self, d_ff, d_model, **kwargs):
        super(FeedForward, self).__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff)  # First fully connected layer
        self.fully_connected2 = Dense(d_model)  # Second fully connected layer
        self.activation = ReLU()  # ReLU activation layer

    def call(self, x):
        # The input is passed into the two fully-connected layers, with a ReLU in between
        x_fc1 = self.fully_connected1(x)

        return self.fully_connected2(self.activation(x_fc1))

# Implementing the Encoder Layer
class EncoderLayer(Layer):
    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        self.multihead_attention = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()

    def call(self, x, padding_mask, training):
        # Multi-head attention layer
        multihead_output = self.multihead_attention(x, x, x, padding_mask)
        # Expected output shape = (batch_size, sequence_length, d_model)

        # Add in a dropout layer
        multihead_output = self.dropout1(multihead_output, training=training)

        # Followed by an Add & Norm layer
        addnorm_output = self.add_norm1(x, multihead_output)
        # Expected output shape = (batch_size, sequence_length, d_model)

        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output)
        # Expected output shape = (batch_size, sequence_length, d_model)

        # Add in another dropout layer
        feedforward_output = self.dropout2(feedforward_output, training=training)

        # Followed by another Add & Norm layer
        return self.add_norm2(addnorm_output, feedforward_output)

# Implementing the Encoder
class Encoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.pos_encoding = Embedding(sequence_length, vocab_size)
        self.dropout = Dropout(rate)
        self.encoder_layer = [EncoderLayer(h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]

    def call(self, input_sentence, padding_mask, training):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(input_sentence)
        # Expected output shape = (batch_size, sequence_length, d_model)

        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)

        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.encoder_layer):
            x = layer(x, padding_mask, training)

        return x

In [31]:
from numpy import random

enc_vocab_size = 20 # Vocabulary size for the encoder
input_seq_length = 5  # Maximum length of the input sequence
h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_ff = 2048  # Dimensionality of the inner fully connected layer
d_model = 512  # Dimensionality of the model sub-layers' outputs
n = 6  # Number of layers in the encoder stack

batch_size = 64  # Batch size from the training process
dropout_rate = 0.1  # Frequency of dropping the input units in the dropout layers

input_seq = random.random((batch_size, input_seq_length))

encoder = Encoder(enc_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)
print(encoder(input_seq, None, True))

ValueError: Exception encountered when calling layer 'multi_head_attention_22' (type MultiHeadAttention).

Invalid value 512 received for `rate`, expected a value between 0 and 1.

Call arguments received by layer 'multi_head_attention_22' (type MultiHeadAttention):
  • query=tf.Tensor(shape=(64, 5, 20), dtype=float32)
  • value=tf.Tensor(shape=(64, 5, 20), dtype=float32)
  • key=tf.Tensor(shape=(64, 5, 20), dtype=float32)
  • attention_mask=None
  • return_attention_scores=False
  • training=True
  • use_causal_mask=False

In [32]:
#now main

In [None]:
import tensorflow as tf

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        assert d_model % self.num_heads == 0
        
        self.depth = d_model // self.num_heads
        
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        
        self.dense = tf.keras.layers.Dense(d_model)
    
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        
        output = self.dense(concat_attention)
        
        return output, attention_weights

def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    
    return output, attention_weights

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()
        
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
    
    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        
        return out2

    
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
        
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        
        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        
        # adding embedding and position encoding.
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        x = self.dropout(x, training=training)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
        
        return x
                 