### Assignment 4 - Encoder-Decoder Transformer for Name Gender Classification

In [1]:
import tensorflow as tf
import nltk
nltk.download('names')
from nltk.corpus import names
import tensorflow as tf
from tensorflow.keras.layers import Layer, MultiHeadAttention, Dense, Embedding, LayerNormalization, Dropout
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K


2024-04-24 13:01:02.100861: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-24 13:01:02.897777: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package names to /home/namrata/nltk_data...
[nltk_data]   Package names is already up-to-date!


In [2]:
# Load the male / female word lists from the NLTK "names" corpus reader
male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Label data: lower-cased name paired with 1 (male) or 0 (female)
data = [(name.lower(), 1) for name in male_names] + [(name.lower(), 0) for name in female_names]

# Shuffle data in place so the two classes are interleaved
import random
random.shuffle(data)

# Split into names and labels.
# The unpacked tuple is deliberately NOT bound to `names`: rebinding that
# identifier would shadow the `nltk.corpus.names` reader used at the top of this
# cell, so re-running the cell would fail on `names.words(...)`.
name_texts, labels = zip(*data)
# One-hot encode the 0/1 labels into two columns
labels = tf.keras.utils.to_categorical(labels, num_classes=2)

# Tokenization and Padding (character-level: one token per character)
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(name_texts)
seqs = tokenizer.texts_to_sequences(name_texts)
# Longest encoded name determines the padded sequence length
max_len = max(len(seq) for seq in seqs)
names_padded = tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=max_len, padding='post')
# +1 on top of the number of distinct tokens in `word_index`
# (presumably to leave room for the 0 value used by padding — confirm against the Tokenizer docs)
vocab_size = len(tokenizer.word_index) + 1

In [3]:
class TransformerEncoder(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        embed_dim: Size of the last axis of the input; also used as ``key_dim``
            (per-head size) of the attention layer and as the output width of
            the feed-forward network.
        dff: Number of units in the hidden layer of the feed-forward network.
        num_heads: Number of attention heads.
        dropout_rate: Rate shared by every ``Dropout`` in this block.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, dff, num_heads, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Multi-head attention with 'num_heads' and dimension per head 'embed_dim'
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Feedforward network consists of two dense layers with a dropout in between
        self.dense_proj = tf.keras.Sequential([
            Dense(dff, activation="relu"),  # First dense layer with 'dff' units
            Dropout(dropout_rate),          # Dropout layer for regularization
            Dense(embed_dim)                # Second dense layer to bring dimension back to 'embed_dim'
        ])
        # One layer normalization per residual connection
        self.layernorm1 = LayerNormalization()
        self.layernorm2 = LayerNormalization()
        # A single Dropout instance, applied after both sub-layers
        self.dropout = Dropout(dropout_rate)
        self.supports_masking = True

    def call(self, inputs, training=False, mask=None):
        """Run the encoder block.

        Args:
            inputs: Tensor to encode (assumed ``(batch, seq_len, embed_dim)`` —
                the residual additions require the last axis to equal ``embed_dim``).
            training: Whether dropout is active.
            mask: Optional mask. A rank-2 mask (e.g. a ``(batch, seq_len)``
                padding mask) is expanded to ``(batch, 1, seq_len)`` so that it
                broadcasts over the query axis; a mask that already has three
                axes is used unchanged.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # MultiHeadAttention expects attention_mask as (batch, target_len, source_len).
        # Add the missing query axis to a per-token mask instead of passing it as-is.
        if mask is not None and len(mask.shape) == 2:
            mask = mask[:, tf.newaxis, :]
        # Self-attention: the sequence attends to itself
        attention_output = self.attention(inputs, inputs, attention_mask=mask, training=training)
        # Apply dropout to attention output
        attention_output = self.dropout(attention_output, training=training)
        # Add & norm layer after adding residual connection
        proj_input = self.layernorm1(inputs + attention_output)
        # Pass through the feedforward network (training flag drives its inner Dropout)
        proj_output = self.dense_proj(proj_input, training=training)
        # Apply dropout to the output of the feedforward network
        proj_output = self.dropout(proj_output, training=training)
        # Second add & norm layer after adding residual connection
        return self.layernorm2(proj_input + proj_output)


In [4]:
class TransformerDecoder(Layer):
    """Transformer decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each of the three sub-layers is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    No causal (look-ahead) mask is built here; only the ``mask`` handed to
    ``call`` is applied.

    Args:
        embed_dim: Size of the last axis of the inputs; also used as ``key_dim``
            (per-head size) of both attention layers and as the output width of
            the feed-forward network.
        dff: Number of units in the hidden layer of the feed-forward network.
        num_heads: Number of attention heads in each attention layer.
        dropout_rate: Rate shared by every ``Dropout`` in this block.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, dff, num_heads, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # First multi-head attention layer (self attention)
        self.attention1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Second multi-head attention layer (encoder-decoder attention)
        self.attention2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Feedforward network applied at the decoder stage
        self.dense_proj = tf.keras.Sequential([
            Dense(dff, activation="relu"),  # Dense layer with 'dff' units
            Dropout(dropout_rate),          # Dropout for regularization
            Dense(embed_dim)                # Dense layer to project back to embedding dimension
        ])
        # One layer normalization per residual connection
        self.layernorm1 = LayerNormalization()
        self.layernorm2 = LayerNormalization()
        self.layernorm3 = LayerNormalization()
        # A single Dropout instance, applied after all three sub-layers
        self.dropout = Dropout(dropout_rate)
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, training=False, mask=None):
        """Run the decoder block.

        Args:
            inputs: Decoder-side tensor (assumed ``(batch, seq_len, embed_dim)`` —
                the residual additions require the last axis to equal ``embed_dim``).
            encoder_outputs: Tensor used as the value (and, by default, key) of
                the encoder-decoder attention.
            training: Whether dropout is active.
            mask: Optional mask applied to BOTH attention layers. A rank-2 mask
                (e.g. a ``(batch, seq_len)`` padding mask) is expanded to
                ``(batch, 1, seq_len)``; a rank-3 mask is used unchanged.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # MultiHeadAttention expects attention_mask as (batch, target_len, source_len).
        # Add the missing query axis to a per-token mask instead of passing it as-is.
        if mask is not None and len(mask.shape) == 2:
            mask = mask[:, tf.newaxis, :]
        # Self-attention on decoder inputs
        attention_output1 = self.attention1(inputs, inputs, attention_mask=mask, training=training)
        # Apply dropout after self-attention
        attention_output1 = self.dropout(attention_output1, training=training)
        # First add & norm step
        attention_input1 = self.layernorm1(inputs + attention_output1)
        # Encoder-decoder attention, using output from the encoder as key and value.
        # NOTE(review): the same mask is reused here, which is only meaningful when
        # `encoder_outputs` has the same sequence length / padding as `inputs` — confirm.
        attention_output2 = self.attention2(
            attention_input1, encoder_outputs, attention_mask=mask, training=training
        )
        # Apply dropout after encoder-decoder attention
        attention_output2 = self.dropout(attention_output2, training=training)
        # Second add & norm step
        attention_input2 = self.layernorm2(attention_input1 + attention_output2)
        # Feedforward network (training flag drives its inner Dropout)
        proj_output = self.dense_proj(attention_input2, training=training)
        # Apply dropout to the output of feedforward network
        proj_output = self.dropout(proj_output, training=training)
        # Final add & norm step
        return self.layernorm3(attention_input2 + proj_output)


In [5]:
class TransformerModel:
    """Thin wrapper that builds and drives a Keras encoder-decoder classifier.

    The underlying ``tf.keras.Model`` is created in the constructor and stored
    on ``self.model``; ``compile``, ``fit`` and ``predict`` delegate to it.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        max_len: Length of every (padded) input sequence.
        embed_dim: Embedding width, shared with the encoder and decoder blocks.
        dff: Hidden width of the feed-forward networks inside the blocks.
        num_heads: Number of attention heads inside the blocks.
        num_classes: Number of units in the softmax output layer.
        dropout_rate: Dropout rate passed to the encoder and decoder blocks.
    """

    def __init__(self, vocab_size, max_len, embed_dim=32, dff=64, num_heads=2, num_classes=2, dropout_rate=0.1):
        # Keep every hyperparameter on the instance; build_model() reads them from here
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.dff = dff
        self.num_heads = num_heads
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.model = self.build_model()

    def build_model(self):
        """Assemble the functional Keras model and return it (uncompiled)."""
        inputs = tf.keras.Input(shape=(self.max_len,))
        x = Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim)(inputs)
        x = TransformerEncoder(self.embed_dim, self.dff, self.num_heads, self.dropout_rate)(x) # setup Encoder
        # The encoder output is fed to the decoder both as its input and as `encoder_outputs`
        x = TransformerDecoder(self.embed_dim, self.dff, self.num_heads, self.dropout_rate)(x, x) # Setup Decoder
        # Classify from the representation at sequence position 0 only
        x = Dense(self.num_classes, activation="softmax")(x[:, 0, :])
        model = Model(inputs, x)
        return model

    def compile(self, optimizer="adam", loss="categorical_crossentropy", metrics=("accuracy",)):
        """Compile the wrapped model.

        The default for ``metrics`` is an immutable tuple rather than a list so
        no mutable object is shared between calls; a tuple is converted to a
        fresh list before being handed to Keras. Any other value (list, string,
        ``None``) is passed through unchanged.
        """
        if isinstance(metrics, tuple):
            metrics = list(metrics)
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    def fit(self, *args, **kwargs):
        """Delegate to ``self.model.fit`` and return its result."""
        return self.model.fit(*args, **kwargs)

    def predict(self, *args, **kwargs):
        """Delegate to ``self.model.predict`` and return its result."""
        return self.model.predict(*args, **kwargs)


In [6]:
# Build the classifier from the vocabulary size and padded length computed during
# preprocessing; every other hyperparameter keeps its constructor default
transformer = TransformerModel(vocab_size, max_len)

# Compile with the wrapper's default optimizer, loss and metrics
transformer.compile()


2024-04-24 13:02:53.470493: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-24 13:02:53.522220: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [8]:
def train_model(model, names, labels, epochs=1, batch_size=32):
    """Train ``model`` on ``(names, labels)`` and print the final training metrics.

    Args:
        model: Object exposing a Keras-style ``fit(dataset, epochs=...)`` that
            returns a history whose ``.history`` dict has ``'loss'`` and
            ``'accuracy'`` entries.
        names: Encoded and padded input sequences; must support ``len()`` and
            be sliceable along its first axis.
        labels: Targets aligned one-to-one with ``names``.
        epochs: Number of passes over the dataset.
        batch_size: Number of examples per batch.
    """
    # Build the dataset from the arguments (not from notebook globals) so the
    # function trains on exactly what the caller passed in
    dataset = tf.data.Dataset.from_tensor_slices((names, labels))
    # A buffer as large as the dataset shuffles across all examples
    dataset = dataset.shuffle(buffer_size=len(names)).batch(batch_size)

    # Use the fit method of the model for training which handles batches and losses internally
    history = model.fit(dataset, epochs=epochs)

    loss = history.history['loss'][-1]  # Loss recorded for the last epoch
    accuracy = history.history['accuracy'][-1]  # Accuracy recorded for the last epoch

    print("-"*100)
    # These values come from the training history; no held-out set is evaluated here
    print(f"Loss : {loss:.4f} , Train Accuracy: {accuracy * 100:.2f}%")

# Kick off training on the padded name sequences and their one-hot labels
train_model(
    model=transformer,
    names=names_padded,
    labels=labels,
    epochs=10,
    batch_size=32,
)


Epoch 1/10
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 46ms/step - accuracy: 0.7793 - loss: 0.4744
Epoch 2/10
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 47ms/step - accuracy: 0.7787 - loss: 0.4700
Epoch 3/10
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 47ms/step - accuracy: 0.7802 - loss: 0.4639
Epoch 4/10
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 46ms/step - accuracy: 0.7773 - loss: 0.4658
Epoch 5/10
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 48ms/step - accuracy: 0.7837 - loss: 0.4629
Epoch 6/10
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 48ms/step - accuracy: 0.7860 - loss: 0.4579
Epoch 7/10
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 47ms/step - accuracy: 0.7765 - loss: 0.4647
Epoch 8/10
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 47ms/step - accuracy: 0.7829 - loss: 0.4611
Epoch 9/10
[1m249/249[

In [9]:
# Unseen names to classify
new_names = ["Charlie", "Martina", "John", "Elizabeth","Rita","Harry"]

# Encode with the tokenizer fitted on the training names, then pad to the
# same length the model was built for
new_seqs = tokenizer.texts_to_sequences(new_names)
new_names_padded = tf.keras.preprocessing.sequence.pad_sequences(
    new_seqs,
    maxlen=max_len,
    padding='post',
)


In [10]:
# Run the trained model on the padded new names; one row of class probabilities per name
predictions = transformer.predict(new_names_padded)

# Show the raw probability matrix
print(predictions)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 854ms/step
[[0.8492838  0.15071622]
 [0.7928341  0.20716597]
 [0.16434601 0.83565396]
 [0.7242352  0.27576482]
 [0.91736984 0.08263018]
 [0.34865716 0.65134287]]


In [11]:
# Index 0 -> "Female", index 1 -> "Male" (matches the 0/1 labels used for training)
class_names = ["Female", "Male"]
# Most probable class index for each row of `predictions`
predicted_labels = tf.argmax(predictions, axis=1).numpy()

# Print predicted class names:
for position, name in enumerate(new_names):
    label = predicted_labels[position]
    print(f"Name: {name}, Predicted Gender: {class_names[label]}")


Name: Charlie, Predicted Gender: Female
Name: Martina, Predicted Gender: Female
Name: John, Predicted Gender: Male
Name: Elizabeth, Predicted Gender: Female
Name: Rita, Predicted Gender: Female
Name: Harry, Predicted Gender: Male


##### References:


https://kikaben.com/transformers-encoder-decoder/#google_vignette


https://machinelearningmastery.com/implementing-the-transformer-encoder-from-scratch-in-tensorflow-and-keras/