# Try a real Transformer encoder

In [1]:
import numpy as np
import pandas as pd

# Pre-extracted feature table.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so only load
# this file from a trusted source. The absolute path below is machine-specific.
FEATURE_FILE = "/home/jireh/MT/video_sync_v1/trash/featureData.npy"
trainDS = np.load(FEATURE_FILE, allow_pickle=True)

# Shape of the first feature matrix stored in the first record.
first_record = trainDS[0]
print(first_record[0].shape)

# Wrap the array in a DataFrame purely to get a readable preview of the rows.
df = pd.DataFrame(trainDS)
df.head()

(20, 48)


Unnamed: 0,0,1,2,3,4
0,"[[0.0, 0.0, 0.0, 0.69037163, 0.0, 0.0, 0.0, 0....","[[0.0, 0.0, 0.0, 1.0563793, 0.0, 0.0, 0.0, 0.8...",1963,1983,20
1,"[[0.0, 0.0, 0.0, 0.8872391, 0.0, 0.0, 0.0, 1.2...","[[0.0, 0.0, 0.0, 0.37041047, 0.0, 0.0, 0.0, 0....",8545,8554,9
2,"[[0.0, 0.0, 0.0, 1.340517, 0.0, 0.0, 0.0, 0.64...","[[0.0, 0.0, 0.0, 1.8816519, 0.0, 0.0, 0.0, 0.4...",2322,2326,4
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3974726...","[[0.0, 0.0, 0.0, 0.16040325, 0.0, 0.0, 0.0, 0....",7385,7398,13
4,"[[0.0, 0.0, 0.0, 1.4631886, 0.0, 0.0, 0.0, 0.4...","[[0.0, 0.0, 0.0, 1.233583, 0.0, 0.0, 0.0, 0.16...",7953,7941,-12


In [29]:
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, optimizers, losses
import numpy as np

# Sine and cosine positional encoding function
def get_positional_encoding(max_seq_len, d_model):
    """Build the fixed sine/cosine positional-encoding table.

    Entry ``[pos, i]`` starts as ``pos / 10000 ** (2 * (i // 2) / d_model)``;
    even columns are then passed through ``sin`` and odd columns through ``cos``.

    Args:
        max_seq_len: Number of positions (one table row per position).
        d_model: Number of feature columns.

    Returns:
        The table converted to a ``tf.float32`` tensor.
    """
    # The divisor depends only on the column, so compute it once per column.
    divisors = [np.power(10000, 2 * (col // 2) / d_model) for col in range(d_model)]
    table = np.array([[row / div for div in divisors] for row in range(max_seq_len)])

    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns

    return tf.convert_to_tensor(table, dtype=tf.float32)

# MLP head definition
class MLPHead(tf.keras.layers.Layer):
    """Classification head: Dense(ReLU) -> Dropout -> Dense -> Softmax.

    Both dense layers carry an L2 kernel regularizer with factor 0.01.

    Args:
        hidden_dim: Width of the hidden dense layer.
        num_classes: Number of output units (one per class).
        dropout_rate: Drop rate applied between the two dense layers.
    """

    def __init__(self, hidden_dim, num_classes, dropout_rate=0.1):
        super().__init__()
        self.dense1 = layers.Dense(
            hidden_dim,
            activation='relu',
            kernel_regularizer=regularizers.l2(0.01),
        )
        self.dropout = layers.Dropout(dropout_rate)
        self.dense2 = layers.Dense(
            num_classes,
            kernel_regularizer=regularizers.l2(0.01),
        )
        self.softmax = layers.Softmax()

    def call(self, inputs, training=False):
        """Return softmax class probabilities; dropout is only active when ``training`` is true."""
        hidden = self.dropout(self.dense1(inputs), training=training)
        return self.softmax(self.dense2(hidden))

# Transformer model definition
class TransformerModel(tf.keras.Model):
    """Cross-attention classifier built around a learnable class token.

    The class token is prepended to the query sequence. Each layer then runs
    multi-head attention (queries from the token sequence, keys/values from the
    context), a point-wise feed-forward network and dropout. This variant has
    no residual connections or normalization. The first output token is
    classified by an ``MLPHead``.

    Args:
        num_heads: Attention heads per layer.
        dff: Hidden width of each feed-forward network.
        num_layers: Number of attention + feed-forward layers.
        hidden_dim: Hidden width of the MLP head.
        num_classes: Number of output classes.
        max_seq_len: Number of rows in the positional-encoding table.
        rate: Dropout rate for the per-layer dropout and the MLP head.
        d_model: Feature width of the inputs and of the class token
            (previously hard-coded to 48).
    """

    def __init__(self, num_heads, dff, num_layers, hidden_dim, num_classes, max_seq_len=20, rate=0.1, d_model=48):
        super(TransformerModel, self).__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        # Pass the name by keyword: the first positional parameter of add_weight
        # is `name` in Keras 2 but `shape` in Keras 3.
        self.class_token = self.add_weight(name="class_token", shape=[1, 1, d_model], initializer="random_normal")

        # Fixed (non-trainable) sine/cosine positional encoding table
        self.positional_encoding = get_positional_encoding(max_seq_len, d_model)

        self.encoder_layers = [layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model) for _ in range(num_layers)]
        self.ffn_layers = [self.point_wise_feed_forward_network(d_model, dff) for _ in range(num_layers)]
        self.dropout_layers = [layers.Dropout(rate) for _ in range(num_layers)]
        self.mlp_head = MLPHead(hidden_dim, num_classes, dropout_rate=rate)

    def point_wise_feed_forward_network(self, d_model, dff):
        """Return a two-layer network: Dense(dff, ReLU) followed by Dense(d_model)."""
        return tf.keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model)
        ])

    # Model forward pass
    def call(self, inputs, training=True):
        """Classify a (query, context) pair.

        Args:
            inputs: Mapping with keys ``'query'`` and ``'context'``.
                Assumed to be (batch, seq, d_model) tensors - TODO confirm.
            training: Forwarded to attention, dropout and the MLP head.
                NOTE(review): the default is True, so a direct call that omits
                the flag may run dropout in training mode; kept for
                backward compatibility.

        Returns:
            The MLP head's softmax output for the class-token position.
        """
        query = inputs['query']
        context = inputs['context']
        batch_size = tf.shape(query)[0]
        query_len = tf.shape(query)[1]
        # The context gets an encoding slice of its own length, so it no longer
        # has to match the query length.
        context_len = tf.shape(context)[1]

        # Add positional encoding to query and context
        query = query + self.positional_encoding[:query_len, :]
        context = context + self.positional_encoding[:context_len, :]

        # Replicate the learnable class token across the batch and prepend it
        class_token = tf.broadcast_to(self.class_token, [batch_size, 1, self.d_model])
        query_with_token = tf.concat([class_token, query], axis=1)

        for attention, ffn, dropout in zip(self.encoder_layers, self.ffn_layers, self.dropout_layers):
            # Token sequence attends to the context (context is both key and value)
            query_with_token = attention(query=query_with_token, value=context, key=context, training=training)
            query_with_token = ffn(query_with_token)
            query_with_token = dropout(query_with_token, training=training)

        # The first position corresponds to the class token
        class_token_output = query_with_token[:, 0, :]

        # Pass through the MLP head for final classification
        return self.mlp_head(class_token_output, training=training)


## With residual connections and layer normalization

In [16]:
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, optimizers, losses
import numpy as np

# Sine and cosine positional encoding function
def get_positional_encoding(max_seq_len, d_model):
    """Build the fixed sine/cosine positional-encoding table.

    Entry ``[pos, i]`` starts as ``pos / 10000 ** (2 * (i // 2) / d_model)``;
    even columns are then passed through ``sin`` and odd columns through ``cos``.

    Args:
        max_seq_len: Number of positions (one table row per position).
        d_model: Number of feature columns.

    Returns:
        The table converted to a ``tf.float32`` tensor.
    """
    # The divisor depends only on the column, so compute it once per column.
    divisors = [np.power(10000, 2 * (col // 2) / d_model) for col in range(d_model)]
    table = np.array([[row / div for div in divisors] for row in range(max_seq_len)])

    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns

    return tf.convert_to_tensor(table, dtype=tf.float32)

# MLP head definition
class MLPHead(tf.keras.layers.Layer):
    """Classification head: Dense(ReLU) -> Dropout -> Dense -> Softmax.

    Both dense layers carry an L2 kernel regularizer with factor 0.01.

    Args:
        hidden_dim: Width of the hidden dense layer.
        num_classes: Number of output units (one per class).
        dropout_rate: Drop rate applied between the two dense layers.
    """

    def __init__(self, hidden_dim, num_classes, dropout_rate=0.1):
        super().__init__()
        self.dense1 = layers.Dense(
            hidden_dim,
            activation='relu',
            kernel_regularizer=regularizers.l2(0.01),
        )
        self.dropout = layers.Dropout(dropout_rate)
        self.dense2 = layers.Dense(
            num_classes,
            kernel_regularizer=regularizers.l2(0.01),
        )
        self.softmax = layers.Softmax()

    def call(self, inputs, training=False):
        """Return softmax class probabilities; dropout is only active when ``training`` is true."""
        hidden = self.dropout(self.dense1(inputs), training=training)
        return self.softmax(self.dense2(hidden))

# Transformer model definition
class TransformerModel(tf.keras.Model):
    """Pre-norm cross-attention classifier built around a learnable class token.

    The class token is prepended to the query sequence. Each layer applies
    LayerNorm -> multi-head attention (keys/values from the context) with a
    residual add, then LayerNorm -> feed-forward with a residual add, then
    dropout on the result. The first output token is classified by an
    ``MLPHead``.

    Args:
        num_heads: Attention heads per layer.
        dff: Hidden width of each feed-forward network.
        num_layers: Number of attention + feed-forward layers.
        hidden_dim: Hidden width of the MLP head.
        num_classes: Number of output classes.
        max_seq_len: Number of rows in the positional-encoding table.
        rate: Dropout rate for the per-layer dropout and the MLP head.
        d_model: Feature width of the inputs and of the class token
            (previously hard-coded to 48).
    """

    def __init__(self, num_heads, dff, num_layers, hidden_dim, num_classes, max_seq_len=20, rate=0.1, d_model=48):
        super(TransformerModel, self).__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        # Pass the name by keyword: the first positional parameter of add_weight
        # is `name` in Keras 2 but `shape` in Keras 3.
        self.class_token = self.add_weight(name="class_token", shape=[1, 1, d_model], initializer="random_normal")

        # Fixed (non-trainable) sine/cosine positional encoding table
        self.positional_encoding = get_positional_encoding(max_seq_len, d_model)

        # Per-layer sublayers: attention, feed-forward, two layer norms, dropout
        self.encoder_layers = [layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model) for _ in range(num_layers)]
        self.ffn_layers = [self.point_wise_feed_forward_network(d_model, dff) for _ in range(num_layers)]
        self.norm_layers_1 = [layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.norm_layers_2 = [layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.dropout_layers = [layers.Dropout(rate) for _ in range(num_layers)]
        self.mlp_head = MLPHead(hidden_dim, num_classes, dropout_rate=rate)

    def point_wise_feed_forward_network(self, d_model, dff):
        """Return a two-layer network: Dense(dff, ReLU) followed by Dense(d_model)."""
        return tf.keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model)
        ])

    # Model forward pass
    def call(self, inputs, training=True):
        """Classify a (query, context) pair.

        Args:
            inputs: Mapping with keys ``'query'`` and ``'context'``.
                Assumed to be (batch, seq, d_model) tensors - TODO confirm.
            training: Forwarded to attention, dropout and the MLP head.
                NOTE(review): the default is True, so a direct call that omits
                the flag may run dropout in training mode; kept for
                backward compatibility.

        Returns:
            The MLP head's softmax output for the class-token position.
        """
        query = inputs['query']
        context = inputs['context']
        batch_size = tf.shape(query)[0]
        query_len = tf.shape(query)[1]
        # The context gets an encoding slice of its own length, so it no longer
        # has to match the query length.
        context_len = tf.shape(context)[1]

        # Add positional encoding to query and context
        query = query + self.positional_encoding[:query_len, :]
        context = context + self.positional_encoding[:context_len, :]

        # Replicate the learnable class token across the batch and prepend it
        class_token = tf.broadcast_to(self.class_token, [batch_size, 1, self.d_model])
        query_with_token = tf.concat([class_token, query], axis=1)

        for i in range(self.num_layers):
            # Pre-norm cross-attention sublayer with residual connection
            normed = self.norm_layers_1[i](query_with_token)
            attn_output = self.encoder_layers[i](query=normed, value=context, key=context, training=training)
            query_with_token = query_with_token + attn_output

            # Pre-norm feed-forward sublayer with residual connection
            normed = self.norm_layers_2[i](query_with_token)
            ffn_output = self.ffn_layers[i](normed)
            query_with_token = query_with_token + ffn_output

            # NOTE(review): dropout is applied to the whole residual stream here
            # rather than to each sublayer output; left as in the original.
            query_with_token = self.dropout_layers[i](query_with_token, training=training)

        # The first position corresponds to the class token
        class_token_output = query_with_token[:, 0, :]

        # Pass through the MLP head for final classification
        return self.mlp_head(class_token_output, training=training)


In [17]:
# Custom callback that prints predicted vs. true classes for a few validation samples once training ends
class PredictionCallback(tf.keras.callbacks.Callback):
    """Print predicted and true class indices for the first validation batch.

    Runs once, when training finishes (``on_train_end``), not after every epoch.

    Args:
        val_data: Dataset yielding ``(inputs, labels)`` batches; only the first
            batch is used. Labels are reduced with ``argmax`` over the last
            axis, so they are expected to be per-class score / one-hot vectors.
        num_samples: Maximum number of samples to report (default 20).
    """

    def __init__(self, val_data, num_samples=20):
        super(PredictionCallback, self).__init__()
        self.val_data = val_data
        self.num_samples = num_samples

    def on_train_end(self, logs=None):
        """Predict on the first validation batch and print class indices."""
        for inputs, true_labels in self.val_data.take(1):
            predictions = self.model.predict(inputs)

            # Slicing clamps to the batch size, so a batch with fewer than
            # `num_samples` rows is reported in full instead of raising.
            shown_pred = predictions[:self.num_samples]
            shown_true = true_labels[:self.num_samples]

            # Vectorised argmax; tolist() yields plain Python ints for printing
            pred_classes = tf.argmax(shown_pred, axis=-1).numpy().tolist()
            true_classes = tf.argmax(shown_true, axis=-1).numpy().tolist()

            print("Predicted classes:", pred_classes)
            print("True classes:", true_classes)

In [18]:
# Batch sizes used for the tf.data pipelines below
TRAIN_BATCH_SIZE = 256
VAL_BATCH_SIZE = 128

# Columns 0 and 1 hold one feature matrix per sample; stack them into dense float32 tensors.
left_features = tf.convert_to_tensor(np.array(trainDS[:, 0].tolist()), dtype=tf.float32)
right_features = tf.convert_to_tensor(np.array(trainDS[:, 1].tolist()), dtype=tf.float32)

# Column 4 is the integer label, one-hot encoded into 40 classes.
# NOTE(review): the data preview shows negative labels (e.g. -12) and a label of 20.
# They are passed to to_categorical unchanged, so negative values presumably wrap
# around as negative indices and -20 / +20 would share a class - confirm the
# intended label range and consider an explicit offset.
y = to_categorical(np.array(trainDS[:, 4].tolist()), num_classes=40)

size = int(len(y) * 0.8)  # 80% of the data for training
# NOTE(review): the split is positional (no shuffling before the split).

left_features_train, left_features_val = left_features[:size], left_features[size:]
right_features_train, right_features_val = right_features[:size], right_features[size:]
y_train, y_val = y[:size], y[size:]

num_train = len(left_features_train)
num_val = len(left_features_val)

train_dataset = tf.data.Dataset.from_tensor_slices(({'query': left_features_train, 'context': right_features_train}, y_train))
# The shuffle buffer covers the whole training split so every epoch is shuffled
# uniformly (a 1024-element buffer only shuffles locally within 8000 samples).
train_dataset = train_dataset.shuffle(buffer_size=max(num_train, 1)).batch(TRAIN_BATCH_SIZE).repeat()

val_dataset = tf.data.Dataset.from_tensor_slices(({'query': left_features_val, 'context': right_features_val}, y_val))
val_dataset = val_dataset.batch(VAL_BATCH_SIZE)

# At least one training step even if the split is smaller than one batch
train_steps_per_epoch = max(1, num_train // TRAIN_BATCH_SIZE)
# Ceiling division so the final partial validation batch is evaluated too
val_steps_per_epoch = -(-num_val // VAL_BATCH_SIZE)

print(f"Training dataset size: {len(left_features_train)} samples")
print(f"Validation dataset size: {len(left_features_val)} samples")

Training dataset size: 8000 samples
Validation dataset size: 2000 samples


In [19]:
# Build the model
hidden_dim = 512  # hidden width of the MLP head
transformer = TransformerModel(
    num_heads=4,
    dff=128,
    num_layers=2,
    hidden_dim=hidden_dim,
    num_classes=40,
)

# Adam with a fixed learning rate
optimizer = optimizers.Adam(learning_rate=1e-3)

# Categorical cross-entropy on the one-hot labels, tracking accuracy
transformer.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit with the prediction-printing callback attached
prediction_callback = PredictionCallback(val_dataset)
transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=30,
    steps_per_epoch=train_steps_per_epoch,
    validation_steps=val_steps_per_epoch,
    callbacks=[prediction_callback],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Predicted classes: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
True classes: [13, 23, 0, 3, 22, 28, 35, 26, 37, 39, 16, 36, 28, 14, 11, 20, 33, 36, 13, 28]


<keras.callbacks.History at 0x7f460c2e00d0>