# Data Loading and Preprocessing
The dataset was pre-split into training and testing sets for consistency. Only label encoding was performed at this stage, specifically for the benchmark model. Further model-specific preprocessing is described in later sections.

In [None]:
# Mount Google Drive (Colab only) so the dataset files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

import time

from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras import layers, models

from tensorflow.keras.callbacks import Callback
from tensorflow.keras.optimizers import AdamW

In [None]:
def _unpack_records(records):
    """Split loaded sample records into parallel NumPy arrays.

    Each record is indexed by the keys 'data', 'label' and 'file_name'.

    Returns:
        Tuple (data, labels, file_names) of NumPy arrays, one entry per record.
    """
    return (
        np.array([rec['data'] for rec in records]),
        np.array([rec['label'] for rec in records]),
        np.array([rec['file_name'] for rec in records]),
    )


# NOTE: allow_pickle=True unpickles Python objects stored in the file;
# only load .npy files that come from a trusted source.

# --- Training split ---
train_data = np.load('/content/drive/MyDrive/train.npy', allow_pickle=True)
X_train, y_train, file_name_train = _unpack_records(train_data)

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of file_name_train:", file_name_train.shape)

# --- Test split ---
test_data = np.load('/content/drive/MyDrive/test.npy', allow_pickle=True)
X_test, y_test, file_name_test = _unpack_records(test_data)

print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)
print("Shape of file_name_test:", file_name_test.shape)

Shape of X_train: (1185, 113, 225)
Shape of y_train: (1185,)
Shape of file_name_train: (1185,)
Shape of X_test: (60, 113, 225)
Shape of y_test: (60,)
Shape of file_name_test: (60,)


In [None]:
# Learn the label -> integer encoding from the training labels only,
# then apply that same encoding to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Keep an integer-encoded NumPy copy of the test labels
y_test_encoded_np = np.array(y_test)

# Lookup table: original label -> encoded integer
label_mapping = {
    label: code
    for label, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}

# Base Model (Benchmark)
This base model demonstrated the potential of the Transformer architecture for this task. To account for the inherent randomness in neural network training, each model variation, including this base model, was trained three times. The average and standard deviation of key metrics (accuracy, loss, and inference time) were then calculated for comparison.<br>
The base model achieved the following results:

* Average Training Time: 54.94 seconds ± 1.85 seconds
* Average Inference Time: 5.16 seconds ± 0.00 seconds
* Average Accuracy: 0.6444 ± 0.0478
* Average Loss: 1.8348 ± 0.0670

In [None]:
# List the GPU devices visible to TensorFlow (an empty list means none are visible).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
def create_transformer_model(input_shape, num_classes):
    """Build the benchmark Transformer-encoder classifier (post-norm blocks).

    Args:
        input_shape: (n_frames, n_features) of a single sample, without the batch axis.
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras functional Model that maps a
        (batch, n_frames, n_features) input to softmax class probabilities.
    """
    n_frames, n_features = input_shape[0], input_shape[1]
    inputs = layers.Input(shape=input_shape)

    # Positional Encoding: one learned n_features-wide vector per frame index,
    # added to every sample by broadcasting over the batch axis.
    positional_encoding = layers.Embedding(input_dim=n_frames, output_dim=n_features)(tf.range(n_frames))
    x = inputs + positional_encoding

    # Transformer Encoder
    for _ in range(4):  # Number of Transformer blocks
        attention_output = layers.MultiHeadAttention(num_heads=4, key_dim=64)(x, x)
        x = layers.LayerNormalization(epsilon=1e-6)(x + attention_output)
        # The feed-forward output is added back onto x, so its width must equal
        # the feature width; derive it from input_shape instead of hard-coding 225.
        ff_output = layers.Dense(n_features, activation='relu')(x)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ff_output)

    # Global Average Pooling over the frame axis
    x = layers.GlobalAveragePooling1D()(x)

    # Output Layer
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    return models.Model(inputs, outputs)

In [None]:
# Train and time the benchmark model three times, collecting per-run metrics
# and test-set predictions in the lists below.
input_shape = (113, 225)  # (n_frames, n_keypoints * n_coordinates)
num_classes = 30
val_accuracy_scores = []  # last-epoch validation accuracy of each run
val_loss_scores = []      # last-epoch validation loss of each run
training_times = []       # wall-clock seconds spent in fit() per run
inference_times = []      # wall-clock seconds spent in predict() per run
all_predictions = []      # predict() output on X_test per run

# Run model training 3 times
for i in range(3):
    # Instantiate the model (a fresh, untrained model for every run)
    transformer_model = create_transformer_model(input_shape, num_classes)

    # Sparse loss: y_train / y_test are integer class ids at this point
    transformer_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    print(f"Training Run {i+1}")
    start_time = time.time()  # Start the timer

    # NOTE(review): the test split doubles as the validation set here, so the
    # "val_*" metrics reported below are test-set metrics.
    history = transformer_model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=30,
        batch_size=32,
    )

    end_time = time.time()  # End the timer
    elapsed_time = end_time - start_time
    training_times.append(elapsed_time)
    print(f"Training Time Run {i+1}: {elapsed_time:.2f} seconds\n")

    print(f"Inference Run {i+1}")
    start_time = time.time()  # Start the timer

    # NOTE(review): this is the first predict() call on a fresh model, so the
    # measured time presumably includes one-off setup overhead - confirm.
    predictions = transformer_model.predict(X_test)

    end_time = time.time()  # End the timer
    elapsed_time = end_time - start_time
    inference_times.append(elapsed_time)
    print(f"Inference Time Run {i+1}: {elapsed_time:.2f} seconds\n")
    all_predictions.append(predictions)

    # Record the metrics of the final (30th) epoch
    val_accuracy_scores.append(history.history['val_accuracy'][-1])
    val_loss_scores.append(history.history['val_loss'][-1])

Training Run 1
Epoch 1/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 268ms/step - accuracy: 0.0713 - loss: 3.5374 - val_accuracy: 0.1167 - val_loss: 2.7976
Epoch 2/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.1500 - loss: 2.6963 - val_accuracy: 0.3167 - val_loss: 2.4938
Epoch 3/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.3490 - loss: 1.8920 - val_accuracy: 0.2667 - val_loss: 2.2554
Epoch 4/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.4449 - loss: 1.4753 - val_accuracy: 0.3667 - val_loss: 2.7555
Epoch 5/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.4705 - loss: 1.5337 - val_accuracy: 0.3833 - val_loss: 2.0636
Epoch 6/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.6358 - loss: 0.9607 - val_accuracy: 0.3500 - val_loss: 1.8596
Epoch 7/30
[1m38



[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1s[0m 2s/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2s/step
Inference Time Run 3: 5.17 seconds



## Result

In [None]:
def compute_and_print_metrics(training_times, inference_times, val_accuracy_scores, val_loss_scores):
    """Print the mean and standard deviation of each per-run metric.

    The standard deviation is NumPy's default (population, ddof=0).

    Args:
        training_times: Per-run training durations in seconds.
        inference_times: Per-run inference durations in seconds.
        val_accuracy_scores: Per-run validation accuracies.
        val_loss_scores: Per-run validation losses.

    Returns:
        None. Four summary lines are written to stdout.
    """
    def _mean_and_std(samples):
        return np.mean(samples), np.std(samples)

    # Compute every statistic first so nothing is printed if any input is invalid
    avg_training_time, std_training_time = _mean_and_std(training_times)
    avg_inference_time, std_inference_time = _mean_and_std(inference_times)
    avg_accuracy, std_accuracy = _mean_and_std(val_accuracy_scores)
    avg_loss, std_loss = _mean_and_std(val_loss_scores)

    report_lines = (
        f"Average Training Time: {avg_training_time:.2f} seconds, Std Deviation {std_training_time:.2f} seconds",
        f"Average Inference Time: {avg_inference_time:.2f} seconds, Std Deviation {std_inference_time:.2f} seconds",
        f"Average Accuracy: {avg_accuracy:.4f}, Std Dev: {std_accuracy:.4f}",
        f"Average Loss: {avg_loss:.4f}, Std Dev: {std_loss:.4f}",
    )
    for line in report_lines:
        print(line)

In [None]:
# Summarise the benchmark runs recorded by the preceding training loop
compute_and_print_metrics(training_times, inference_times, val_accuracy_scores, val_loss_scores)

Average Training Time: 54.94 seconds, Std Deviation 1.85 seconds
Average Inference Time: 5.16 seconds, Std Deviation 0.00 seconds
Average Accuracy: 0.6444, Std Dev: 0.0478
Average Loss: 1.8348, Std Dev: 0.0670


Misclassified Labels

In [None]:
def get_misclassified_dataframe(all_predictions, label_mapping, y_test, file_names=None):
    """Majority-vote the per-run predictions and tabulate the misclassified samples.

    Args:
        all_predictions: Non-empty sequence of per-run class-probability arrays,
            each of shape (n_samples, n_classes).
        label_mapping: Dict mapping original label -> integer class id.
        y_test: True labels, either integer class ids of shape (n_samples,) or
            one-hot rows of shape (n_samples, n_classes). Anything np.asarray
            can convert is accepted.
        file_names: Optional per-sample file names. When omitted, the
            notebook-level `file_name_test` is used.

    Returns:
        pandas DataFrame with the columns "Test Index", "True Label",
        "Predicted Label" and "File Name", one row per misclassified sample.

    Raises:
        ValueError: If `all_predictions` is empty or the label count does not
            match the prediction count.
    """
    if len(all_predictions) == 0:
        raise ValueError("all_predictions must contain at least one run")
    if file_names is None:
        file_names = file_name_test  # fall back to the notebook-level global

    # Accept both integer-encoded and one-hot encoded ground truth
    true_labels = np.asarray(y_test)
    if true_labels.ndim == 2:
        true_labels = true_labels.argmax(axis=1)

    # Convert to integer labels
    int_predictions = [np.argmax(p, axis=1) for p in all_predictions]
    # Stack predictions for each run
    stacked_predictions = np.array(int_predictions)  # Shape: (n_runs, n_samples)
    # Majority vote; on a tie bincount().argmax() picks the smallest class id
    final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=stacked_predictions)

    if final_predictions.shape != true_labels.shape:
        raise ValueError(
            f"y_test has {true_labels.shape[0] if true_labels.ndim else 0} labels "
            f"but predictions cover {final_predictions.shape[0]} samples"
        )

    # Reverse the dictionary (plain-int keys so NumPy integer lookups always match)
    reverse_label_mapping = {int(v): k for k, v in label_mapping.items()}
    # Get the misclassified index
    wrong_indices = np.where(final_predictions != true_labels)[0]
    # Create a DataFrame for misclassified samples
    misclassified_df = pd.DataFrame({
        "Test Index": wrong_indices,
        "True Label": [reverse_label_mapping[int(true_labels[i])] for i in wrong_indices],
        "Predicted Label": [reverse_label_mapping[int(final_predictions[i])] for i in wrong_indices],
        "File Name": [file_names[i] for i in wrong_indices]
    })
    return misclassified_df

In [None]:
# Majority-vote the benchmark runs and display the misclassified test samples
misclassified_df = get_misclassified_dataframe(all_predictions, label_mapping, y_test)
print(misclassified_df)

    Test Index True Label Predicted Label       File Name
0            2      teman         panggil    teman_07.npy
1            3      teman            guru    teman_09.npy
2            8    sedikit            anak  sedikit_01.npy
3           20      marah          kertas    marah_09.npy
4           22      makan           minum    makan_04.npy
5           23      makan           minum    makan_09.npy
6           25       main         gembira     main_10.npy
7           27       maaf             ibu     maaf_09.npy
8           28      lihat             ibu    lihat_04.npy
9           29      lihat            haus    lihat_07.npy
10          31      lapar            anak    lapar_07.npy
11          33     kucing         gembira   kucing_10.npy
12          34     kertas            main   kertas_01.npy
13          35     kertas            main   kertas_03.npy
14          39      jalan            anak    jalan_09.npy
15          41        ibu          dengar      ibu_03.npy
16          44

# Model Variations

# Model A

Building upon the base model, several key improvements were implemented in Model A. These improvements focused on three main areas: input preprocessing (addition of keypoint angles, standardization of coordinates, label smoothing), Transformer architecture (pre-normalization, increased attention heads, and additional feedforward layers), and compilation settings/training strategy (weight decay, dynamic learning rate scheduling, varied batch sizes, and early stopping).<br>
Model A achieved the following results:

* Average Training Time: 56.12 seconds ± 8.12 seconds
* Average Inference Time: 6.93 seconds ± 2.37 seconds
* Average Accuracy: 0.8000 ± 0.0471
* Average Loss: 1.3277 ± 0.0903

In [None]:
# Angle at vertex B formed by the points A, B and C
def calculate_angle(A, B, C):
    """Return the angle ABC in radians, measured at the vertex B.

    Args:
        A, B, C: NumPy coordinate vectors of equal length.

    Returns:
        The angle between the rays B->A and B->C, in [0, pi]. Returns 0.0 when
        A or C coincides with B, because the angle is then undefined.
    """
    ray_to_a, ray_to_c = A - B, C - B
    length_a = np.linalg.norm(ray_to_a)
    length_c = np.linalg.norm(ray_to_c)

    # A zero-length ray would make the cosine a division by zero
    if not length_a or not length_c:
        return 0.0

    cosine = np.dot(ray_to_a, ray_to_c) / (length_a * length_c)
    # Rounding can push the cosine slightly outside [-1, 1]; clip before arccos
    return np.arccos(np.clip(cosine, -1.0, 1.0))

In [None]:
def get_angles(X):
    """Compute 16 joint angles per frame from flattened 3-D keypoints.

    Every frame's feature vector is read as consecutive (x, y, z) triples, one
    per keypoint. Angles are returned in this order: 8 pose angles, 4 left-hand
    angles (hand keypoints offset by 33) and 4 right-hand angles (offset by 54).

    Args:
        X: Array of shape (n_videos, n_frames, n_features) where n_features is
            a multiple of 3 and covers at least 75 keypoints.

    Returns:
        float64 array of shape (n_videos, n_frames, 16) with angles in radians.
        An angle is 0.0 when either of its two rays has zero length (for
        example when all three keypoints are zero).

    Raises:
        ValueError: If n_features is not a multiple of 3 or has too few keypoints.
    """
    X = np.asarray(X, dtype=np.float64)
    n_videos, n_frames, n_features = X.shape
    if n_features % 3 != 0:
        raise ValueError(f"n_features must be a multiple of 3, got {n_features}")
    n_keypoints = n_features // 3

    # Define keypoints for angle calculation (indices start from 0).
    # Each triple is (A, B, C); the angle is measured at the middle point B.
    pose_angle_indices = [
        (12, 14, 16),
        (14, 16, 18),
        (18, 16, 22),
        (14, 12, 24),

        (11, 13, 15),
        (13, 15, 17),
        (17, 15, 21),
        (13, 11, 23),
    ]
    # For both left and right hands (indices relative to the hand's first keypoint)
    hand_angle_indices = [
        (4, 0, 8),
        (8, 0, 16),
        (0, 9, 12),
        (0, 17, 20),
    ]
    left_hand_offset, right_hand_offset = 33, 54

    triplets = np.array(
        pose_angle_indices
        + [tuple(left_hand_offset + idx for idx in triple) for triple in hand_angle_indices]
        + [tuple(right_hand_offset + idx for idx in triple) for triple in hand_angle_indices]
    )  # Shape: (n_angles, 3)
    if n_keypoints <= triplets.max():
        raise ValueError(
            f"need at least {triplets.max() + 1} keypoints, got {n_keypoints}"
        )

    # Reshape the data into (n_videos, n_frames, n_keypoints, 3)
    points = X.reshape(n_videos, n_frames, n_keypoints, 3)

    # Gather the three points of every angle at once: (n_videos, n_frames, n_angles, 3)
    vertex = points[:, :, triplets[:, 1]]
    BA = points[:, :, triplets[:, 0]] - vertex
    BC = points[:, :, triplets[:, 2]] - vertex

    dot_product = np.sum(BA * BC, axis=-1)
    magnitude_BA = np.linalg.norm(BA, axis=-1)
    magnitude_BC = np.linalg.norm(BC, axis=-1)
    degenerate = (magnitude_BA == 0) | (magnitude_BC == 0)

    # Degenerate entries divide by zero here; they are overwritten with 0.0 below
    with np.errstate(divide='ignore', invalid='ignore'):
        cos_angle = dot_product / (magnitude_BA * magnitude_BC)
        # Clip values to handle numerical errors
        angles = np.arccos(np.clip(cos_angle, -1.0, 1.0))

    return np.where(degenerate, 0.0, angles)  # Shape: (n_videos, n_frames, n_angles)

In [None]:
# Get joint angles for both splits. This runs before the standardization cell,
# so the angles are computed from the landmark arrays as loaded.
X_train_angles = get_angles(X_train)
X_test_angles = get_angles(X_test)

In [None]:
def smooth_labels(y_true, num_classes, smoothing=0.1):
    """One-hot encode integer labels and apply uniform label smoothing.

    Args:
        y_true: Integer class ids.
        num_classes: Depth of the one-hot encoding.
        smoothing: Probability mass spread evenly over all classes.

    Returns:
        Tensor of smoothed targets: the true class gets
        (1 - smoothing) + smoothing / num_classes and every other class gets
        smoothing / num_classes.
    """
    hard_targets = tf.one_hot(y_true, depth=num_classes)
    return hard_targets * (1 - smoothing) + (smoothing / num_classes)

In [None]:
# Apply label smoothing to the training labels.
num_classes = 30  # Number of classes in dataset
label_smoothing = 0.1  # Smoothing parameter
y_train_smoothed = smooth_labels(y_train, num_classes, smoothing=label_smoothing)

# Convert the test labels into one-hot format too, for consistency. We don't smooth the test labels.
# The one-hot tensor is built from the preserved integer copy (y_test_encoded_np)
# rather than from y_test itself, so re-running this cell never tries to
# one-hot encode labels that are already one-hot.
y_test = tf.one_hot(y_test_encoded_np, depth=num_classes)

In [None]:
# Create normalization layer. With axis=-1 the layer keeps one mean/variance per
# index of the last axis, i.e. standardization is per feature, not per frame.
normalization_layer = layers.Normalization(axis=-1)

# Adapt the normalization layer to the training data to calculate mean and std
# (statistics come from the training split only and are reused for the test split)
normalization_layer.adapt(X_train)

# Standardize the data
X_train_standardized = normalization_layer(X_train).numpy()
X_test_standardized = normalization_layer(X_test).numpy()

# Add angle features to the standardized landmark data.
# The angle features are appended as computed; they are not standardized.
# NOTE: X_train / X_test are overwritten here, so this cell must not be run twice
# without reloading the data.
X_train = np.concatenate([X_train_standardized, X_train_angles],axis=-1)
X_test = np.concatenate([X_test_standardized, X_test_angles],axis=-1)

# Convert to TensorFlow tensor
X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
# From here on y_train holds the smoothed one-hot targets, not integer class ids
y_train = tf.convert_to_tensor(y_train_smoothed, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)

In [None]:
def create_transformer_model(input_shape, num_classes):
    """Build the Model A Transformer-encoder classifier.

    Compared with the benchmark, each block normalizes before attention, uses
    8 attention heads and has a two-layer feed-forward network.

    Args:
        input_shape: (n_frames, n_features) of a single sample, without the batch axis.
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras functional Model that maps a
        (batch, n_frames, n_features) input to softmax class probabilities.
    """
    n_frames, n_features = input_shape[0], input_shape[1]
    inputs = layers.Input(shape=input_shape)

    # Positional Encoding: one learned n_features-wide vector per frame index
    positional_encoding = layers.Embedding(input_dim=n_frames, output_dim=n_features)(tf.range(n_frames))
    x = inputs + positional_encoding

    # Transformer Encoder
    for _ in range(4):  # Number of Transformer blocks
        # Normalized before attention, instead of after
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        # Instead of 4 (base), 8 used
        attention_output = layers.MultiHeadAttention(num_heads=8, key_dim=64)(x_norm, x_norm)
        x = x + attention_output
        # Two-layer feed-forward network: expand to 512 units, then project back
        # to the feature width so the result can be added onto x. The projection
        # must take ff_output as its input; feeding it x would leave the
        # 512-unit layer disconnected from the model.
        ff_output = layers.Dense(512, activation='relu')(x)
        ff_output = layers.Dense(n_features, activation='relu')(ff_output)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ff_output)

    # Global Average Pooling over the frame axis
    x = layers.GlobalAveragePooling1D()(x)

    # Output Layer
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    return models.Model(inputs, outputs)

In [None]:
# Train and time Model A three times, collecting per-run metrics and
# test-set predictions in the lists below.
input_shape = (113, 241)  # (n_frames, n_keypoints * n_coordinates + n_angles)
num_classes = 30
val_accuracy_scores = []
val_loss_scores = []
training_times = []
inference_times = []
all_predictions = []

# Define weight decay
weight_decay = 1e-4  # Can adjust this value (e.g., 1e-3, 5e-5)

# Run model training 3 times
for i in range(3):
    # Instantiate the model
    transformer_model = create_transformer_model(input_shape, num_classes)

    # Compile the model (categorical loss: the targets are one-hot / smoothed)
    transformer_model.compile(
        optimizer=AdamW(learning_rate=0.001, weight_decay=weight_decay),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    # Add the learning rate scheduler callback
    lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-6
    )
    # Add early stopping callback
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )
    # Combine Callbacks
    callbacks = [lr_callback, early_stopping]
    print(f"Training Run {i+1}")
    start_time = time.time()  # Start the timer

    # NOTE(review): the test split doubles as the validation set, so early
    # stopping and the learning-rate schedule are driven by test-set loss.
    history = transformer_model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=50,
        batch_size=32,
        callbacks=callbacks  # already a list; do not wrap it in another list
    )

    end_time = time.time()  # End the timer
    elapsed_time = end_time - start_time
    training_times.append(elapsed_time)
    print(f"Training Time Run {i+1}: {elapsed_time:.2f} seconds\n")

    print(f"Inference Run {i+1}")
    start_time = time.time()  # Start the timer

    predictions = transformer_model.predict(X_test)

    end_time = time.time()  # End the timer
    elapsed_time = end_time - start_time
    inference_times.append(elapsed_time)
    print(f"Inference Time Run {i+1}: {elapsed_time:.2f} seconds\n")
    all_predictions.append(predictions)

    # Score the weights the model actually ends up with. Early stopping restores
    # the best-epoch weights, so the last entry of history.history belongs to a
    # later, worse epoch than the model used for `predictions`.
    final_scores = transformer_model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    val_accuracy_scores.append(final_scores['accuracy'])
    val_loss_scores.append(final_scores['loss'])

Training Run 1
Epoch 1/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 470ms/step - accuracy: 0.2519 - loss: 2.9809 - val_accuracy: 0.4833 - val_loss: 2.1019 - learning_rate: 0.0010
Epoch 2/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 0.7784 - loss: 1.3041 - val_accuracy: 0.6333 - val_loss: 1.7323 - learning_rate: 0.0010
Epoch 3/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.9338 - loss: 0.9447 - val_accuracy: 0.6667 - val_loss: 1.6458 - learning_rate: 0.0010
Epoch 4/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.9493 - loss: 0.8747 - val_accuracy: 0.7500 - val_loss: 1.4285 - learning_rate: 0.0010
Epoch 5/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.9766 - loss: 0.7877 - val_accuracy: 0.7667 - val_loss: 1.4207 - learning_rate: 0.0010
Epoch 6/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37



[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m2s[0m 2s/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2s/step
Inference Time Run 3: 5.15 seconds



## Result

In [None]:
# Summarise the Model A runs recorded by the preceding training loop
compute_and_print_metrics(training_times, inference_times, val_accuracy_scores, val_loss_scores)

Average Training Time: 56.12 seconds, Std Deviation 8.12 seconds
Average Inference Time: 6.93 seconds, Std Deviation 2.37 seconds
Average Accuracy: 0.8000, Std Dev: 0.0471
Average Loss: 1.3277, Std Dev: 0.0903


Misclassified Labels

In [None]:
# Display the DataFrame
# y_test has been converted to a one-hot tensor by this point, so compare the
# predictions against the preserved integer-encoded test labels instead.
misclassified_df = get_misclassified_dataframe(all_predictions, label_mapping, y_test_encoded_np)
print(misclassified_df)

    Test Index True Label Predicted Label       File Name
0            5     senyum           lihat   senyum_07.npy
1            8    sedikit           orang  sedikit_01.npy
2           14      orang            adik    orang_02.npy
3           15      orang            adik    orang_04.npy
4           22      makan           minum    makan_04.npy
5           28      lihat           makan    lihat_04.npy
6           34     kertas            main   kertas_01.npy
7           39      jalan        keluarga    jalan_09.npy
8           50       anak           orang     anak_03.npy
9           56     dengar             ibu   dengar_02.npy
10          57     dengar             ibu   dengar_04.npy


# Model B
Model B builds upon the architecture and preprocessing steps of Model A but incorporates hyperparameter tuning using Keras Tuner. This automated search explored different combinations of hyperparameters to optimize model performance. The tuning process and the resulting best hyperparameters are detailed below.

The final Model B achieved the following results:

* Average Training Time: 85.06 seconds ± 16.41 seconds
* Average Inference Time: 6.86 seconds ± 2.41 seconds
* Average Accuracy: 0.8333 ± 0.0136
* Average Loss: 1.1957 ± 0.0115

## Hyperparameter Tuning

In [None]:
pip install -q keras-tuner

In [None]:
import keras_tuner as kt
from tensorflow.keras import layers, models
import tensorflow as tf
from tensorflow.keras.optimizers import AdamW

# Define the HyperModel
def build_transformer_model(hp):
    """Build and compile one candidate Transformer classifier for Keras Tuner.

    Searched hyperparameters: num_blocks (2-6), num_heads (4/8/12),
    key_dim (32/64/128), ff_units (128-512 in steps of 128),
    weight_decay (1e-4/5e-5/1e-5) and learning_rate (1e-2/1e-3/1e-4).

    Args:
        hp: The hyperparameter container supplied by the tuner.

    Returns:
        A compiled Keras Model (AdamW optimizer, categorical cross-entropy loss,
        accuracy metric).
    """
    input_shape = (113, 241)  # (n_frames, n_keypoints * n_coordinates + n_angles)
    num_classes = 30

    inputs = layers.Input(shape=input_shape)

    # Positional Encoding
    frame_positions = tf.range(input_shape[0])
    positional_encoding = layers.Embedding(input_dim=input_shape[0], output_dim=input_shape[1])(frame_positions)
    x = inputs + positional_encoding

    # Every block shares the same sampled values, so look them up once,
    # in the same order the block body would first request them.
    num_blocks = hp.Int("num_blocks", 2, 6, step=1)  # 2 to 6 blocks
    attention_heads = hp.Choice("num_heads", [4, 8, 12])  # Choose between 4, 8, 12 heads
    key_dim = hp.Choice("key_dim", [32, 64, 128])
    ff_units = hp.Int("ff_units", min_value=128, max_value=512, step=128)  # Units in FF layers

    for _ in range(num_blocks):
        # Pre-norm self-attention with a residual connection
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        attention = layers.MultiHeadAttention(num_heads=attention_heads, key_dim=key_dim)
        x = x + attention(x_norm, x_norm)

        # Feed Forward Network: expand to ff_units, project back to 241 features
        hidden = layers.Dense(ff_units, activation='relu')(x)
        ff_output = layers.Dense(241, activation='relu')(hidden)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ff_output)

    # Global Average Pooling
    x = layers.GlobalAveragePooling1D()(x)

    # Output Layer
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs, outputs)

    # Define optimizer with weight decay
    weight_decay = hp.Choice("weight_decay", [1e-4, 5e-5, 1e-5])
    learning_rate = hp.Choice("learning_rate", [1e-2, 1e-3, 1e-4])

    # Compile the model
    model.compile(
        optimizer=AdamW(learning_rate=learning_rate, weight_decay=weight_decay),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    return model


# Instantiate the tuner (Bayesian optimization over the space declared in build_transformer_model)
tuner = kt.BayesianOptimization(
    build_transformer_model,
    objective="val_accuracy",
    max_trials=10,  # Number of combinations to try
    directory="transformer_tuning",  # Directory to save results
    project_name="hyperparam_tuning"
)

# Perform hyperparameter search
# NOTE(review): the objective is measured on the test split, so the selected
# hyperparameters are tuned against the same data used for the final results.
tuner.search(
    X_train, y_train,
    epochs=20,
    validation_data=(X_test, y_test),
    batch_size=32,
)

# Get the best hyperparameters (not a trained model) and report them
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:")
print(f"Number of Transformer Blocks: {best_hps.get('num_blocks')}")
print(f"Number of Attention Heads: {best_hps.get('num_heads')}")
print(f"Key Dimension: {best_hps.get('key_dim')}")
print(f"Feed Forward Units: {best_hps.get('ff_units')}")
print(f"Learning Rate: {best_hps.get('learning_rate')}")
print(f"Weight Decay: {best_hps.get('weight_decay')}")

# Build a fresh model with the best hyperparameters and train it
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(
    X_train, y_train,
    epochs=30,
    validation_data=(X_test, y_test),
    batch_size=32
)


Trial 10 Complete [00h 01m 42s]
val_accuracy: 0.8666666746139526

Best val_accuracy So Far: 0.8666666746139526
Total elapsed time: 00h 14m 18s
Best Hyperparameters:
Number of Transformer Blocks: 4
Number of Attention Heads: 8
Key Dimension: 128
Feed Forward Units: 512
Learning Rate: 0.001
Weight Decay: 0.0001
Epoch 1/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 410ms/step - accuracy: 0.1863 - loss: 3.4616 - val_accuracy: 0.3833 - val_loss: 2.3189
Epoch 2/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 58ms/step - accuracy: 0.5221 - loss: 1.9046 - val_accuracy: 0.3667 - val_loss: 2.1475
Epoch 3/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step - accuracy: 0.7263 - loss: 1.3797 - val_accuracy: 0.5667 - val_loss: 1.8893
Epoch 4/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 59ms/step - accuracy: 0.8264 - loss: 1.1513 - val_accuracy: 0.5667 - val_loss: 1.5722
Epoch 5/30
[1m38/38[0m [32m━━━━━━━━━━━━━

## Model Building with Best Parameter

Best Hyperparameters:
* Number of Transformer Blocks: 4
* Number of Attention Heads: 8
* Key Dimension: 128
* Feed Forward Units: 512
* Learning Rate: 0.001
* Weight Decay: 0.0001

The only hyperparameter change in Model B compared to Model A was the key dimension, which was increased from 64 to 128.

In [None]:
def create_transformer_model(input_shape, num_classes):
    """Build the Model B Transformer-encoder classifier.

    Same layout as Model A (pre-norm attention, 8 heads, two-layer feed-forward
    network) with the attention key dimension raised to 128.

    Args:
        input_shape: (n_frames, n_features) of a single sample, without the batch axis.
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras functional Model that maps a
        (batch, n_frames, n_features) input to softmax class probabilities.
    """
    n_frames, n_features = input_shape[0], input_shape[1]
    inputs = layers.Input(shape=input_shape)

    # Positional Encoding: one learned n_features-wide vector per frame index
    positional_encoding = layers.Embedding(input_dim=n_frames, output_dim=n_features)(tf.range(n_frames))
    x = inputs + positional_encoding

    # Transformer Encoder
    for _ in range(4):  # Number of Transformer blocks
        # Layer Normalization (applied before attention, based on prior experimentation)
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        # Multi-Head Attention (8 heads, increased key dimension to 128 from base 64)
        attention_output = layers.MultiHeadAttention(num_heads=8, key_dim=128)(x_norm, x_norm)
        x = x + attention_output
        # Feed-Forward Network: expand to 512 units, then project back to the
        # feature width so the result can be added onto x. The projection must
        # take ff_output as its input; feeding it x would leave the 512-unit
        # layer disconnected from the model.
        ff_output = layers.Dense(512, activation='relu')(x)
        ff_output = layers.Dense(n_features, activation='relu')(ff_output)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ff_output)

    # Global Average Pooling over the frame axis
    x = layers.GlobalAveragePooling1D()(x)

    # Output Layer
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    return models.Model(inputs, outputs)

In [None]:
# Train and time Model B three times, collecting per-run metrics and
# test-set predictions in the lists below.
input_shape = (113, 241)  # (n_frames, n_keypoints * n_coordinates + n_angles)
num_classes = 30
val_accuracy_scores = []
val_loss_scores = []
training_times = []
inference_times = []
all_predictions = []

# Define weight decay
weight_decay = 1e-4

# Run model training 3 times
for i in range(3):
    # Instantiate the model
    transformer_model = create_transformer_model(input_shape, num_classes)

    # Compile the model (categorical loss: the targets are one-hot / smoothed)
    transformer_model.compile(
        optimizer=AdamW(learning_rate=0.001, weight_decay=weight_decay),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    # Add the learning rate scheduler callback
    lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-6
    )
    # Add early stopping callback
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )
    # Combine Callbacks
    callbacks = [lr_callback, early_stopping]
    print(f"Training Run {i+1}")
    start_time = time.time()  # Start the timer

    # NOTE(review): the test split doubles as the validation set, so early
    # stopping and the learning-rate schedule are driven by test-set loss.
    history = transformer_model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=50,
        batch_size=32,
        callbacks=callbacks  # already a list; do not wrap it in another list
    )

    end_time = time.time()  # End the timer
    elapsed_time = end_time - start_time
    training_times.append(elapsed_time)
    print(f"Training Time Run {i+1}: {elapsed_time:.2f} seconds\n")

    print(f"Inference Run {i+1}")
    start_time = time.time()  # Start the timer

    predictions = transformer_model.predict(X_test)

    end_time = time.time()  # End the timer
    elapsed_time = end_time - start_time
    inference_times.append(elapsed_time)
    print(f"Inference Time Run {i+1}: {elapsed_time:.2f} seconds\n")
    all_predictions.append(predictions)

    # Score the weights the model actually ends up with. Early stopping restores
    # the best-epoch weights, so the last entry of history.history belongs to a
    # later, worse epoch than the model used for `predictions`.
    final_scores = transformer_model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    val_accuracy_scores.append(final_scores['accuracy'])
    val_loss_scores.append(final_scores['loss'])

Training Run 1
Epoch 1/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 357ms/step - accuracy: 0.2308 - loss: 3.1903 - val_accuracy: 0.4500 - val_loss: 2.2628 - learning_rate: 0.0010
Epoch 2/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 49ms/step - accuracy: 0.5944 - loss: 1.7017 - val_accuracy: 0.5000 - val_loss: 1.8774 - learning_rate: 0.0010
Epoch 3/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.8134 - loss: 1.2227 - val_accuracy: 0.5167 - val_loss: 1.7587 - learning_rate: 0.0010
Epoch 4/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step - accuracy: 0.9252 - loss: 0.9944 - val_accuracy: 0.5667 - val_loss: 1.7356 - learning_rate: 0.0010
Epoch 5/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step - accuracy: 0.8812 - loss: 1.1006 - val_accuracy: 0.6833 - val_loss: 1.6056 - learning_rate: 0.0010
Epoch 6/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3



[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1s[0m 2s/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3s/step
Inference Time Run 3: 5.15 seconds



## Result

In [None]:
# Summarise the Model B runs recorded by the preceding training loop
compute_and_print_metrics(training_times, inference_times, val_accuracy_scores, val_loss_scores)

Average Training Time: 85.06 seconds, Std Deviation 16.41 seconds
Average Inference Time: 6.86 seconds, Std Deviation 2.41 seconds
Average Accuracy: 0.8333, Std Dev: 0.0136
Average Loss: 1.1957, Std Dev: 0.0115


Misclassified Labels

In [None]:
# Display the DataFrame
# y_test has been converted to a one-hot tensor by this point, so compare the
# predictions against the preserved integer-encoded test labels instead.
misclassified_df = get_misclassified_dataframe(all_predictions, label_mapping, y_test_encoded_np)
print(misclassified_df)

   Test Index True Label Predicted Label       File Name
0           8    sedikit           orang  sedikit_01.npy
1          22      makan           minum    makan_04.npy
2          28      lihat           makan    lihat_04.npy
3          34     kertas            main   kertas_01.npy
4          49    gembira            buka  gembira_10.npy
5          50       anak           orang     anak_03.npy
6          57     dengar             ibu   dengar_04.npy


# Model C
In contrast to the modifications explored in Model A and the hyperparameter tuning performed for Model B, Model C returned to the base model architecture. However, it retained the improved preprocessing steps implemented in Model A (addition of keypoint angles, coordinate standardization, and label smoothing). This approach was motivated by previous experiments suggesting that these preprocessing steps provided a significant performance boost independent of other architectural or training modifications.

Model C achieved the following results:

* Average Training Time: 41.81 seconds ± 3.74 seconds
* Average Inference Time: 5.16 seconds ± 0.00 seconds
* Average Accuracy: 0.8000 ± 0.0471
* Average Loss: 1.3508 ± 0.0349

In [None]:
def create_transformer_model(input_shape, num_classes):
    """Build a Transformer-encoder sequence classifier.

    The network adds a learned positional embedding to the input sequence,
    applies four encoder blocks (multi-head self-attention followed by a
    single dense feed-forward layer, each wrapped in a residual connection
    and layer normalization), averages over the frame axis and classifies
    with a softmax layer.

    Args:
        input_shape: ``(n_frames, n_features)`` of one sample, without the
            batch dimension.
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras ``Model`` mapping the input sequence to
        ``num_classes`` softmax probabilities.
    """
    n_frames = input_shape[0]
    n_features = input_shape[1]

    inputs = layers.Input(shape=input_shape)

    # Positional Encoding: one learned n_features-wide vector per frame index,
    # added to every sample in the batch.
    positional_encoding = layers.Embedding(input_dim=n_frames, output_dim=n_features)(tf.range(n_frames))
    x = inputs + positional_encoding

    # Transformer Encoder
    for _ in range(4):  # Number of Transformer blocks
        attention_output = layers.MultiHeadAttention(num_heads=4, key_dim=64)(x, x)
        x = layers.LayerNormalization(epsilon=1e-6)(x + attention_output)
        # The feed-forward width must equal the feature width so the residual
        # addition below is shape-compatible; derive it from input_shape
        # instead of hard-coding it.
        ff_output = layers.Dense(n_features, activation='relu')(x)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ff_output)

    # Global Average Pooling over the frame axis
    x = layers.GlobalAveragePooling1D()(x)

    # Output Layer
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    return models.Model(inputs, outputs)

In [None]:
# (n_frames, n_features). NOTE(review): 241 is presumably the 225 keypoint
# coordinates plus the added angle features — verify against the preprocessing cell.
input_shape = (113, 241)
num_classes = 30
val_accuracy_scores = []
val_loss_scores = []
training_times = []
inference_times = []
all_predictions = []

# Run model training 3 times so the reported metrics can be averaged.
# NOTE(review): the test split doubles as the validation set that drives
# early stopping, so the scores below are optimistic estimates.
for i in range(3):
    # Instantiate a fresh model for every run
    transformer_model = create_transformer_model(input_shape, num_classes)

    transformer_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=['accuracy']
    )
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )
    print(f"Training Run {i+1}")
    start_time = time.time()  # Start the timer

    history = transformer_model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=30,
        batch_size=32,
        callbacks=[early_stopping]  # Keras documents `callbacks` as a list
    )

    end_time = time.time()  # End the timer
    elapsed_time = end_time - start_time
    training_times.append(elapsed_time)
    print(f"Training Time Run {i+1}: {elapsed_time:.2f} seconds\n")

    print(f"Inference Run {i+1}")
    start_time = time.time()  # Start the timer

    predictions = transformer_model.predict(X_test)

    end_time = time.time()  # End the timer
    elapsed_time = end_time - start_time
    inference_times.append(elapsed_time)
    print(f"Inference Time Run {i+1}: {elapsed_time:.2f} seconds\n")
    all_predictions.append(predictions)

    # Score the model that is actually kept. With restore_best_weights=True the
    # final weights may come from an earlier epoch than the last one, so the
    # last entry of history.history would not describe the model that produced
    # `predictions`. Evaluation happens outside both timed sections.
    final_loss, final_accuracy = transformer_model.evaluate(X_test, y_test, verbose=0)
    val_accuracy_scores.append(final_accuracy)
    val_loss_scores.append(final_loss)

Training Run 1
Epoch 1/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 325ms/step - accuracy: 0.2979 - loss: 2.8097 - val_accuracy: 0.5500 - val_loss: 1.8318
Epoch 2/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.8552 - loss: 1.1687 - val_accuracy: 0.7167 - val_loss: 1.5285
Epoch 3/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9693 - loss: 0.8681 - val_accuracy: 0.7500 - val_loss: 1.4477
Epoch 4/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9602 - loss: 0.8749 - val_accuracy: 0.8333 - val_loss: 1.3378
Epoch 5/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9641 - loss: 0.8244 - val_accuracy: 0.7333 - val_loss: 1.5243
Epoch 6/30
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9992 - loss: 0.7461 - val_accuracy: 0.8500 - val_loss: 1.3971
Epoch 7/30
[1m38

## Result

In [None]:
# Report the mean and standard deviation of the training times, inference
# times, validation accuracies and validation losses gathered over the runs.
compute_and_print_metrics(training_times, inference_times, val_accuracy_scores, val_loss_scores)

Average Training Time: 41.81 seconds, Std Deviation 3.74 seconds
Average Inference Time: 5.16 seconds, Std Deviation 0.00 seconds
Average Accuracy: 0.8000, Std Dev: 0.0471
Average Loss: 1.3508, Std Dev: 0.0349


Misclassified Labels

In [None]:
# Build the table of misclassified test samples from the collected predictions
# and print it as plain text.
# NOTE(review): how the per-run predictions in `all_predictions` are combined
# is decided inside get_misclassified_dataframe — confirm against its definition.
misclassified_df = get_misclassified_dataframe(all_predictions, label_mapping, y_test)
print(misclassified_df)

    Test Index True Label Predicted Label       File Name
0            8    sedikit           orang  sedikit_01.npy
1           14      orang           jalan    orang_02.npy
2           15      orang            adik    orang_04.npy
3           22      makan           minum    makan_04.npy
4           28      lihat           makan    lihat_04.npy
5           29      lihat            haus    lihat_07.npy
6           34     kertas            main   kertas_01.npy
7           39      jalan        keluarga    jalan_09.npy
8           50       anak        keluarga     anak_03.npy
9           56     dengar             ibu   dengar_02.npy
10          57     dengar             ibu   dengar_04.npy


In [None]:
!pip install -q dagshub

from dagshub.notebook import save_notebook

save_notebook(repo="Omdena/JakartaIndonesia_SignLanguageTranslation", path="modeling", branch="kenji_modeling", commit_message="Add finalized Model Selection and Comparison notebook")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.6/255.6 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.2/203.2 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.2/83.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.0/74.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h