# Temporal Multi-Channel Vision Transformer (TMC-ViT)

**Author**: Ricardo V. Godoy <br>
**Description**: This project implements a Transformer-based model called Temporal Multi-Channel Vision Transformer (TMC-ViT). The TMC-ViT was developed to adapt the Vision Transformer model proposed by [Dosovitskiy et al.](http://arxiv.org/abs/2010.11929) for processing multi-channel temporal signals as input. In this example, we will predict 18 gestures from the [Ninapro DB05
Database](https://doi.org/10.1371/journal.pone.0186132). <br>
**Requirements**: This model is developed in Keras 2.8.0 and Python 3.7.x.

### Import libraries


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout, Conv2D, MaxPooling2D, BatchNormalization

### Load the data
The input data must already be divided into training and test sets, with 200 ms samples. Use one separate repetition for testing. More information on the data preprocessing can be found in [Electromyography-Based, Robust Hand Motion Classification Employing Temporal Multi-Channel Vision Transformers](https://ieeexplore.ieee.org/document/9834070).


In [None]:
# Read the pre-split samples (X_*) and their labels (Y_* files) from the
# current working directory.
X_test = np.load('./X_test.npy')
X_train = np.load('./X_train.npy')
y_test = np.load('./Y_test.npy')
y_train = np.load('./Y_train.npy')

# Give every sample an explicit trailing channel axis so it can be fed to the
# 2-D convolutions: each sample becomes a 16 x 40 x 1 array
# (presumably 16 EMG channels by 40 time samples -- confirm against the
# preprocessing).
X_train = np.reshape(X_train, (-1, 16, 40, 1))
X_test = np.reshape(X_test, (-1, 16, 40, 1))

### Define the model parameters

In [None]:
# --- Task -------------------------------------------------------------------
num_classes = 18  # Number of output gestures (17 gestures + rest gesture)
# Per-sample input shape, read from the loaded training data.
input_shape = [X_train.shape[1], X_train.shape[2], 1]

# --- Patch grid -------------------------------------------------------------
image_size1 = 16  # Number of EMG channels
image_size2 = 20  # Number of time steps. Will be resized to this value
patch_size = 4  # Side length of each square patch (here 4x4)
# Patches per sample = patches along axis 1 times patches along axis 2.
num_patches = (
    (image_size1 // patch_size) * (image_size2 // patch_size)
)

# --- Transformer encoder ----------------------------------------------------
# Output dimension of all sub-layers in the encoder, as well as of the
# embedding layers.
projection_dim = 64
num_heads = 4  # Number of attention heads per multi-head attention layer
# Widths of the dense layers inside each transformer block; the last entry is
# projection_dim.
transformer_units = [2 * projection_dim, projection_dim]
transformer_layers = 8  # Number of transformer blocks

# --- Classification head ----------------------------------------------------
mlp_head_units = [2048, 1024]  # Widths of the dense layers of the final classifier

### Define the MLP used at the end of the TMC-ViT model

In [None]:
def mlp(x, hidden_units, dropout_rate):
    """Stack GELU dense layers, each followed by dropout, on top of ``x``.

    Args:
        x: Input tensor the stack is applied to.
        hidden_units: Iterable with the width of each dense layer, in order.
        dropout_rate: Dropout rate applied after every dense layer.

    Returns:
        The output of the last dropout layer, or ``x`` unchanged when
        ``hidden_units`` is empty.
    """
    out = x
    for width in hidden_units:
        dense = layers.Dense(width, activation=tf.nn.gelu)
        out = dense(out)
        out = layers.Dropout(dropout_rate)(out)
    return out

### Define the patch class, which extracts patches from the input signal

In [None]:
class Patches(layers.Layer):
    """Cut a 4-D input into non-overlapping square patches.

    The patch side length is also used as the stride, so patches do not
    overlap; with ``padding="VALID"`` only full patches are produced.

    Args:
        patch_size: Side length of each square patch.
        **kwargs: Standard Keras layer arguments (e.g. ``name``, ``dtype``),
            forwarded to ``layers.Layer``.
    """

    def __init__(self, patch_size, **kwargs):
        # Forward base-layer kwargs so the layer can be rebuilt from its
        # config (from_config passes ``name``/``dtype``/... back in).
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return the patches of ``images`` as a rank-3 tensor.

        Args:
            images: Batched 4-D tensor, batch axis first.

        Returns:
            Tensor of shape ``[batch_size, n_patches, patch_dims]`` where
            ``patch_dims`` is the last dimension produced by
            ``tf.image.extract_patches``.
        """
        # Dynamic batch size: the static one is unknown while building the
        # functional model.
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        # Static size of one flattened patch.
        patch_dims = patches.shape[-1]
        # Collapse the 2-D grid of patches into a single sequence axis.
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

### Define the patch encoder
In this step, the patches are linearly projected and the position embedding is added.

In [None]:
class PatchEncoder(layers.Layer):
    """Linearly project patches and add a learned position embedding.

    Args:
        num_patches: Number of patch positions; sets the size of the position
            embedding table.
        projection_dim: Output dimension of both the linear projection and
            the position embedding.
        **kwargs: Standard Keras layer arguments (e.g. ``name``, ``dtype``),
            forwarded to ``layers.Layer``.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        # Forward base-layer kwargs so the layer can be rebuilt from its
        # config (from_config passes ``name``/``dtype``/... back in).
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # Kept only so get_config can report it.
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return the projected patches plus their position embeddings.

        Args:
            patch: Tensor of patches whose last axis is projected by the
                dense layer. The patch axis must be compatible with
                ``num_patches`` for the addition to broadcast.

        Returns:
            The encoded patches, with ``projection_dim`` as the last axis.
        """
        # One index per patch position: 0 .. num_patches - 1.
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        # The position embeddings have no batch axis and are broadcast over
        # the batch by the addition.
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

### Create the TMC-ViT classifier

In [None]:
def create_tmc_vit_classifier(input_shape, token_emb, patch_size, num_patches,
        projection_dim, transformer_layers, num_heads, transformer_units,
        mlp_head_units, num_classes):
    """Assemble the TMC-ViT classifier as a Keras functional model.

    Pipeline: token embedding -> patch extraction -> patch encoding ->
    ``transformer_layers`` pre-norm transformer blocks -> layer norm,
    flatten, dropout -> MLP head -> softmax.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        token_emb: Callable (e.g. a Keras model) applied to the input before
            patches are extracted.
        patch_size: Side length of the square patches.
        num_patches: Number of patches per sample, used for the position
            embedding.
        projection_dim: Dimension of the encoded patches; also used as the
            attention ``key_dim``.
        transformer_layers: Number of transformer blocks.
        num_heads: Number of attention heads in each block.
        transformer_units: Dense widths of the MLP inside each block. The
            last width must match the encoded-patch dimension for the
            residual addition to work.
        mlp_head_units: Dense widths of the final classification MLP.
        num_classes: Number of output classes.

    Returns:
        A ``keras.Model`` mapping an input sample to ``num_classes`` softmax
        probabilities.
    """
    model_input = layers.Input(shape=input_shape)

    # Stem: embed the raw signal, cut it into patches, then project the
    # patches and add position information.
    embedded = token_emb(model_input)
    patch_seq = Patches(patch_size)(embedded)
    tokens = PatchEncoder(num_patches, projection_dim)(patch_seq)

    # Transformer encoder: each block is pre-norm self-attention followed by
    # a pre-norm MLP, both wrapped in residual connections.
    for _block in range(transformer_layers):
        normed = layers.LayerNormalization(epsilon=1e-6)(tokens)
        # Self-attention: the same tensor is passed as query and value.
        attended = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(normed, normed)
        after_attention = layers.Add()([attended, tokens])

        feed_forward = layers.LayerNormalization(epsilon=1e-6)(after_attention)
        feed_forward = mlp(
            feed_forward, hidden_units=transformer_units, dropout_rate=0.1
        )
        tokens = layers.Add()([feed_forward, after_attention])

    # Head: normalize, flatten every patch token into a single vector per
    # sample, and regularize before the classification MLP.
    head = layers.LayerNormalization(epsilon=1e-6)(tokens)
    head = layers.Flatten()(head)
    head = layers.Dropout(0.5)(head)
    head = mlp(head, hidden_units=mlp_head_units, dropout_rate=0.5)

    # The softmax makes the outputs class probabilities rather than raw
    # logits.
    class_probs = layers.Dense(num_classes, activation="softmax")(head)
    return keras.Model(inputs=model_input, outputs=class_probs)

### Create token embedding
The TMC-ViT employs convolutional blocks composed of convolutional, batch normalization, max-pooling, and dropout layers to both reduce the input dimension and extract the embeddings.

In [None]:
# Convolutional stem that turns the raw input into the feature map from which
# patches are extracted. Layers are added in the order they are applied.
token_emb = keras.Sequential(name="token_emb")

# Block 1: wide 8x8 convolution, then pooling by 2 along the second spatial
# axis only (the first axis keeps its size).
token_emb.add(
    Conv2D(16, (8, 8), activation="relu", padding="same",
           input_shape=[X_train.shape[1], X_train.shape[2], 1])
)
token_emb.add(BatchNormalization())
token_emb.add(MaxPooling2D((1, 2)))
token_emb.add(Dropout(0.3))

# Block 2: 4x4 convolution, no pooling.
token_emb.add(Conv2D(32, (4, 4), activation="relu", padding="same"))
token_emb.add(BatchNormalization())
token_emb.add(Dropout(0.3))

# Block 3: 2x2 convolution producing projection_dim feature channels.
token_emb.add(Conv2D(projection_dim, (2, 2), activation="relu", padding="same"))
token_emb.add(BatchNormalization())

### Create the classifier

In [None]:
# Build the TMC-ViT model from the hyper-parameters and the token embedding
# defined in the previous cells.
model = create_tmc_vit_classifier(
    input_shape=input_shape,
    token_emb=token_emb,
    patch_size=patch_size,
    num_patches=num_patches,
    projection_dim=projection_dim,
    transformer_layers=transformer_layers,
    num_heads=num_heads,
    transformer_units=transformer_units,
    mlp_head_units=mlp_head_units,
    num_classes=num_classes,
)

### Compile the model

In [None]:
# Adam optimizer with sparse categorical cross-entropy; accuracy is tracked
# so that 'val_accuracy' is available during training.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

### Define callback 
In this case, we will use early stopping.

In [None]:
# Stop training once validation accuracy has not improved for 70 consecutive
# epochs, and roll the model back to the best weights seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0,
    patience=70,
    restore_best_weights=True,
)

### Train the model

In [None]:
# Train for at most 500 epochs; the early-stopping callback may end training
# sooner.
# NOTE(review): the test split is passed as validation data, so early
# stopping and best-weight selection are driven by the same data that is
# evaluated afterwards -- confirm this is the intended protocol.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=500,
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[callback],
)

### Evaluate the model

In [None]:
# Report the compiled loss and accuracy on the test split.
model.evaluate(x=X_test, y=y_test)