## Vision Transformer
#### Baseline model

In [1]:
# Loading the required libraries
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# Loading and preprocessing the dataset
def load_images(image_path, target_size=(224, 224)):
    """Load every image below `image_path`, labelled by the folder it sits in.

    The root directory is expected to contain one sub-directory per class.
    Each file inside a class directory is read as an image, resized to
    `target_size` and converted to an array.

    Args:
        image_path: Root directory holding one sub-directory per class.
        target_size: Size passed to `load_img`; every image is resized to it.

    Returns:
        A tuple `(images, labels)` of NumPy arrays: the loaded image arrays
        and, for each image, the name of the directory it was found in.
        Both arrays are empty when no image is found.
    """
    images, labels = [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order, and therefore any seeded split of the result, reproducible.
    for label in sorted(os.listdir(image_path)):
        class_dir = os.path.join(image_path, label)
        # Stray files next to the class folders (.DS_Store, README, ...) are
        # not classes and cannot be listed, so they are skipped.
        if not os.path.isdir(class_dir):
            continue
        for image_file in sorted(os.listdir(class_dir)):
            image = keras.preprocessing.image.load_img(
                os.path.join(class_dir, image_file), target_size=target_size
            )
            images.append(keras.preprocessing.image.img_to_array(image))
            labels.append(label)
    return np.array(images), np.array(labels)

In [None]:
# Root folder of the dataset; load_images treats each sub-folder as one class
images_path = "path/to/hand_gestures"
images, labels = load_images(image_path=images_path)

In [None]:
# Scale the pixel values by 1/255, writing the result back into the same array.
# Caution: the array is modified in place, so running this cell a second time
# divides the values again.
np.divide(images, 255.0, out=images);

In [None]:
# Hold out 10% of the samples for testing; the fixed seed makes the split repeatable
train_images, test_images, train_labels, test_labels = train_test_split(
    images,
    labels,
    test_size=0.1,
    random_state=42,
)


In [None]:
# Converting labels to one-hot encoding
# load_images labels every sample with its directory name, while
# to_categorical needs integer class indices, so each name is first mapped to
# its position in the sorted list of class names.
# The class list is built from the full label array so that a class that
# happens to be missing from the training split still gets its own column.
class_names = np.unique(labels)
num_classes = len(class_names)

# Only 1-D name arrays are encoded, so re-running this cell leaves labels that
# are already one-hot untouched.
if train_labels.ndim == 1:
    train_labels = keras.utils.to_categorical(
        np.searchsorted(class_names, train_labels), num_classes
    )
    test_labels = keras.utils.to_categorical(
        np.searchsorted(class_names, test_labels), num_classes
    )

In [None]:
# Building the Vision Transformer model

def build_vit_model(image_size, num_classes):
    """Assemble the Vision Transformer classifier as a Keras functional model.

    The input image is passed through `Patches` and `PatchEncoder`, then
    through `transformer_layers` encoder blocks. Each block applies layer
    normalisation followed by self-attention, and layer normalisation followed
    by an MLP, with a residual connection around each half. The normalised
    sequence is flattened and fed to an MLP head and a final dense layer.

    Inputs must already be scaled by 1/255: the notebook divides the pixel
    array by 255 right after loading, so the model does not rescale again.

    NOTE(review): `Patches`, `PatchEncoder`, `mlp` and the hyperparameters
    `patch_size`, `num_patches`, `projection_dim`, `transformer_layers`,
    `num_heads`, `transformer_units` and `mlp_head_units` are read from the
    notebook's global scope. None of them is defined in the visible cells, so
    they have to be defined in a cell that runs before this function is called.
    `num_patches` is not derived from `image_size` here; presumably it has to
    match the number of patches produced for this image size -- confirm.

    Args:
        image_size: Height and width of the (square, 3-channel) input images.
        num_classes: Number of units in the output layer.

    Returns:
        An uncompiled `keras.Model`. The output layer has no activation, so it
        emits logits and the loss must be created with `from_logits=True`.
    """
    inputs = layers.Input(shape=(image_size, image_size, 3))

    # No Rescaling layer: the data pipeline already divides the pixels by 255,
    # and rescaling again would squeeze the inputs into [0, 1/255].
    # The former Normalization layer is dropped too: it was never adapted, so
    # it had no dataset statistics to normalise with.

    # Creating patches
    patches = Patches(patch_size)(inputs)
    # Encode patches
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Creating multiple transformer layers
    for _ in range(transformer_layers):
        # Self-attention half: normalise, attend, add the block input back.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1)(x1, x1)
        x2 = layers.Add()([attention_output, encoded_patches])
        # MLP half: normalise, transform, add the attention result back.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        encoded_patches = layers.Add()([x3, x2])

    # Flattening the encoded sequence into one feature vector per image
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)

    # Adding MLP
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)

    # Classifying outputs (logits, no softmax)
    outputs = layers.Dense(num_classes)(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

# Instantiating the model
# The image size is read from the loaded data and the class count from the
# one-hot encoding step, so the input and output layers always match the arrays
# the model is trained on.
model = build_vit_model(image_size=train_images.shape[1], num_classes=num_classes)

# Compiling the model
# The labels are one-hot encoded, so the categorical (not the sparse) loss is
# required; from_logits=True because the output layer has no softmax.
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss=keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Defining callbacks (e.g., early stopping, model checkpointing)
# EarlyStopping watches the validation loss by default, so training must be
# given validation data.
callbacks = [
    keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
]

# Summary of the model to verify the architecture
model.summary()

In [None]:
# Training the model
# The notebook holds the data as NumPy arrays (train_images / train_labels), so
# they are passed to fit directly. The last 10% of the training arrays is set
# aside as validation data for the early-stopping callback.
history = model.fit(
    train_images,
    train_labels,
    validation_split=0.1,
    epochs=100,
    callbacks=callbacks,
)


In [None]:
# Evaluating the model performance
# Evaluation uses the held-out arrays produced by the train/test split.
test_loss, test_accuracy = model.evaluate(test_images, test_labels)
print(f"Test accuracy: {test_accuracy * 100:.2f}%")


In [None]:
# Optional: write the trained model to a file in the working directory
model.save(filepath='vit_model.h5')