#### FA23 DAAN570 - Deep Learning
#### Project: UAV Control using CNN and ViT Gesture Recognition
#### Model: Model 10 (vit-base-patch16-224-in21k)
##### Students: Aureo Zanon and Johnny Zielinski (Team 15)
##### Date: December 6th, 2023

In [None]:
# Loading the libraries

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import os
import time
import random
from PIL import Image
import albumentations as A
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
import matplotlib.cm as cm
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import (Layer, GlobalAveragePooling2D, Activation, MaxPooling2D, Add, Conv2D, 
                                     MaxPool2D, Dense, Flatten, InputLayer, BatchNormalization, Input, 
                                     Embedding, Permute, Dropout, RandomFlip, RandomRotation, LayerNormalization, 
                                     MultiHeadAttention, RandomContrast, Rescaling, Resizing, Reshape, LeakyReLU)
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers.legacy import Adam, SGD
from tensorflow.keras.callbacks import (Callback, CSVLogger, EarlyStopping, LearningRateScheduler,
                                        ModelCheckpoint, ReduceLROnPlateau)
from tensorflow.keras.regularizers import L2, L1
from tensorflow.train import BytesList, FloatList, Int64List, Example, Features, Feature
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import VGG16, VGG19, EfficientNetB0
from transformers import ViTFeatureExtractor, ViTForImageClassification,  MobileViTForImageClassification, MobileViTModel, MobileViTConfig, MobileViTFeatureExtractor, TFViTModel
import requests
from keras.applications import imagenet_utils
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow.keras.models import Model



In [None]:
# Central hyper-parameter table shared by the notebook cells
# (values are looked up by key, e.g. CONFIGURATION["IM_SIZE"]).

CONFIGURATION = {
    # --- input pipeline ---
    "BATCH_SIZE": 32,
    "IM_SIZE": 224,

    # --- optimisation ---
    "LEARNING_RATE": 1e-3,
    "N_EPOCHS": 20,
    "DROPOUT_RATE": 0.0,
    "REGULARIZATION_RATE": 0.0,

    # --- convolutional / dense layer sizes ---
    "N_FILTERS": 6,
    "KERNEL_SIZE": 3,
    "N_STRIDES": 1,
    "POOL_SIZE": 2,
    "N_DENSE_1": 1024,
    "N_DENSE_2": 128,

    # --- classification target ---
    "NUM_CLASSES": 9,

    # --- transformer-specific sizes ---
    "PATCH_SIZE": 32,
    "PROJ_DIM": 768,

    # --- gesture labels (one entry per class) ---
    "CLASS_NAMES": [
        "up",
        "pointer_r",
        "pointer_l",
        "pointer_f",
        "palm_u",
        "palm_m",
        "palm",
        "ele",
        "down",
    ],
}

# Original 12-class setup, kept for reference:
# "NUM_CLASSES_ORIGINAL": 12,
# "CLASS_NAMES_ORIGINAL": ["up", "pointer_r", "pointer_l",
# "pointer_f", "pointer_b", "pointer", "palm_u",
# "palm_o", "palm_m", "palm", "ele", "down"],



In [None]:
# Root folder of the image dataset; it is handed to flow_from_directory
# when the training and validation generators are created.

base_path = "//Users//aureozanon//Documents//DAAN570//Project//HG_Data//"

In [None]:
# Loading the datasets
#
# Augmenting image generator used to read the dataset from disk.
#
# NOTE: no `rescale=1./255` here on purpose. The model's own preprocessing
# stack (`resize_rescale_hf`) already applies Rescaling(1./255); rescaling in
# the generator as well would divide the pixels by 255 twice and feed the ViT
# values in [0, 1/255] instead of [0, 1].

train_datagen = ImageDataGenerator(rotation_range=20,        # random rotation, in degrees
                                   width_shift_range=0.2,    # horizontal shift, fraction of width
                                   height_shift_range=0.2,   # vertical shift, fraction of height
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True,
                                   fill_mode='nearest',      # how pixels exposed by a transform are filled
                                   validation_split=0.1)  # 10% for validation due to the large dataset


In [None]:
# Creating the Training and Validation generators
#
# NOTE(review): flow_from_directory assigns class indices from the
# sub-directory names in alphanumeric order unless `classes=` is passed;
# confirm that this matches the order of CONFIGURATION["CLASS_NAMES"]
# wherever predictions are decoded.

# Training subset: augmented by train_datagen and reshuffled every epoch.
train_generator = train_datagen.flow_from_directory(
    base_path,
    target_size=(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"]),
    batch_size=CONFIGURATION["BATCH_SIZE"],
    class_mode='categorical',
    subset='training',
    shuffle=True,
    seed=74,
    )

# Validation subset: read through a separate generator that performs NO
# random augmentation, so validation metrics are measured on undistorted
# images. It copies train_datagen's `rescale` setting so both subsets are
# scaled identically, and it must use the same validation_split (0.1) as
# train_datagen so the two subsets stay disjoint.
val_datagen = ImageDataGenerator(rescale=train_datagen.rescale,
                                 validation_split=0.1)

validation_generator = val_datagen.flow_from_directory(
    base_path,
    target_size=(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"]),
    batch_size=CONFIGURATION["BATCH_SIZE"],
    class_mode='categorical',
    subset='validation',
    # Fixed order keeps batches aligned with validation_generator.classes,
    # which matters when predictions are compared against the true labels.
    shuffle=False,
    )

In [None]:
# Data Augmentation

### Keras preprocessing layers chained into one augmentation pipeline:
### small random rotation -> random horizontal flip -> random contrast.
augment_layers = tf.keras.Sequential()
augment_layers.add(RandomRotation(factor = (-0.025, 0.025)))
augment_layers.add(RandomFlip(mode='horizontal'))
augment_layers.add(RandomContrast(factor=0.1))

def augment_layer(image, label):
  """Run `image` through `augment_layers` in training mode; `label` is returned untouched."""
  augmented = augment_layers(image, training = True)
  return augmented, label



In [None]:
# Cutmix Augmentation Exploration

def box(lamda):
  """Sample the CutMix patch for a square image of side CONFIGURATION["IM_SIZE"].

  A centre point is drawn uniformly over the image and a square of side
  IM_SIZE * sqrt(1 - lamda) is placed around it; the square is then clipped
  to the image borders.

  Args:
    lamda: scalar mixing ratio; larger values produce a smaller patch.

  Returns:
    (r_y, r_x, r_h, r_w): top-left row, top-left column, height and width of
    the clipped patch. Height and width are always at least 1 and the patch
    always lies inside the image, so the result can be passed directly to
    tf.image.crop_to_bounding_box / tf.image.pad_to_bounding_box.
  """
  im_size = CONFIGURATION["IM_SIZE"]

  # Centre of the patch.
  c_x = tf.cast(tfp.distributions.Uniform(0, im_size).sample(1)[0], dtype = tf.int32)
  c_y = tf.cast(tfp.distributions.Uniform(0, im_size).sample(1)[0], dtype = tf.int32)

  # Unclipped side length (width and height use the same formula).
  side = tf.cast(im_size*tf.math.sqrt(1-lamda), dtype = tf.int32)
  half = side//2

  # Top-left corner; capped at im_size - 1 so it is always a valid pixel index.
  r_x = tf.clip_by_value(c_x - half, 0, im_size - 1)
  r_y = tf.clip_by_value(c_y - half, 0, im_size - 1)

  # Bottom-right corner, measured from the CENTRE. Measuring it from the
  # already shifted top-left corner would halve the patch in each dimension.
  x_b_r = tf.clip_by_value(c_x + half, 0, im_size)
  y_b_r = tf.clip_by_value(c_y + half, 0, im_size)

  # Degenerate patches are widened to one pixel so cropping never gets a
  # zero-sized box.
  r_w = tf.maximum(x_b_r - r_x, 1)
  r_h = tf.maximum(y_b_r - r_y, 1)

  return r_y, r_x, r_h, r_w



def cutmix(train_dataset_1, train_dataset_2):
  """Blend two (image, label) samples CutMix-style.

  A patch chosen by `box` is cut out of the second image and pasted over the
  same location in the first image. The returned label is a weighted sum of
  both labels, weighted by the fraction of image area each sample contributes.

  Args:
    train_dataset_1: (image, label) pair that supplies the background.
    train_dataset_2: (image, label) pair that supplies the pasted patch.

  Returns:
    (image, label): the mixed image and the mixed label (float32).
  """
  image_1, label_1 = train_dataset_1
  image_2, label_2 = train_dataset_2
  side = CONFIGURATION["IM_SIZE"]

  # The sampled ratio only drives the patch size; the label weight is
  # recomputed further down from the patch that was actually used.
  sampled_ratio = tfp.distributions.Beta(2,2).sample(1)[0]
  r_y, r_x, r_h, r_w = box(sampled_ratio)

  # Patch of image 2, zero-padded back to full image size.
  patch_2 = tf.image.crop_to_bounding_box(image_2, r_y, r_x, r_h, r_w)
  canvas_2 = tf.image.pad_to_bounding_box(patch_2, r_y, r_x, side, side)

  # The same region of image 1, used to blank it out before pasting.
  patch_1 = tf.image.crop_to_bounding_box(image_1, r_y, r_x, r_h, r_w)
  canvas_1 = tf.image.pad_to_bounding_box(patch_1, r_y, r_x, side, side)

  image = image_1 - canvas_1 + canvas_2

  # Share of the image area that still belongs to sample 1.
  keep_1 = tf.cast(1- (r_w*r_h)/(side*side), dtype = tf.float32)
  label_1_f = tf.cast(label_1, dtype = tf.float32)
  label_2_f = tf.cast(label_2, dtype = tf.float32)
  label = keep_1*label_1_f + (1-keep_1)*label_2_f

  return image, label


In [None]:
# Resizing and rescaling
# Preprocessing stack placed in front of the HuggingFace ViT:
#   1. resize to IM_SIZE x IM_SIZE
#   2. multiply pixel values by 1/255
#   3. move the channel axis in front of the two spatial axes
resize_rescale_hf = tf.keras.Sequential()
resize_rescale_hf.add(Resizing(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"]))
resize_rescale_hf.add(Rescaling(1./255))
resize_rescale_hf.add(Permute((3,1,2)))


# Vision Transformer Model




# HuggingFace ViT

In [None]:
# Building the base model: pretrained ViT backbone + softmax classification head

from transformers import ViTFeatureExtractor, TFViTModel


base_model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
# Alternative backbone explored earlier (MobileViT), left disabled:
# feature_extractor = MobileViTFeatureExtractor.from_pretrained('Matthijs/mobilevit-small')
# base_model = MobileViTForImageClassification.from_pretrained('Matthijs/mobilevit-small')

image_shape = (CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"], 3)
inputs = Input(shape = image_shape)

# Resize / rescale / channel permutation before the backbone.
x = resize_rescale_hf(inputs)

# Element [0] of the backbone output is the token sequence; keep only the
# first token of every sample as the image representation.
x = base_model.vit(x)[0]
x = x[:,0,:]
#print(x)

# One softmax unit per gesture class.
classifier_head = Dense(CONFIGURATION["NUM_CLASSES"], activation = 'softmax')
output = classifier_head(x)

hf_model = tf.keras.Model(inputs=inputs, outputs=output)

In [None]:
# Getting Attention Maps
# A second copy of the pretrained backbone, configured to return its
# attention tensors, wrapped in a Keras model whose output is those tensors.

from transformers import ViTFeatureExtractor, TFViTModel, ViTConfig

# Default ViT configuration with attention outputs switched on.
configuration = ViTConfig()
configuration.output_attentions = True

base_model = TFViTModel.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    config = configuration,
    )

image_shape = (CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"], 3)
inputs = Input(shape = image_shape)
x = resize_rescale_hf(inputs)
x = base_model.vit(x)['attentions']

model = tf.keras.Model(inputs=inputs, outputs=x)


In [None]:
# Settings to train the HF model
# The generators use class_mode='categorical' (one-hot labels), hence the
# categorical cross-entropy; the sparse variant is left here for reference.

loss_function = tf.keras.losses.CategoricalCrossentropy()
#loss_function = SparseCategoricalCrossentropy()



In [None]:
# Draw the classifier (hf_model). `model` at this point is the
# attention-extraction model, not the network being trained.
plot_model(hf_model, to_file='ViT_Model9_AZ.png', show_shapes=True, show_layer_names=True)

In [None]:
# Compiling the model
model.compile(optimizer=SGD(learning_rate=0.001, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'], run_eagerly=True)

# model compiled using the SGD optimizer, with learning rate of 0.001 and momentum of 0.9 (tried other combinations in previous models)
# used the categorical_crossentropy loss and set accuracy as the metrics.

In [None]:
# Training the new model
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=10
)