<a href="https://colab.research.google.com/github/MorningStarTM/Transformers-in-Vision/blob/main/ViT_for_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install patchify

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting patchify
  Downloading patchify-0.2.3-py3-none-any.whl (6.6 kB)
Installing collected packages: patchify
Successfully installed patchify-0.2.3


In [3]:
import os
import numpy as np
import matplotlib.pyplot as plt
import cv2
import random
from glob import glob
import tensorflow as tf
from sklearn.utils import shuffle
from patchify import patchify
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [4]:
#Hyperparameter
hp = {}
hp['image_size'] = 200
hp['num_channel'] = 3
hp['patch_size'] = 25
hp['num_patches'] = (hp['image_size']**2) // (hp['patch_size']**2)
hp['flat_patches_shape'] = (hp['num_patches'], hp['patch_size']*hp['patch_size']*hp['num_channel'])

hp['batch_size'] = 32
hp['lr'] = 1e-4
hp['num_epochs'] = 500
hp['num_classes'] = 9
hp['class_names'] = ["Ant", "Butterfly", "Cockroach", "Frog", "Grasshopper", "Honey bee", "Spider", "dragonfly", "lizard"]

In [None]:
#!unzip "/content/drive/MyDrive/archive podiwije.zip" -d "/content/drive/MyDrive/DataSet/Insects/"

In [5]:
dataset_path = "/content/drive/MyDrive/DataSet/Insects/Reptiles-Insects"

In [6]:
def create_dir(path):
  if not os.path.exists(path):
    os.makedirs(path)

In [7]:
#function for load the data file
def load_data(path, split=0.1):
  images = shuffle(glob(os.path.join(path, "*", "*.jpg")))
  
  split_size = int(len(images) * split)
  #split the data 
  train_x, valid_x = train_test_split(images, test_size=split_size, random_state=42)
  train_x, test_x = train_test_split(train_x, test_size=split_size, random_state=42)

  return train_x, valid_x, test_x

In [8]:
def process_image_label(path):
  path = path.decode()
  #read image
  image = cv2.imread(path, cv2.IMREAD_COLOR)
  #resize the image
  image = cv2.resize(image, (hp['image_size'], hp['image_size']))
  #scale the image
  image = image/255.0
  print(image.shape)

  #image into patch
  patch_shape = (hp['patch_size'], hp['patch_size'], hp['num_channel'])
  patches = patchify(image, patch_shape, hp['patch_size'])

  patches = np.reshape(patches, hp['flat_patches_shape'])
  patches = patches.astype(np.float32)
  print(path)

  #labeling
  class_name = path.split('/')[-2]
  class_idx = hp['class_names'].index(class_name)
  class_idx = np.array(class_idx, dtype=np.int32)
  print(class_idx)

  return patches, class_idx

In [9]:
#we used opencv to read images not tensorflow. so we need to use tf.numpy_function to use these function in tensorflow
def parse(path):
  patches, labels = tf.numpy_function(process_image_label, [path], [tf.float32, tf.int32])
  labels = tf.one_hot(labels, hp['num_classes'])

  patches.set_shape(hp['flat_patches_shape'])
  labels.set_shape(hp['num_classes'])

  return patches, labels

In [10]:
def tf_dataset(images, batch=32):
  dataset = tf.data.Dataset.from_tensor_slices((images))
  dataset = dataset.map(parse).batch(batch).prefetch(8)
  return dataset

In [11]:
train_x, valid_x, test_x = load_data(dataset_path)

In [12]:
print(f"Train: {len(train_x)} Valid: {len(valid_x)} Test: {len(test_x)}")

Train: 680 Valid: 85 Test: 85


In [13]:
train_dataset = tf_dataset(train_x, batch=hp['batch_size'])
valid_dataset = tf_dataset(valid_x, batch=hp['batch_size'])

#Model

##Transformer Encoder

In [31]:
def mlp(inputs, config):
  inputs = Dense(config["mlp_dim"], activation='gelu')(inputs)
  inputs = Dropout(config["dropout_rate"])(inputs)
  inputs = Dense(config["hidden_dim"])(inputs)
  inputs = Dropout(config["dropout_rate"])(inputs)
  return inputs  

In [32]:
def transformer_encoder(inputs, config):
  skip_1 = inputs
  inputs = LayerNormalization()(inputs)
  inputs = MultiHeadAttention(
      num_heads=config["num_heads"],
      key_dim=config["hidden_dim"]
  )(inputs, inputs)
  inputs = Add()([inputs, skip_1])

  skip_2 = inputs
  inputs = LayerNormalization()(inputs)
  inputs = mlp(inputs, config)
  inputs = Add()([inputs, skip_2])

  return inputs

##Embedding

In [37]:
#Configuratin parameters
config = {}
config['num_layers'] = 12
config['hidden_dim'] = 768
config['mlp_dim'] = 3072
config['num_heads'] = 12
config['dropout_rate'] = 0.1
config['num_patches'] = 256
config['patch_size'] = 32
config['num_channels'] = 3
config["num_classes"] = 9

In [33]:
class ClassToken(Layer):
  def __init__(self):
    super().__init__()

  def build(self, input_shape):
    w_init = tf.random_normal_initializer()
    self.w = tf.Variable(
        initial_value = w_init(shape=(1, 1, input_shape[-1]), dtype=tf.float32), 
        trainable = True
    )

  def call(self, inputs):
    batch_size = tf.shape(inputs)[0]
    hidden_dim = self.w.shape[-1]

    #reshape
    cls = tf.broadcast_to(self.w, [batch_size, 1, hidden_dim])
    #change data type
    cls = tf.cast(cls, dtype=inputs.dtype)
    return cls

In [39]:
def ViT(config):
  #input layer
  input_shape = (config['num_patches'], config['patch_size']*config['patch_size']*config['num_channels'])
  inputs = Input(input_shape)
  
  #patch and position embedding
  patch_embedding = Dense(config['hidden_dim'])(inputs)
  
  positions = tf.range(start=0, limit=config['num_patches'], delta=1)
  position_embedding = Embedding(input_dim=config['num_patches'], output_dim=config['hidden_dim'])(positions)
  embed = patch_embedding + position_embedding
  
  #Adding class token
  token = ClassToken()(embed)
  concate = Concatenate(axis=1)([token, embed])
  for _ in range(config["num_layers"]):
    concate = transformer_encoder(concate, config)
  
  #Classification Head
  mlp_head_input = LayerNormalization()(concate)
  mlp_head_input = mlp_head_input[:, 0, :]
  mlp_head_input = Dense(config["num_classes"], activation="softmax")(mlp_head_input)

  model = Model(inputs, mlp_head_input)
  return model

In [40]:
model = ViT(config)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 256, 3072)]  0           []                               
                                                                                                  
 dense_56 (Dense)               (None, 256, 768)     2360064     ['input_8[0][0]']                
                                                                                                  
 tf.__operators__.add_7 (TFOpLa  (None, 256, 768)    0           ['dense_56[0][0]']               
 mbda)                                                                                            
                                                                                                  
 class_token_6 (ClassToken)     (None, 1, 768)       768         ['tf.__operators__.add_7[0][0