In [None]:
#First, import the libraries
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

In [None]:
#Load the sample picture, resized to 144x144 on load; the bare name on the last line displays it
image = tf.keras.preprocessing.image.load_img("portrait.jpg" , target_size = (144,144))
image

In [None]:
#Convert the loaded image into a numeric array and check its shape
imageArray = tf.keras.preprocessing.image.img_to_array(image)
print(imageArray.shape)

In [None]:
#tf.image.extract_patches works on batches of images, so prepend a batch axis of size 1
imageArray = np.expand_dims(imageArray, axis=0)
print(imageArray.shape)

In [None]:
#Cut the image into non-overlapping 16x16 patches (stride equals the patch size)
patch_window = [1, 16, 16, 1]
patches = tf.image.extract_patches(
    imageArray,
    sizes=patch_window,
    strides=patch_window,
    rates=[1, 1, 1, 1],
    padding="VALID",
)
print(patches.shape)

In [None]:
#Collapse the patch grid into one sequence axis: (batch, number of patches, 16*16*3)
flat_patch_len = 16 * 16 * 3
patches = tf.reshape(patches, (tf.shape(patches)[0], -1, flat_patch_len))

In [None]:
#Shape of the flattened patch sequence
print(patches.shape)


In [None]:
#Show every patch of the first image side by side in a single row
n = int(np.sqrt(patches.shape[1]))
num_columns = n * n
plt.figure(figsize=(num_columns, 1))
for i, patch in enumerate(patches[0]):
    ax = plt.subplot(1, num_columns, i + 1)
    #Each flattened patch goes back to 16x16 RGB before it is drawn
    patch_img = tf.reshape(patch, (16 , 16 , 3))
    ax.imshow(patch_img.numpy().astype("uint8"))
    ax.axis("off")


In [None]:
#DEFINING A CLASS TO DO PATCH EMBEDDING AUTOMATICALLY
class PatchEmbedding(tf.keras.layers.Layer):
  """Turn a batch of images into a sequence of embedded patch tokens.

  The image is cut into non-overlapping size x size patches, each patch is
  flattened and linearly projected to projection_dim, a learnable CLS token
  is prepended, and a learned positional embedding is added.

  Args:
    size: side length of a square patch, in pixels.
    num_of_patches: number of patches one image produces (CLS token excluded).
    projection_dim: width of every output token.
  """
  def __init__(self, size , num_of_patches , projection_dim):
    super().__init__()
    self.size = size
    #+1 because the CLS token occupies a position of its own
    self.num_of_patches = num_of_patches + 1
    self.projection_dim = projection_dim
    self.projection = tf.keras.layers.Dense(projection_dim)
    #Created through add_weight so Keras registers the token as a trainable weight of this layer
    self.clsToken = self.add_weight(
        name = "cls_token",
        shape = (1 , 1 , projection_dim),
        initializer = tf.keras.initializers.GlorotNormal(),
        trainable = True,
    )
    self.positionalEmbedding = tf.keras.layers.Embedding(self.num_of_patches , projection_dim)


  def call(self , inputs):
    """Embed inputs of shape (batch, height, width, channels).

    Returns a tensor of shape (batch, num_of_patches + 1, projection_dim);
    the image must yield exactly num_of_patches patches for the positional
    embedding to line up.
    """
    batch_size = tf.shape(inputs)[0]
    #extracting patches
    patches = tf.image.extract_patches(inputs, sizes = [1 , self.size , self.size , 1] , strides = [1 , self.size , self.size , 1] , rates = [1 , 1 , 1 , 1] , padding = "VALID")
    #make 1D patches. The flattened length (size * size * channels) is read from
    #the extracted patches, so grayscale or multi-channel images work as well as RGB
    patch_dim = patches.shape[-1]
    if patch_dim is None:
      patch_dim = tf.shape(patches)[-1]
    patches = tf.reshape(patches , (batch_size , -1 , patch_dim))
    #project the patches with "tf.keras.layers.Dense"
    patches = self.projection(patches)

    #one copy of the CLS token per image, placed in front of the patch tokens
    clsToken = tf.repeat(self.clsToken , batch_size , axis = 0)
    patches = tf.concat((clsToken , patches) , axis = 1)
    #positions 0 .. num_of_patches - 1 (CLS included), with a leading axis for broadcasting over the batch
    positions = tf.range(0 , self.num_of_patches , 1)[tf.newaxis , ...]
    #adding positions to vectors
    positionalEmbedding = self.positionalEmbedding(positions)
    patches = patches + positionalEmbedding
    return patches

In [None]:
#A 144x144 image cut into 16x16 patches gives 9 * 9 = 81 patches
embedding = PatchEmbedding(size = 16 , num_of_patches = 81 , projection_dim = 128)

In [None]:
#Smoke test: embed a random batch of 32 images of 144x144x3
result = embedding (tf.random.normal(shape = (32,144,144,3)))

In [None]:
#Expected (32, 82, 128): 81 patch tokens plus the CLS token, each 128 wide
print(result.shape)

In [None]:
#Transformer layer
class TransformerLayer(tf.keras.layers.Layer):
  """One encoder block: layer-norm + self-attention, then layer-norm + MLP, each wrapped in a residual.

  Args:
    d_model: token width; also the output width of the MLP.
    heads: number of attention heads (each head gets d_model // heads dims).
    mlp_rate: multiplier for the hidden width of the MLP (d_model * mlp_rate).
    dropout_rate: dropout used inside attention and after each MLP layer.
  """
  def __init__ (self, d_model , heads , mlp_rate , dropout_rate = 0.1):
    super().__init__()

    head_dim = d_model // heads
    hidden_units = d_model * mlp_rate

    self.layernorm_1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
    self.mha = tf.keras.layers.MultiHeadAttention(heads , head_dim , dropout = dropout_rate)

    self.layernorm_2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
    mlp_layers = [
        tf.keras.layers.Dense(hidden_units , activation = "gelu"),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(d_model , activation = "gelu"),
        tf.keras.layers.Dropout(dropout_rate),
    ]
    self.mlp = tf.keras.Sequential(mlp_layers)

  def call(self, inputs, training = True):
    """Apply the block to inputs; training defaults to True, which keeps dropout active unless overridden."""
    #attention sub-block: the normalized tokens serve as both query and value
    normed_tokens = self.layernorm_1(inputs)
    attended = inputs + self.mha(normed_tokens , normed_tokens , training = training)

    #feed-forward sub-block on top of the attention result
    normed_attended = self.layernorm_2(attended)
    return attended + self.mlp(normed_attended , training = training)

In [None]:
#make a transformer encoder with transformer layers
class TransformerEncoder(tf.keras.layers.Layer):
  """A stack of num_layers TransformerLayer blocks applied one after another."""
  def __init__ (self, d_model , heads , mlp_rate , num_layers=1  , dropout_rate=0.1 ):
    super().__init__()
    blocks = []
    for _ in range(num_layers):
      blocks.append(TransformerLayer(d_model , heads , mlp_rate , dropout_rate))
    self.encoders = blocks

  def call(self , inputs , training = True):
    """Feed inputs through every block in order and return the last block's output."""
    hidden = inputs
    for block in self.encoders:
      hidden = block(hidden , training = training)
    return hidden

In [None]:
class ViT(tf.keras.Model):
  """Vision Transformer classifier: patch embedding -> transformer encoder -> MLP head on the CLS token.

  Args:
    num_classes: number of output classes (width of the softmax layer).
    patch_size: side length of a square patch, in pixels.
    num_of_patches: number of patches per image (CLS token excluded).
    d_model: token width used throughout the model.
    heads: attention heads per encoder block.
    num_layers: number of encoder blocks.
    mlp_rate: hidden-width multiplier for the MLPs (mlp_rate * d_model).
    dropout_rate: dropout inside the encoder; the head uses a fixed 0.3.
  """
  def __init__(self, num_classes, patch_size, num_of_patches, d_model , heads , num_layers , mlp_rate , dropout_rate=0.1):
    super().__init__()

    self.patchEmbedding = PatchEmbedding(
        size = patch_size ,
        num_of_patches = num_of_patches ,
        projection_dim = d_model ,
    )
    self.encoder = TransformerEncoder(
        d_model = d_model ,
        heads = heads ,
        mlp_rate = mlp_rate ,
        num_layers = num_layers ,
        dropout_rate = dropout_rate ,
    )
    head_layers = [
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(mlp_rate*d_model , activation = "gelu"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes , activation = "softmax"),
    ]
    self.prediction = tf.keras.Sequential(head_layers)

  def call(self, inputs , training = True):
    """Return class probabilities of shape (batch, num_classes); training defaults to True."""
    tokens = self.patchEmbedding(inputs)
    encoded = self.encoder(tokens , training = training)

    #only the CLS token (sequence position 0) is passed to the classification head
    cls_features = encoded[: , 0 , :]
    return self.prediction(cls_features , training = training)

In [None]:

#NOTE (translated from the original comment): "it gives an error here!!!!!!!" - TODO confirm whether this still reproduces
vitClassifier = ViT(
    num_classes = 100,
    patch_size = 16,
    num_of_patches = 81,
    d_model = 128,
    heads = 2,
    num_layers = 4,
    mlp_rate = 2,
    dropout_rate = 0.1,
)

In [None]:
#Smoke test on a random batch; ViT.call declares training = True as its default and no flag is passed here
vitClassifier(tf.random.normal(shape = (32 , 144 , 144 , 3)))

In [None]:
#Load the CIFAR-10 train/test splits as (images, labels) pairs
(x_train , y_train) , (x_test , y_test) = tf.keras.datasets.cifar10.load_data()

In [None]:
#Shape of the training images
print(x_train.shape)

In [None]:
#Deterministic preprocessing applied to every image: normalize, then resize to 72x72
preprocessingModel = tf.keras.Sequential([
                                          tf.keras.layers.Normalization(),
                                          tf.keras.layers.Resizing(72,72),

])
#adapt() fits the Normalization layer's statistics on the training images
preprocessingModel.layers[0].adapt(x_train)
#Random augmentation; uses the same tf.keras.layers namespace as the layers above
#instead of the deprecated tf.keras.layers.experimental.preprocessing aliases
augmentationModel = tf.keras.Sequential([
                                         tf.keras.layers.RandomFlip("horizontal"),
                                         tf.keras.layers.RandomRotation(factor = 0.2),
                                         tf.keras.layers.RandomZoom(width_factor = 0.2, height_factor =0.2 ),

])

In [None]:
from tensorflow.python.data.ops.dataset_ops import AUTOTUNE
def convert_to_dataset(data, batch_size, shuffle = False, augment = False):
  """Build a batched, prefetched tf.data pipeline from in-memory arrays.

  Args:
    data: (images, labels) tuple passed to tf.data.Dataset.from_tensor_slices.
    batch_size: samples per batch; an incomplete final batch is dropped.
    shuffle: when True, shuffle all samples before batching.
    augment: when True, run augmentationModel in training mode on every batch.

  Returns:
    A tf.data.Dataset yielding (preprocessed_images, labels) batches.
  """
  def _prepare(images, labels):
    #The Keras models receive a real batch here, so no batch axis has to be
    #added implicitly and stripped again with [0]
    images = preprocessingModel(images)
    if augment:
      images = augmentationModel(images , training = True)
    return images, labels

  dataset = tf.data.Dataset.from_tensor_slices(data)
  if shuffle:
    #Shuffle the raw samples, so the full-size buffer holds the original images
    #rather than the resized float ones
    dataset = dataset.shuffle(len(dataset))
  dataset = dataset.batch(batch_size , drop_remainder = True)
  dataset = dataset.map(_prepare , num_parallel_calls = tf.data.AUTOTUNE)
  return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
#Training pipeline is shuffled and augmented; the validation pipeline is neither
trainingData = convert_to_dataset(
    (x_train , y_train),
    batch_size = 1024,
    shuffle = True,
    augment = True,
)
valData = convert_to_dataset(
    (x_test , y_test),
    batch_size = 1024,
    shuffle = False,
    augment = False,
)

In [None]:
#print(trainingData)
#print(valData)

In [None]:
#Locate the TPU cluster, connect this runtime to it, then initialize the TPU system
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
#List the logical TPU devices TensorFlow can see
print("All devices: ", tf.config.list_logical_devices('TPU'))

In [None]:
#Distribution strategy built on the resolver; the model is later created inside its scope
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
#Rebuild the input pipelines. The training pipeline is bound to trainingData
#(it was misspelled trainingDATA), which is the name the fit call reads
trainingData = convert_to_dataset((x_train , y_train) , 1024 , shuffle = True , augment=True)
valData = convert_to_dataset ((x_test , y_test) , 1024 , shuffle = False , augment= False)


In [None]:
from os import name
#Model and metrics are created inside the strategy scope
with strategy.scope():
  vitClassifier = ViT(
      num_classes = 10,
      patch_size = 6,
      #72x72 inputs cut into 6x6 patches
      num_of_patches = (72//6)**2,
      d_model = 128,
      heads = 2,
      num_layers = 4,
      mlp_rate = 2,
      dropout_rate = 0.1,
  )
  tracked_metrics = [
      tf.keras.metrics.SparseCategoricalAccuracy(name = "accuracy"),
      tf.keras.metrics.SparseTopKCategoricalAccuracy(name = "top_5_accuracy"),
  ]
  vitClassifier.compile(
      loss = tf.keras.losses.SparseCategoricalCrossentropy(),
      optimizer = "adam",
      metrics = tracked_metrics,
  )

In [None]:
#trainingData is already batched by convert_to_dataset; Keras documents that
#batch_size must not be passed when the input is a tf.data.Dataset
vitClassifier.fit(trainingData, validation_data = valData , epochs = 20)