In [2]:
!pip install einops

Collecting einops
  Downloading https://files.pythonhosted.org/packages/5d/a0/9935e030634bf60ecd572c775f64ace82ceddf2f504a5fd3902438f07090/einops-0.3.0-py2.py3-none-any.whl
Installing collected packages: einops
Successfully installed einops-0.3.0


In [3]:
#구현하는 모델에서 쓰이는 모든 activation함수는 정의하여 드린 GELU 함수를 사용해야함.
#MultiHeadAttention에서 Head로 나눌때, 이미지를 patch로자른후 sequence로 만들때 Rearrange함수를 사용하면 편리함.(사용하지 않으셔도 됩니다)
#CIFAR10에 대한 test accuracy가 60프로 이상인 ViT모델을 만드시오.
import tensorflow as tf
from einops.layers.tensorflow import Rearrange
from tensorflow.keras.activations import gelu
GELU = lambda x : gelu(x)

In [4]:
#논문[1]에서 설명하는 MultiHeadAttention을 만들어라.
class MultiHeadedAttention(tf.keras.Model):
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension)
    def __init__(self, dimension, heads=8):
        super(MultiHeadedAttention, self).__init__()
        ############Write your code Here############ 
        
        self.heads=heads # head의 개수 
        self.scales=dimension**-0.5 #분모(scale 역할)
        
        self.vkq=tf.keras.layers.Dense(dimension*3,use_bias=False) ## value key query
        self.out=tf.keras.layers.Dense(dimension)
        
        self.rearrange_vkq=Rearrange('b n (vkq h d) -> vkq b h n d',vkq=3,h=self.heads)
        
        self.rearrange_out=Rearrange('b h n d -> b n (h d)')
        
        ############################################
    def call(self, inputs):
        output = None
        ############Write your code Here############
        
        vkq=self.vkq(inputs)
        vkq=self.rearrange_vkq(vkq)
        
        v=vkq[2];k=vkq[1];q=vkq[0]
        
        ## Attention(Q,K,V)=softmax(QKt/√dk)*V       

        qk_matrix=tf.einsum('abid,abjd->abij',q,k)*self.scales 
        softmax=tf.nn.softmax(qk_matrix,axis=-1)        
        output=tf.einsum('abij,abjd->abid',softmax,v)
        
        
        output=self.rearrange_out(output)
        output=self.out(output)
 
        ############################################
        return output

#인자로 받은 residual_function을 사용하여 real_function값을 return하여주는 Class를 만들어라.(call함수 참고)
class ResidualBlock(tf.keras.Model):
    def __init__(self, residual_function):
        super(ResidualBlock, self).__init__()
        ############Write your code Here############
        
        self.residual_function=residual_function
        
        ############################################

    def call(self, inputs):
        return self.residual_function(inputs) + inputs

#인자로 받은 normfunction에 들어가기전에 LayerNormalization을 해주는 Class를 만들어라.(call함수 참고)
class NormalizationBlock(tf.keras.Model):
    def __init__(self, norm_function, epsilon=1e-5):
        super(NormalizationBlock, self).__init__()
        ############Write your code Here############
         
        self.normalize=tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.norm_function=norm_function
        
        ############################################

    def call(self, inputs):
        return self.norm_function(self.normalize(inputs))

#논문[1]에서의 MLPBlock을 만들어라.
class MLPBlock(tf.keras.Model):
    #output_dimension - MLPBlock의 output dimension
    #hidden_dimension - MLPBlock의 hidden layer dimension
    def __init__(self, output_dimension, hidden_dimension):
        super(MLPBlock, self).__init__()
        ############Write your code Here############
        
        self.linear=tf.keras.Sequential([tf.keras.layers.Dense(hidden_dimension,activation=gelu),
                                     tf.keras.layers.Dense(output_dimension)])
        
        ############################################

    def call(self, inputs):
        output = None
        ############Write your code Here############
        
        output=self.linear(inputs)
        
        ############################################
        return output

#논문[1]을 읽고 TransformerEncoder를 위에서 정의한 class들을 사용하여 만들어라.
class TransformerEncoder(tf.keras.Model):
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension), heads - MHA에서 head의 개수
    #depth - encoder layer의 개수, mlp_dimension - MLP block의 hidden layer의 dimension
    def __init__(self, dimension, depth, heads, mlp_dimension): 
        super(TransformerEncoder, self).__init__()
        layers_ = []
        for _ in range(depth):
            ############Write your code Here############ 
            
            layers_=layers_+[ResidualBlock(NormalizationBlock(MultiHeadedAttention(dimension,heads=heads)))]       
            layers_=layers_+[ResidualBlock(NormalizationBlock(MLPBlock(dimension,mlp_dimension)))]
            
            ############################################
        self.layers_ = tf.keras.Sequential(layers_)

    def call(self, inputs):
        return self.layers_(inputs)

#논문[2]를 읽고 ViT모델을 위에서 정의한 class들을 사용하여 만들어라.
class ImageTransformer(tf.keras.Model):
    #image_size - 이미지의 W==H의 크기(int), patch_size - 이미지를 쪼갤 patch의 크기(int)
    #n_classes - 최종 class의 개수, batch_size - 배치사이즈
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension), depth - encoder layer의 개수
    #heads - MHA에서 head의 개수, mlp_dimension - MLP block의 hidden layer의 dimension
    #channel - input image에 대한 channel의 수
    def __init__(
            self, image_size, patch_size, n_classes, batch_size,
            dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'invalid patch size for image size'

        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension
        self.batch_size = batch_size

        self.positional_embedding = self.add_weight(
            "position_embeddings", shape=[num_patches + 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        self.classification_token = self.add_weight(
            "classification_token", shape=[1, 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        ############Write your code Here############
        
        self.embedding=tf.keras.layers.Dense(dimension)
        
        self.rearrange=Rearrange('b c (h p_1) (w p_2) -> b (h w) (p_1 p_2 c)',p_1=self.patch_size,p_2=self.patch_size)
        
        self.transformer=TransformerEncoder(dimension,depth,heads,mlp_dimension)
        
        self.cls=tf.identity
        
        self.mlp=tf.keras.Sequential([tf.keras.layers.Dense(mlp_dimension,activation=gelu),tf.keras.layers.Dense(n_classes)])
        
        
        ############################################

    def call(self, inputs):
        output = None
        ############Write your code Here############
        
        output=self.rearrange(inputs)
        output=self.embedding(output)

        cls=tf.broadcast_to(self.classification_token,(tf.shape(inputs)[0],1,self.dimension))
        
        output=tf.concat((cls,output),axis=1)
        output=output+self.positional_embedding
        output=self.transformer(output)
        
        output=self.cls(output[:,0])
        output=self.mlp(output)
        
        ############################################
        return output

In [5]:
from tensorflow.keras import datasets
# Download and prepare the CIFAR10 dataset
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
# Normalize pixel values to be between 0 and 1
############Write your code Here############

train_images,test_images=train_images/255.0,test_images/255.0

############################################
# Make image shape (BS, H, W, C) to (BS, C, H, W)
############Write your code Here############

train_images=tf.transpose(train_images,perm=[0,3,1,2])
test_images=tf.transpose(test_images,perm=[0,3,1,2])

############################################

#Initialize your model
#Initialize optimizer and loss and compile it to the model
############Write your code Here############


config={"image_size":32,"patch_size":4,"n_classes":10,"batch_size":1,
        "dimension":64,"depth":3,"heads":4,"mlp_dimension":128,"channels":3}
    
model=ImageTransformer(**config)
model.compile(optimizer='adam',loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])
        
############################################

#Train your model
############Write your code Here############

model.fit(train_images,train_labels,epochs=10)

############################################
print('==============Training Finished===============')

#Evaluate your test samples
accuracy = 0
############Write your code Here############

_,accuracy=model.evaluate(test_images,test_labels)

############################################

print('Test Accuracy :', accuracy)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy : 0.6000999808311462
