In [1]:
!pip install einops

Collecting einops
  Downloading https://files.pythonhosted.org/packages/5d/a0/9935e030634bf60ecd572c775f64ace82ceddf2f504a5fd3902438f07090/einops-0.3.0-py2.py3-none-any.whl
Installing collected packages: einops
Successfully installed einops-0.3.0


In [2]:
#구현하는 모델에서 쓰이는 모든 activation함수는 정의하여 드린 GELU 함수를 사용해야함.
#MultiHeadAttention에서 Head로 나눌때, 이미지를 patch로자른후 sequence로 만들때 Rearrange함수를 사용하면 편리함.(사용하지 않으셔도 됩니다)
#CIFAR10에 대한 test accuracy가 60프로 이상인 ViT모델을 만드시오.
import tensorflow as tf
from einops.layers.tensorflow import Rearrange
from tensorflow.keras.activations import gelu
GELU = lambda x : gelu(x)

In [11]:
#논문[1]에서 설명하는 MultiHeadAttention을 만들어라.
class MultiHeadedAttention(tf.keras.Model):
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension)
    def __init__(self, dimension, heads=8):
        super(MultiHeadedAttention, self).__init__()
        ############Write your code Here############
        self.heads = heads
        self.dimension = dimension

        self.rearrange_QKV = Rearrange('batch_size n (head dim) -> batch_size head n dim', head = self.heads)
        self.rearrange_output = Rearrange ('batch_size head n dim -> batch_size n (head dim)')

        self.wQ = tf.keras.layers.Dense(dimension)
        self.wK = tf.keras.layers.Dense(dimension)
        self.wV = tf.keras.layers.Dense(dimension)

        self.linear = tf.keras.layers.Dense(dimension)
        ############################################
    def call(self, inputs):
        output = None
        ############Write your code Here############
        batch_size = tf.shape(inputs)[0]
        
        # Linear Q, K, V + Rearrange Q, K, V 
        Q = self.rearrange_QKV (self.wQ (inputs))
        K = self.rearrange_QKV (self.wK (inputs))
        V = self.rearrange_QKV (self.wV (inputs))

        # Scaled Dot-Product Attention
        output = tf.matmul(Q, K, transpose_b = True) * (self.dimension ** (-1/2))
        output = tf.nn.softmax (output)
        output = tf.matmul (output, V)

        output = self.rearrange_output(output)
        output = self.linear (output)
        ############################################
        return output

#인자로 받은 residual_function을 사용하여 real_function값을 return하여주는 Class를 만들어라.(call함수 참고)
class ResidualBlock(tf.keras.Model):
    def __init__(self, residual_function):
        super(ResidualBlock, self).__init__()
        ############Write your code Here############
        self.residual_function = residual_function
        ############################################

    def call(self, inputs):
        return self.residual_function(inputs) + inputs

#인자로 받은 normfunction에 들어가기전에 LayerNormalization을 해주는 Class를 만들어라.(call함수 참고)
class NormalizationBlock(tf.keras.Model):
    def __init__(self, norm_function, epsilon=1e-5):
        super(NormalizationBlock, self).__init__()
        ############Write your code Here############
        self.norm_function = norm_function
        self.normalize = tf.keras.layers.LayerNormalization(epsilon=epsilon)
        ############################################

    def call(self, inputs):
        return self.norm_function(self.normalize(inputs))

#논문[1]에서의 MLPBlock을 만들어라.
class MLPBlock(tf.keras.Model):
    #output_dimension - MLPBlock의 output dimension
    #hidden_dimension - MLPBlock의 hidden layer dimension
    def __init__(self, output_dimension, hidden_dimension):
        super(MLPBlock, self).__init__()
        ############Write your code Here############
        self.linear1 = tf.keras.layers.Dense(hidden_dimension)
        self.linear2 = tf.keras.layers.Dense(output_dimension)
        ############################################

    def call(self, inputs):
        output = None
        ############Write your code Here############
        output = self.linear1 (inputs)
        output= GELU (output)
        output = self.linear2 (output)
        output= GELU (output)
        ############################################
        return output

#논문[1]을 읽고 TransformerEncoder를 위에서 정의한 class들을 사용하여 만들어라.
class TransformerEncoder(tf.keras.Model):
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension), heads - MHA에서 head의 개수
    #depth - encoder layer의 개수, mlp_dimension - MLP block의 hidden layer의 dimension
    def __init__(self, dimension, depth, heads, mlp_dimension): 
        super(TransformerEncoder, self).__init__()
        layers_ = []
        for _ in range(depth):
            ############Write your code Here############
            layers_ += [
                        ResidualBlock (
                            NormalizationBlock (
                                MultiHeadedAttention (dimension, heads)
                            )
                        ), 

                        ResidualBlock (
                            NormalizationBlock (
                                MLPBlock (dimension, mlp_dimension)
                            )
                        )
            ]
            ############################################
        self.layers_ = tf.keras.Sequential(layers_)

    def call(self, inputs):
        return self.layers_(inputs)

#논문[2]를 읽고 ViT모델을 위에서 정의한 class들을 사용하여 만들어라.
class ImageTransformer(tf.keras.Model):
    #image_size - 이미지의 W==H의 크기(int), patch_size - 이미지를 쪼갤 patch의 크기(int)
    #n_classes - 최종 class의 개수, batch_size - 배치사이즈
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension), depth - encoder layer의 개수
    #heads - MHA에서 head의 개수, mlp_dimension - MLP block의 hidden layer의 dimension
    #channel - input image에 대한 channel의 수
    def __init__(
            self, image_size, patch_size, n_classes, batch_size,
            dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'invalid patch size for image size'

        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension
        self.batch_size = batch_size

        self.positional_embedding = self.add_weight(
            "position_embeddings", shape=[num_patches + 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        self.classification_token = self.add_weight(
            "classification_token", shape=[1, 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        ############Write your code Here############
        self.rearrange_patch = Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)'
                                          , p1 = self.patch_size, p2 = self.patch_size)
        self.project = tf.keras.layers.Dense(dimension) 

        self.transformer = TransformerEncoder (dimension, depth, heads, mlp_dimension)

        self.mlp_head = tf.identity
        self.mlp = tf.keras.layers.Dense(mlp_dimension)
        
        self.linear = tf.keras.layers.Dense(n_classes)
        ############################################

    def call(self, inputs):
        output = None
        ############Write your code Here############
        batch_size = tf.shape(inputs)[0]

        # Linear Projection of Flattened Patches
        output = self.rearrange_patch (inputs)
        output = self.project (output)

        cls_tokens = tf.broadcast_to (
            self.classification_token,
            [batch_size, 1, self.dimension]
        )
        output = tf.concat ([cls_tokens, output], axis = 1)

        output += self.positional_embedding

        # Transformer Encoder
        output = self.transformer (output)
        output = self.mlp_head (output[:, 0])
        output = self.mlp (output)
            
        output = GELU (output)
        output = self.linear (output)
        ############################################
        return output

In [13]:
from tensorflow.keras import datasets
# Download and prepare the CIFAR10 dataset
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()

# Normalize pixel values to be between 0 and 1
############Write your code Here############
train_images = train_images / 255.0
test_images = test_images / 255.0
############################################

# Make image shape (BS, H, W, C) to (BS, C, H, W)
############Write your code Here############
train_images = tf.transpose (train_images, perm = [0, 3, 1, 2]) 
test_images = tf.transpose (test_images, perm = [0, 3, 1, 2])
############################################

#Initialize your model
#Initialize optimizer and loss and compile it to the model
############Write your code Here############
# one - hot encoding
train_labels = tf.one_hot (train_labels, 10)
train_labels = tf.squeeze (train_labels)
test_labels = tf.one_hot (test_labels, 10)
test_labels = tf.squeeze (test_labels)

# initiailize model
model = ImageTransformer(
            image_size=32, patch_size=4, n_classes=10, batch_size=64,
            dimension=64, depth=3, heads=4, mlp_dimension=128
        )

# initialize optimizer, loss, accuracy
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.CategoricalAccuracy('accuracy')

# compile model
model.compile (optimizer = optimizer, loss = loss, metrics = accuracy)
############################################

#Train your model
############Write your code Here############
model.fit(train_images, train_labels, batch_size=64, epochs=15)

# validation set
#val_images = train_images[:10000]
#train_images = train_images[10000:]
#val_labels = train_labels[:10000]
#train_labels = train_labels[10000:]

#model.fit(train_images, train_labels, batch_size=64, epochs=20, validation_data = (val_images, val_labels))
############################################
print('==============Training Finished===============')

#Evaluate your test samples
accuracy = 0
############Write your code Here############
_, accuracy = model.evaluate(test_images, test_labels, batch_size=64)
############################################

print('Test Accuracy :', accuracy)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Accuracy : 0.6116999983787537
