# ViT assignment

colab의 경우, 런타임 유형을 GPU로 바꿔주세요.

# 0. Setting

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from einops import repeat
from einops.layers.torch import Rearrange
from torch import Tensor
import math
import time

# 1. Project input to patches

In [4]:
class PatchProjection(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, emb_size=768, img_size=224):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) * (img_size // patch_size) # 이미지 크기와 패치 크기에 따른 총 패치 수 계산

        self.projection = nn.Sequential(
            # TODO: 패치 임베딩을 위한 핵심 레이어 (Conv2d)와 차원 재배치(Rearrange)를 완성하세요
            # 힌트: Conv2d 커널 크기와 스트라이드는 patch_size와 같아야 합니다.
            #       Rearrange는 (b, emb_size, h, w) -> (b, h*w, emb_size) 형태로 변환합니다.
            nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size),
            Rearrange('b e h w -> b (h w) e')
        )

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        # TODO: 입력 이미지를 patch embedding으로 변환하는 부분 완성
        return self.projection(x)


# Test
if __name__ == "__main__":
    x = torch.randn(8, 3, 224, 224)
    patch_proj = PatchProjection()
    out = patch_proj(x)
    print(f'Input shape: {x.shape}')
    print(f'Patch embeddings shape: {out.shape}')
    print(f'Number of patches: {patch_proj.num_patches}')

Input shape: torch.Size([8, 3, 224, 224])
Patch embeddings shape: torch.Size([8, 196, 768])
Number of patches: 196


# 2. Patches embedding

In [5]:
class PatchEmbedding(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, emb_size=768, img_size=224):
        super().__init__()
        self.num_patches =(img_size // patch_size) * (img_size // patch_size) # 이미지 크기와 패치 크기에 따른 총 패치 수 계산

        # Patch projection
        self.projection = nn.Sequential(
            # TODO: 패치 임베딩을 위한 핵심 레이어 (Conv2d)와 차원 재배치(Rearrange)를 완성하세요
            # 힌트: Conv2d 커널 크기와 스트라이드는 patch_size와 같아야 합니다.
            #       Rearrange는 (b, emb_size, h, w) -> (b, h*w, emb_size) 형태로 변환합니다.
            nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size),
            Rearrange('b e h w -> b (h w) e')
        )

        # CLS token and positional encoding
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))
        self.positions = nn.Parameter(torch.randn(self.num_patches + 1, emb_size))

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        nn.init.trunc_normal_(self.cls_token, std=0.02)
        nn.init.trunc_normal_(self.positions, std=0.02)

    def forward(self, x: Tensor) -> Tensor:
        B = x.shape[0]

        # Project to patches
        # TODO: self.projection을 활용해 patch embedding을 수행하세요.
        x = self.projection(x)

        # Add CLS token
        # TODO: batch 크기에 맞게 cls_token을 확장하고 입력에 연결하세요.
        cls_tokens = repeat(self.cls_token, '() n e -> b n e', b=B)
        x = torch.cat([cls_tokens, x], dim=1)

        # Add positional encoding
        # TODO: positional encoding을 더하세요.
        x += self.positions

        return x

# Test
if __name__ == "__main__":
    x = torch.randn(8, 3, 224, 224)
    patch_emb = PatchEmbedding()
    out = patch_emb(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Expected: (8, 197, 768)') # 196 patches + 1 CLS token

Input shape: torch.Size([8, 3, 224, 224])
Output shape: torch.Size([8, 197, 768])
Expected: (8, 197, 768)


# 3. Multi Head Attention (MHA)

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, dropout=0.1):
        super().__init__()
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads
        self.scale = self.head_dim ** -0.5

        assert emb_size % num_heads == 0

        # Q, K, V projections
        # TODO: Q, K, V를 한 번에 만드는 선형레이어와
        #       출력 투영 선형레이어를 선언하세요.
        #       bias는 qkv에선 False로, proj에선 True(기본)로 둡니다.
        self.qkv = nn.Linear(emb_size, 3 * emb_size, bias=False)
        self.proj = nn.Linear(emb_size, emb_size)

        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.xavier_uniform_(self.proj.weight)
        nn.init.constant_(self.proj.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape

        # Generate Q, K, V
        # TODO: qkv 선형레이어로 Q,K,V 생성 후 (B, N, 3, num_heads, head_dim)으로 reshape 하고,
        #       (3, B, num_heads, N, head_dim)으로 permute하여 q,k,v로 분리하세요.
        qkv = self.qkv(x)
        qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        q, k, v = qkv

        # Attention computation
        # TODO: scaled dot-product attention 계산 후 softmax, dropout 적용
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = nn.Dropout(0.1)(attn)

        # Apply attention to values
        # TODO: attention 결과에 v를 곱하고,
        #       (B, N, C) 형태로 reshape 후 proj와 dropout 적용
        x = attn @ v
        x = x.transpose(1, 2).reshape(B, N, C)
        x = nn.Dropout(0.1)(x)
        x = self.proj(x)

        return x


# Test
if __name__ == "__main__":
    x = torch.randn(8, 197, 768)  # (batch, patches+cls, emb_size)
    mha = MultiHeadAttention()
    out = mha(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in mha.parameters()):,}')

Input shape: torch.Size([8, 197, 768])
Output shape: torch.Size([8, 197, 768])
Parameters: 2,360,064


# 4. Transformer Encoder Block

In [7]:
class MLP(nn.Module):
    def __init__(self, emb_size=768, mlp_ratio=4, dropout=0.1):
        super().__init__()
        hidden_size = int(emb_size * mlp_ratio)

        self.net = nn.Sequential(
            # TODO: nn.Sequential을 활용해
            #       emb_size -> hidden_size -> emb_size 순서의 MLP를 만드세요.
            #       중간에 GELU와 Dropout 포함
            nn.Linear(emb_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, emb_size),
            nn.Dropout(dropout)
        )

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        return self.net(x)


class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, dropout=0.1):
        super().__init__()
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads
        self.scale = self.head_dim ** -0.5

        # TODO: qkv, proj linear 레이어 선언 및 dropout 선언
        # qkv는 bias=False, proj는 기본
        self.qkv = nn.Linear(emb_size, 3 * emb_size, bias=False)
        self.proj = nn.Linear(emb_size, emb_size)

        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.xavier_uniform_(self.proj.weight)
        nn.init.constant_(self.proj.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape

        # TODO: qkv 생성, reshape, permute 후 q,k,v 분리
        qkv = self.qkv(x)
        qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        q, k, v = qkv

        # Attention computation

        # TODO: scaled dot-product attention 계산, softmax, dropout 적용
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = nn.Dropout(0.1)(attn)

        # Apply attention to values

        # TODO: attention 결과와 v 곱하고, 다시 proj와 dropout 적용
        x = attn @ v
        x = x.transpose(1, 2).reshape(B, N, C)
        x = nn.Dropout(0.1)(x)
        x = self.proj(x)


        return x


class TransformerEncoderBlock(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, mlp_ratio=4, dropout=0.1):
        super().__init__()

        # TODO: LayerNorm 2개, MultiHeadAttention, MLP 선언
        self.norm1 = nn.LayerNorm(emb_size)
        self.norm2 = nn.LayerNorm(emb_size)
        self.attention = MultiHeadAttention(emb_size, num_heads, dropout)
        self.mlp = MLP(emb_size, mlp_ratio, dropout)


    def forward(self, x: Tensor) -> Tensor:
        # Pre-norm + residual connection for attention
        x = x + self.attention(self.norm1(x))

        # Pre-norm + residual connection for MLP
        x = x + self.mlp(self.norm2(x))

        return x


# Test
if __name__ == "__main__":
    x = torch.randn(8, 197, 768)
    block = TransformerEncoderBlock()
    out = block(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in block.parameters()):,}')

Input shape: torch.Size([8, 197, 768])
Output shape: torch.Size([8, 197, 768])
Parameters: 7,085,568


# 5. Complete ViT

In [8]:
class VisionTransformer(nn.Module):
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        emb_size=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        dropout=0.1,
        drop_path=0.0
    ):
        super().__init__()

        # Patch embedding (from Stage 2)
        self.patch_embed = PatchEmbedding(in_channels, patch_size, emb_size, img_size)

        # Transformer encoder blocks (from Stage 4)
        self.blocks = nn.ModuleList([
            TransformerEncoderBlock(emb_size, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])

        # Classification head
        self.norm = nn.LayerNorm(emb_size)
        self.head = nn.Linear(emb_size, num_classes)

        self._init_weights()

    def _init_weights(self):
        nn.init.trunc_normal_(self.head.weight, std=0.02)
        nn.init.constant_(self.head.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        # TODO: patch embedding 적용
        x = self.patch_embed(x)


        # TODO: Transformer encoder blocks 순차 적용
        for block in self.blocks:
            x = block(x)


        # Classification head (use CLS token)
        x = self.norm(x)
        cls_token = x[:, 0]  # Extract CLS token
        x = self.head(cls_token)

        return x


# Test
if __name__ == "__main__":
    # ViT-Base configuration
    model = VisionTransformer(
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        emb_size=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        dropout=0.1
    )

    x = torch.randn(2, 3, 224, 224)
    out = model(x)

    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Total parameters: {sum(p.numel() for p in model.parameters()):,}')

    # Different ViT configurations
    print('\n=== ViT Configurations ===')
    configs = {
        'ViT-Tiny': {'emb_size': 192, 'depth': 12, 'num_heads': 3},
        'ViT-Small': {'emb_size': 384, 'depth': 12, 'num_heads': 6},
        'ViT-Base': {'emb_size': 768, 'depth': 12, 'num_heads': 12},
        'ViT-Large': {'emb_size': 1024, 'depth': 24, 'num_heads': 16},
    }

    for name, config in configs.items():
        model = VisionTransformer(**config, num_classes=1000)
        params = sum(p.numel() for p in model.parameters())
        print(f'{name}: {params:,} parameters')

Input shape: torch.Size([2, 3, 224, 224])
Output shape: torch.Size([2, 1000])
Total parameters: 86,540,008

=== ViT Configurations ===
ViT-Tiny: 5,710,504 parameters
ViT-Small: 22,036,840 parameters
ViT-Base: 86,540,008 parameters
ViT-Large: 304,252,904 parameters


# 6. ViT for CIFAR-10

위의 코드를 완성했다면, 아래 코드를 실행하여 전체 모델을 테스트할 수 있습니다.

In [9]:
class ViTCIFAR10(nn.Module):
    def __init__(
        self,
        img_size=32,
        patch_size=4,
        in_channels=3,
        num_classes=10,
        emb_size=256,
        depth=6,
        num_heads=8,
        mlp_ratio=4,
        dropout=0.1
    ):
        super().__init__()

        self.patch_embed = PatchEmbedding(in_channels, patch_size, emb_size, img_size)

        self.blocks = nn.ModuleList([
            TransformerEncoderBlock(emb_size, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])

        self.norm = nn.LayerNorm(emb_size)
        self.head = nn.Linear(emb_size, num_classes)

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)
            elif isinstance(m, nn.Conv2d):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        if hasattr(self.patch_embed, 'cls_token'):
            nn.init.trunc_normal_(self.patch_embed.cls_token, std=0.02)
        if hasattr(self.patch_embed, 'positions'):
            nn.init.trunc_normal_(self.patch_embed.positions, std=0.02)

    def forward(self, x: Tensor) -> Tensor:
        x = self.patch_embed(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)
        cls_token = x[:, 0]
        x = self.head(cls_token)

        return x


def train_one_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        running_loss += loss.item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

        if batch_idx % 100 == 0:
            print(f'Batch {batch_idx}: Loss {loss.item():.4f}, Acc {100.*correct/total:.2f}%')

    return running_loss / len(dataloader), 100. * correct / total


def test(model, dataloader, criterion, device):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()

            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

    test_loss /= len(dataloader)
    accuracy = 100. * correct / total

    return test_loss, accuracy


def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transform_train = transforms.Compose([
        transforms.RandomResizedCrop(32, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])

    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=4, pin_memory=True)

    model = ViTCIFAR10(
        img_size=32,
        patch_size=4,
        num_classes=10,
        emb_size=256,
        depth=6,
        num_heads=4,
        dropout=0.1
    ).to(device)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(
        model.parameters(),
        lr=5e-4,
        weight_decay=0.03,
        betas=(0.9, 0.999)
    )

    num_epochs = 30
    warmup_epochs = 1
    total_steps = len(train_loader) * num_epochs
    warmup_steps = len(train_loader) * warmup_epochs
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    best_acc = 0
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')

        start_time = time.time()
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = test(model, test_loader, criterion, device)

        for _ in range(len(train_loader)):
            scheduler.step()

        epoch_time = time.time() - start_time
        current_lr = optimizer.param_groups[0]['lr']

        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
        print(f'LR: {current_lr:.6f}, Epoch time: {epoch_time:.2f}s')

        if test_acc > best_acc:
            best_acc = test_acc
            torch.save(model.state_dict(), 'vit_cifar10_best.pth')

        if test_acc > 90.0:
            print(f"Reached target accuracy!")
            break

    print(f'\nBest Test Accuracy: {best_acc:.2f}%')


if __name__ == "__main__":
    model = ViTCIFAR10(emb_size=256, depth=6, num_heads=4)
    x = torch.randn(4, 3, 32, 32)
    out = model(x)

    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in model.parameters()):,}')

    main()

Input shape: torch.Size([4, 3, 32, 32])
Output shape: torch.Size([4, 10])
Parameters: 4,766,474


100%|██████████| 170M/170M [00:04<00:00, 41.0MB/s]



Epoch 1/30
Batch 0: Loss 2.3489, Acc 12.50%
Batch 100: Loss 2.3571, Acc 11.26%
Train Loss: 2.3399, Train Acc: 11.28%
Test Loss: 2.3411, Test Acc: 10.88%
LR: 0.000500, Epoch time: 40.35s

Epoch 2/30
Batch 0: Loss 2.3265, Acc 8.59%
Batch 100: Loss 1.9725, Acc 28.19%
Train Loss: 1.9271, Train Acc: 31.53%
Test Loss: 1.7585, Test Acc: 39.92%
LR: 0.000499, Epoch time: 39.62s

Epoch 3/30
Batch 0: Loss 1.8022, Acc 39.06%
Batch 100: Loss 1.8551, Acc 40.68%
Train Loss: 1.7196, Train Acc: 42.52%
Test Loss: 1.6510, Test Acc: 46.41%
LR: 0.000494, Epoch time: 40.39s

Epoch 4/30
Batch 0: Loss 1.6928, Acc 41.02%
Batch 100: Loss 1.6133, Acc 46.91%




Train Loss: 1.6129, Train Acc: 47.85%
Test Loss: 1.6042, Test Acc: 49.08%
LR: 0.000487, Epoch time: 40.06s

Epoch 5/30
Batch 0: Loss 1.5989, Acc 50.00%
Batch 100: Loss 1.5312, Acc 51.10%




Train Loss: 1.5505, Train Acc: 51.42%
Test Loss: 1.5387, Test Acc: 51.53%
LR: 0.000477, Epoch time: 40.30s

Epoch 6/30
Batch 0: Loss 1.4777, Acc 54.69%
Batch 100: Loss 1.5520, Acc 52.31%
Train Loss: 1.5110, Train Acc: 53.01%
Test Loss: 1.5623, Test Acc: 51.87%
LR: 0.000464, Epoch time: 40.61s

Epoch 7/30
Batch 0: Loss 1.5129, Acc 52.73%
Batch 100: Loss 1.4896, Acc 54.36%
Train Loss: 1.4721, Train Acc: 54.86%
Test Loss: 1.4607, Test Acc: 54.88%
LR: 0.000449, Epoch time: 41.13s

Epoch 8/30
Batch 0: Loss 1.3762, Acc 56.25%
Batch 100: Loss 1.4065, Acc 55.86%
Train Loss: 1.4433, Train Acc: 56.34%
Test Loss: 1.4338, Test Acc: 56.71%
LR: 0.000431, Epoch time: 40.44s

Epoch 9/30
Batch 0: Loss 1.4497, Acc 54.30%
Batch 100: Loss 1.3908, Acc 57.64%
Train Loss: 1.4103, Train Acc: 58.20%
Test Loss: 1.4136, Test Acc: 57.77%
LR: 0.000412, Epoch time: 40.40s

Epoch 10/30
Batch 0: Loss 1.3942, Acc 57.81%
Batch 100: Loss 1.4001, Acc 59.26%
Train Loss: 1.3858, Train Acc: 59.28%
Test Loss: 1.4728, Test Ac



Batch 0: Loss 1.2023, Acc 69.14%
Batch 100: Loss 1.2789, Acc 63.60%
Train Loss: 1.2987, Train Acc: 63.53%
Test Loss: 1.3141, Test Acc: 62.94%
LR: 0.000290, Epoch time: 40.53s

Epoch 15/30
Batch 0: Loss 1.2966, Acc 64.45%
Batch 100: Loss 1.2550, Acc 64.35%
Train Loss: 1.2746, Train Acc: 64.46%
Test Loss: 1.3105, Test Acc: 63.20%
LR: 0.000264, Epoch time: 40.31s

Epoch 16/30
Batch 0: Loss 1.2920, Acc 66.02%
Batch 100: Loss 1.2826, Acc 65.66%
Train Loss: 1.2559, Train Acc: 65.46%
Test Loss: 1.2852, Test Acc: 64.16%
LR: 0.000236, Epoch time: 40.37s

Epoch 17/30
Batch 0: Loss 1.2554, Acc 65.62%
Batch 100: Loss 1.2859, Acc 66.24%
Train Loss: 1.2309, Train Acc: 66.48%
Test Loss: 1.2761, Test Acc: 64.60%
LR: 0.000210, Epoch time: 40.34s

Epoch 18/30
Batch 0: Loss 1.1864, Acc 68.36%
Batch 100: Loss 1.2562, Acc 67.83%




Train Loss: 1.2101, Train Acc: 67.53%
Test Loss: 1.2671, Test Acc: 65.44%
LR: 0.000183, Epoch time: 40.57s

Epoch 19/30
Batch 0: Loss 1.2185, Acc 68.36%
Batch 100: Loss 1.1068, Acc 68.71%
Train Loss: 1.1894, Train Acc: 68.43%
Test Loss: 1.2423, Test Acc: 65.80%
LR: 0.000157, Epoch time: 40.67s

Epoch 20/30
Batch 0: Loss 1.2202, Acc 66.41%
Batch 100: Loss 1.1874, Acc 69.96%
Train Loss: 1.1698, Train Acc: 69.57%
Test Loss: 1.2317, Test Acc: 66.41%
LR: 0.000133, Epoch time: 41.45s

Epoch 21/30
Batch 0: Loss 1.1024, Acc 73.44%
Batch 100: Loss 1.1225, Acc 70.60%
Train Loss: 1.1515, Train Acc: 70.53%
Test Loss: 1.2299, Test Acc: 67.12%
LR: 0.000110, Epoch time: 40.86s

Epoch 22/30
Batch 0: Loss 1.1467, Acc 69.53%
Batch 100: Loss 1.1401, Acc 71.28%
Train Loss: 1.1281, Train Acc: 71.34%
Test Loss: 1.2040, Test Acc: 67.94%
LR: 0.000088, Epoch time: 40.31s

Epoch 23/30
Batch 0: Loss 0.9980, Acc 77.73%
Batch 100: Loss 1.0338, Acc 72.52%
Train Loss: 1.1120, Train Acc: 72.41%
Test Loss: 1.2015, Tes



Train Loss: 1.0933, Train Acc: 72.94%
Test Loss: 1.2091, Test Acc: 68.12%
LR: 0.000051, Epoch time: 40.48s

Epoch 25/30
Batch 0: Loss 1.1581, Acc 70.31%
Batch 100: Loss 1.0652, Acc 73.86%
Train Loss: 1.0789, Train Acc: 73.86%
Test Loss: 1.2009, Test Acc: 68.13%
LR: 0.000036, Epoch time: 40.35s

Epoch 26/30
Batch 0: Loss 1.0255, Acc 78.12%
Batch 100: Loss 1.1471, Acc 74.27%




Train Loss: 1.0662, Train Acc: 74.16%
Test Loss: 1.1924, Test Acc: 69.12%
LR: 0.000023, Epoch time: 40.78s

Epoch 27/30
Batch 0: Loss 1.1036, Acc 76.56%
Batch 100: Loss 1.1137, Acc 75.00%
Train Loss: 1.0542, Train Acc: 74.89%
Test Loss: 1.1763, Test Acc: 69.64%
LR: 0.000013, Epoch time: 41.90s

Epoch 28/30
Batch 0: Loss 0.9883, Acc 76.95%
Batch 100: Loss 1.0005, Acc 75.23%
Train Loss: 1.0486, Train Acc: 75.10%
Test Loss: 1.1843, Test Acc: 69.57%
LR: 0.000006, Epoch time: 40.67s

Epoch 29/30
Batch 0: Loss 1.0646, Acc 75.00%
Batch 100: Loss 1.0803, Acc 75.89%
Train Loss: 1.0402, Train Acc: 75.56%
Test Loss: 1.1764, Test Acc: 69.94%
LR: 0.000001, Epoch time: 40.36s

Epoch 30/30
Batch 0: Loss 1.0307, Acc 77.34%
Batch 100: Loss 1.0803, Acc 75.64%
Train Loss: 1.0397, Train Acc: 75.53%
Test Loss: 1.1738, Test Acc: 70.00%
LR: 0.000000, Epoch time: 40.34s

Best Test Accuracy: 70.00%


ViT는 일반적으로 대규모 데이터셋에서 사전 학습된(pretrained) 모델을 활용하는 경우가 많기 때문에, 하이퍼파라미터를 조정하거나 학습 epoch을 늘리면 성능이 개선될 수는 있지만, 소규모 데이터셋에서 처음부터 학습한 ViT의 성능이 낮은 것은 구조적 한계에 가깝습니다.