# ViT assignment

colab의 경우, 런타임 유형을 GPU로 바꿔주세요.

# 0. Setting

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from einops import repeat
from einops.layers.torch import Rearrange
from torch import Tensor
import math
import time

# 1. Project input to patches

In [3]:
class PatchProjection(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, emb_size=768, img_size=224):
        super().__init__()
        self.patch_size = patch_size
        # N = HW/P^2 (H,W는 이미지 해상도, P는 패치 크기)
        # 224x224 이미지를 16x16 패치로 나누면 (224/16)^2 = 14^2 = 196개의 패치가 생성됨
        self.num_patches = (img_size // patch_size) ** 2  # 이미지 크기와 패치 크기에 따른 총 패치 수 계산

        self.projection = nn.Sequential(
            # trainable linear projection을 Conv2d로 구현
            # 입력: (batch, 3, 224, 224) -> 출력: (batch, 768, 14, 14)
            nn.Conv2d(
                in_channels=in_channels,     # RGB 채널 수 (3)
                out_channels=emb_size,       # 임베딩 차원 (768)
                kernel_size=patch_size,      # 16x16 패치 크기
                stride=patch_size,           # 16픽셀씩 이동하여 겹치지 않게 함
                bias=True                    # 선형 변환에 bias 포함
            ),
            # einops 사용해 공간 차원을 시퀀스 차원으로 재배치: (batch, emb_size, height, width) -> (batch, height*width, emb_size)
            # (batch, 768, 14, 14) -> (batch, 196, 768)
            Rearrange('b e h w -> b (h w) e')
        )

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        # 입력 이미지를 patch embedding으로 변환
        # 입력 이미지 텐서 (batch_size, channels, height, width) -> 패치 임베딩 텐서 (batch_size, num_patches, embedding_dim)
        patch_embeddings = self.projection(x)
        return patch_embeddings


# Test
if __name__ == "__main__":
    x = torch.randn(8, 3, 224, 224)
    patch_proj = PatchProjection()
    out = patch_proj(x)
    print(f'Input shape: {x.shape}')
    print(f'Patch embeddings shape: {out.shape}')
    print(f'Number of patches: {patch_proj.num_patches}')

Input shape: torch.Size([8, 3, 224, 224])
Patch embeddings shape: torch.Size([8, 196, 768])
Number of patches: 196


# 2. Patches embedding

In [4]:
class PatchEmbedding(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, emb_size=768, img_size=224):
        super().__init__()
        self.num_patches = (img_size // patch_size) ** 2

        # Patch projection
        self.projection = nn.Sequential(
            # trainable linear projection을 Conv2d로 구현
            # 입력: (batch, 3, 224, 224) -> 출력: (batch, 768, 14, 14)
            nn.Conv2d(
                in_channels=in_channels,     # RGB 채널 수 (3)
                out_channels=emb_size,       # 임베딩 차원 (768)
                kernel_size=patch_size,      # 16x16 패치 크기
                stride=patch_size,           # 16픽셀씩 이동하여 겹치지 않게 함
                bias=True                    # 선형 변환에 bias 포함
            ),
            # einops 사용해 공간 차원을 시퀀스 차원으로 재배치: (batch, emb_size, height, width) -> (batch, height*width, emb_size)
            # (batch, 768, 14, 14) -> (batch, 196, 768)
            Rearrange('b e h w -> b (h w) e')
        )

        # CLS token and positional encoding
        # CLS token은 전체 이미지의 표현을 학습하는 특별한 토큰: (1, 1, emb_size) 크기로 생성하여 나중에 배치 크기에 맞게 확장
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))

        # 패치 196개 + CLS token 1개 = 총 197개 위치에 대한 positional embedding
        # 학습 가능한 1D positional embedding 사용
        self.positions = nn.Parameter(torch.randn(self.num_patches + 1, emb_size))

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        nn.init.trunc_normal_(self.cls_token, std=0.02)
        nn.init.trunc_normal_(self.positions, std=0.02)


    # 입력 이미지를 CLS token과 positional encoding이 포함된 임베딩으로 변환
    # 입력 이미지 텐서 (batch_size, channels, height, width) -> 완전한 패치 임베딩 텐서 (batch_size, num_patches+1, embedding_dim)
    def forward(self, x: Tensor) -> Tensor:
        B = x.shape[0]

        # Project to patches
        # 입력 이미지를 패치 임베딩으로 변환: (B, 3, 224, 224) -> (B, 196, 768)
        x = self.projection(x)

        # Add CLS token
        # CLS token을 배치 크기에 맞게 확장: (1, 1, 768) -> (B, 1, 768)
        # 그리고 패치 임베딩 앞에 연결: (B, 196, 768) -> (B, 197, 768)
        cls_tokens = self.cls_token.expand(B, -1, -1)  # (B, 1, 768)
        x = torch.cat([cls_tokens, x], dim=1)  # (B, 197, 768)


        # Add positional encoding
        # 위치 정보를 임베딩에 더해 각 패치와 CLS token의 위치를 학습 - 브로드캐스팅으로 모든 배치에 동일한 positional encoding 적용
        x = x + self.positions

        return x

# Test
if __name__ == "__main__":
    x = torch.randn(8, 3, 224, 224)
    patch_emb = PatchEmbedding()
    out = patch_emb(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Expected: (8, 197, 768)') # 196 patches + 1 CLS token

Input shape: torch.Size([8, 3, 224, 224])
Output shape: torch.Size([8, 197, 768])
Expected: (8, 197, 768)


# 3. Multi Head Attention (MHA)

In [5]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, dropout=0.1):
        super().__init__()
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads
        self.scale = self.head_dim ** -0.5

        assert emb_size % num_heads == 0

        # Q, K, V projections
        # Q, K, V를 한 번에 계산하는 단일 선형 레이어 - 출력 차원: emb_size * 3 (Q, K, V 각각 emb_size 차원)
        self.qkv = nn.Linear(emb_size, emb_size * 3, bias=False)
        # 각 헤드의 출력을 연결한 후 최종 투영: MultiHead(Q,K,V) = Concat(head1, ..., headh) * WO
        self.proj = nn.Linear(emb_size, emb_size, bias=True)
        # 정규화를 위한 드롭아웃
        self.dropout = nn.Dropout(dropout)

        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.xavier_uniform_(self.proj.weight)
        nn.init.constant_(self.proj.bias, 0)

    # Multi-Head Self-Attention 계산
    # 입력 텐서 (batch_size, sequence_length, embedding_dim) -> 어텐션 적용된 출력 텐서 (batch_size, sequence_length, embedding_dim)
    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape    # B: 배치크기, N: 시퀀스길이(토큰수), C: 임베딩차원

        # Generate Q, K, V
        qkv = self.qkv(x)    # (B, N, C) -> (B, N, 3*C)

        # multi-head 형태로 reshape: (B, N, 3*C) -> (B, N, 3, num_heads, head_dim) -> (3, B, num_heads, N, head_dim)
        qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # 각각 (B, num_heads, N, head_dim)

        # Attention computation: : Attention(Q,K,V) = softmax(QK^T/√dk)V
        # Q와 K의 내적 계산: (B, num_heads, N, head_dim) @ (B, num_heads, head_dim, N) -> (B, num_heads, N, N) - 각 토큰 쌍 간의 유사도
        attn = (q @ k.transpose(-2, -1)) * self.scale
        # Softmax로 어텐션 가중치 정규화 - 각 쿼리에 대해 모든 키의 가중치 합이 1
        attn = attn.softmax(dim=-1)
        # 학습 안정성을 위한 dropout 적용
        attn = self.dropout(attn)

        # Apply attention to values
        # (B, num_heads, N, N) @ (B, num_heads, N, head_dim) -> (B, num_heads, N, head_dim)
        x = attn @ v
        # multi-head 결과들을 연결: (B, num_heads, N, head_dim) -> (B, N, num_heads * head_dim)
        # = (B, N, C) - 원본 임베딩 차원으로 복원
        x = x.transpose(1, 2).reshape(B, N, C)

        # 최종 linear projection 및 dropout 적용
        x = self.proj(x)
        x = self.dropout(x)

        return x


# Test
if __name__ == "__main__":
    x = torch.randn(8, 197, 768)  # (batch, patches+cls, emb_size)
    mha = MultiHeadAttention()
    out = mha(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in mha.parameters()):,}')

Input shape: torch.Size([8, 197, 768])
Output shape: torch.Size([8, 197, 768])
Parameters: 2,360,064


# 4. Transformer Encoder Block

In [6]:
class MLP(nn.Module):
    def __init__(self, emb_size=768, mlp_ratio=4, dropout=0.1):
        super().__init__()
        hidden_size = int(emb_size * mlp_ratio)

        self.net = nn.Sequential(
            # emb_size 768 -> hidden_size 3072 (mlp_ratio=4일 때)
            nn.Linear(emb_size, hidden_size),
            # GELU 활성화 함수 (ReLU보다 부드러운 곡선을 가져 gradient flow 개선)
            nn.GELU(),
            # 정규화 위한 dropout
            nn.Dropout(dropout),
            # hidden_size 3072 -> emb_size 768 (원본 차원으로 복원)
            nn.Linear(hidden_size, emb_size),
            # 최종 dropout
            nn.Dropout(dropout)
        )

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        return self.net(x)


class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, dropout=0.1):
        super().__init__()
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads
        self.scale = self.head_dim ** -0.5

        # [q, k, v] = z * Uqkv, where Uqkv ∈ R^(D×3Dh)
        self.qkv = nn.Linear(emb_size, emb_size * 3, bias=False)
        # 최종 출력 투영 레이어 MultiHead(Q,K,V) = Concat(head1, ..., headh) * WO
        self.proj = nn.Linear(emb_size, emb_size, bias=True)
        # 정규화 위한 dropout
        self.dropout = nn.Dropout(dropout)

        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.xavier_uniform_(self.proj.weight)
        nn.init.constant_(self.proj.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape

        # QKV 생성 및 multi-head 형태로 분리
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # scaled dot-product attention 계산, softmax, dropout 적용
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.dropout(attn)

        # attention 가중치를 값에 적용하고 multi-head 결과 연결
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)

        # 최종 linear proj 및 dropout
        x = self.proj(x)
        x = self.dropout(x)

        return x


class TransformerEncoderBlock(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, mlp_ratio=4, dropout=0.1):
        super().__init__()

        # Layernorm (LN) before every block, residual connections after every block
        # 첫 번째 LayerNorm -> Multi-Head Self-Attention
        self.norm1 = nn.LayerNorm(emb_size)
        self.attention = MultiHeadAttention(emb_size, num_heads, dropout)

        # 두 번째 LayerNorm -> Position-wise Feed-Forward Network (MLP)
        self.norm2 = nn.LayerNorm(emb_size)
        self.mlp = MLP(emb_size, mlp_ratio, dropout)


    def forward(self, x: Tensor) -> Tensor:
        # Pre-norm + residual connection for attention
        x = x + self.attention(self.norm1(x))

        # Pre-norm + residual connection for MLP
        x = x + self.mlp(self.norm2(x))

        return x


# Test
if __name__ == "__main__":
    x = torch.randn(8, 197, 768)
    block = TransformerEncoderBlock()
    out = block(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in block.parameters()):,}')

Input shape: torch.Size([8, 197, 768])
Output shape: torch.Size([8, 197, 768])
Parameters: 7,085,568


# 5. Complete ViT

In [7]:
class VisionTransformer(nn.Module):
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        emb_size=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        dropout=0.1,
        drop_path=0.0
    ):
        super().__init__()

        # Patch embedding (from Stage 2)
        self.patch_embed = PatchEmbedding(in_channels, patch_size, emb_size, img_size)

        # Transformer encoder blocks (from Stage 4)
        self.blocks = nn.ModuleList([
            TransformerEncoderBlock(emb_size, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])

        # Classification head
        self.norm = nn.LayerNorm(emb_size)
        self.head = nn.Linear(emb_size, num_classes)

        self._init_weights()

    def _init_weights(self):
        nn.init.trunc_normal_(self.head.weight, std=0.02)
        nn.init.constant_(self.head.bias, 0)

    # 입력 이미지 (batch_size, channels, height, width) -> 클래스 예측 logits (batch_size, num_classes)
    def forward(self, x: Tensor) -> Tensor:

        # patch embedding 적용: 이미지 -> 패치 시퀀스 변환
        x = self.patch_embed(x)

        # Transformer encoder blocks 순차 적용: 각 블록에서 Multi-Head Self-Attention과 MLP 수행하면서 모든 패치와 CLS 토큰 간 관계 학습
        for block in self.blocks:
            x = block(x)

        # Classification head (use CLS token)
        x = self.norm(x)
        cls_token = x[:, 0]  # Extract CLS token: (batch_size, emb_size)
        x = self.head(cls_token)  # (batch_size, num_classes)

        return x


# Test
if __name__ == "__main__":
    # ViT-Base configuration
    model = VisionTransformer(
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        emb_size=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        dropout=0.1
    )

    x = torch.randn(2, 3, 224, 224)
    out = model(x)

    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Total parameters: {sum(p.numel() for p in model.parameters()):,}')

    # Different ViT configurations
    print('\n=== ViT Configurations ===')
    configs = {
        'ViT-Tiny': {'emb_size': 192, 'depth': 12, 'num_heads': 3},
        'ViT-Small': {'emb_size': 384, 'depth': 12, 'num_heads': 6},
        'ViT-Base': {'emb_size': 768, 'depth': 12, 'num_heads': 12},
        'ViT-Large': {'emb_size': 1024, 'depth': 24, 'num_heads': 16},
    }

    for name, config in configs.items():
        model = VisionTransformer(**config, num_classes=1000)
        params = sum(p.numel() for p in model.parameters())
        print(f'{name}: {params:,} parameters')

Input shape: torch.Size([2, 3, 224, 224])
Output shape: torch.Size([2, 1000])
Total parameters: 86,540,008

=== ViT Configurations ===
ViT-Tiny: 5,710,504 parameters
ViT-Small: 22,036,840 parameters
ViT-Base: 86,540,008 parameters
ViT-Large: 304,252,904 parameters


# 6. ViT for CIFAR-10

위의 코드를 완성했다면, 아래 코드를 실행하여 전체 모델을 테스트할 수 있습니다.

In [9]:
class ViTCIFAR10(nn.Module):
    def __init__(
        self,
        img_size=32,
        patch_size=4,
        in_channels=3,
        num_classes=10,
        emb_size=256,
        depth=6,
        num_heads=8,
        mlp_ratio=4,
        dropout=0.1
    ):
        super().__init__()

        self.patch_embed = PatchEmbedding(in_channels, patch_size, emb_size, img_size)

        self.blocks = nn.ModuleList([
            TransformerEncoderBlock(emb_size, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])

        self.norm = nn.LayerNorm(emb_size)
        self.head = nn.Linear(emb_size, num_classes)

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)
            elif isinstance(m, nn.Conv2d):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        if hasattr(self.patch_embed, 'cls_token'):
            nn.init.trunc_normal_(self.patch_embed.cls_token, std=0.02)
        if hasattr(self.patch_embed, 'positions'):
            nn.init.trunc_normal_(self.patch_embed.positions, std=0.02)

    def forward(self, x: Tensor) -> Tensor:
        x = self.patch_embed(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)
        cls_token = x[:, 0]
        x = self.head(cls_token)

        return x


def train_one_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        running_loss += loss.item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

        if batch_idx % 100 == 0:
            print(f'Batch {batch_idx}: Loss {loss.item():.4f}, Acc {100.*correct/total:.2f}%')

    return running_loss / len(dataloader), 100. * correct / total


def test(model, dataloader, criterion, device):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()

            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

    test_loss /= len(dataloader)
    accuracy = 100. * correct / total

    return test_loss, accuracy


def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transform_train = transforms.Compose([
        transforms.RandomResizedCrop(32, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])

    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=4, pin_memory=True)

    model = ViTCIFAR10(
        img_size=32,
        patch_size=4,
        num_classes=10,
        emb_size=256,
        depth=6,
        num_heads=4,
        dropout=0.1
    ).to(device)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(
        model.parameters(),
        lr=5e-4,
        weight_decay=0.03,
        betas=(0.9, 0.999)
    )

    num_epochs = 100
    warmup_epochs = 1
    total_steps = len(train_loader) * num_epochs
    warmup_steps = len(train_loader) * warmup_epochs
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    best_acc = 0
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')

        start_time = time.time()
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = test(model, test_loader, criterion, device)

        for _ in range(len(train_loader)):
            scheduler.step()

        epoch_time = time.time() - start_time
        current_lr = optimizer.param_groups[0]['lr']

        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
        print(f'LR: {current_lr:.6f}, Epoch time: {epoch_time:.2f}s')

        if test_acc > best_acc:
            best_acc = test_acc
            torch.save(model.state_dict(), 'vit_cifar10_best.pth')

        if test_acc > 90.0:
            print(f"Reached target accuracy!")
            break

    print(f'\nBest Test Accuracy: {best_acc:.2f}%')


if __name__ == "__main__":
    model = ViTCIFAR10(emb_size=256, depth=6, num_heads=4)
    x = torch.randn(4, 3, 32, 32)
    out = model(x)

    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in model.parameters()):,}')

    main()

Input shape: torch.Size([4, 3, 32, 32])
Output shape: torch.Size([4, 10])
Parameters: 4,766,474

Epoch 1/100
Batch 0: Loss 2.3571, Acc 13.28%
Batch 100: Loss 2.3376, Acc 10.44%
Train Loss: 2.3660, Train Acc: 10.46%
Test Loss: 2.3752, Test Acc: 10.52%
LR: 0.000500, Epoch time: 11.11s

Epoch 2/100
Batch 0: Loss 2.3704, Acc 11.72%
Batch 100: Loss 1.8758, Acc 26.73%
Train Loss: 1.9316, Train Acc: 30.78%
Test Loss: 1.7793, Test Acc: 39.29%
LR: 0.000500, Epoch time: 11.16s

Epoch 3/100
Batch 0: Loss 1.7789, Acc 44.92%
Batch 100: Loss 1.7709, Acc 40.53%
Train Loss: 1.7214, Train Acc: 42.26%
Test Loss: 1.6410, Test Acc: 46.75%
LR: 0.000499, Epoch time: 11.09s

Epoch 4/100
Batch 0: Loss 1.7120, Acc 43.75%
Batch 100: Loss 1.5628, Acc 46.77%
Train Loss: 1.6200, Train Acc: 47.69%
Test Loss: 1.6166, Test Acc: 48.21%
LR: 0.000499, Epoch time: 11.09s

Epoch 5/100
Batch 0: Loss 1.5919, Acc 46.48%
Batch 100: Loss 1.6283, Acc 50.16%
Train Loss: 1.5615, Train Acc: 50.84%
Test Loss: 1.4984, Test Acc: 53.7

ViT는 일반적으로 대규모 데이터셋에서 사전 학습된(pretrained) 모델을 활용하는 경우가 많기 때문에, 하이퍼파라미터를 조정하거나 학습 epoch을 늘리면 성능이 개선될 수는 있지만, 소규모 데이터셋에서 처음부터 학습한 ViT의 성능이 낮은 것은 구조적 한계에 가깝습니다.