# ViT assignment

colab의 경우, 런타임 유형을 GPU로 바꿔주세요.

# 0. Setting

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from einops import repeat
from einops.layers.torch import Rearrange
from torch import Tensor
import math
import time
from tqdm import tqdm

# 1. Project input to patches

In [2]:
class PatchProjection(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, emb_size=768, img_size=224):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = (img_size * img_size) // (patch_size * patch_size) # 이미지 크기와 패치 크기에 따른 총 패치 수 계산

        self.projection = nn.Sequential(
            nn.Conv2d(in_channels, emb_size, patch_size, patch_size),
            Rearrange('b emb_size h w -> b (h w) emb_size')
            # TODO: 패치 임베딩을 위한 핵심 레이어 (Conv2d)와 차원 재배치(Rearrange)를 완성하세요
            # 힌트: Conv2d 커널 크기와 스트라이드는 patch_size와 같아야 합니다. -> 커널이 겹치지 않게 하기 위해
            #       Rearrange는 (b, emb_size, h, w) -> (b, h*w, emb_size) 형태로 변환합니다.
        )

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        x = self.projection(x)

        return x


# Test
if __name__ == "__main__":
    x = torch.randn(8, 3, 224, 224)
    patch_proj = PatchProjection()
    out = patch_proj(x)
    print(f'Input shape: {x.shape}')
    print(f'Patch embeddings shape: {out.shape}')
    print(f'Number of patches: {patch_proj.num_patches}')

Input shape: torch.Size([8, 3, 224, 224])
Patch embeddings shape: torch.Size([8, 196, 768])
Number of patches: 196


# 2. Patches embedding

In [3]:
class PatchEmbedding(nn.Module):
    def __init__(self, in_channels=3, patch_size=16, emb_size=768, img_size=224):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = (img_size * img_size) // (patch_size * patch_size) # 이미지 크기와 패치 크기에 따른 총 패치 수 계산

        self.projection = nn.Sequential(
            nn.Conv2d(in_channels, emb_size, patch_size, patch_size),
            Rearrange('b emb_size h w -> b (h w) emb_size')
            # TODO: 패치 임베딩을 위한 핵심 레이어 (Conv2d)와 차원 재배치(Rearrange)를 완성하세요
            # 힌트: Conv2d 커널 크기와 스트라이드는 patch_size와 같아야 합니다. -> 커널이 겹치지 않게 하기 위해
            #       Rearrange는 (b, emb_size, h, w) -> (b, h*w, emb_size) 형태로 변환합니다.
        )


        # CLS token and positional encoding
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))  # 맨 앞대가리에만 붙음
        self.positions = nn.Parameter(torch.randn(self.num_patches + 1, emb_size))  # 패치 개수 + cls_token

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        nn.init.trunc_normal_(self.cls_token, std=0.02)
        nn.init.trunc_normal_(self.positions, std=0.02)

    def forward(self, x: Tensor) -> Tensor:
        B = x.shape[0]
        x = self.projection(x)        # Project to patches
        # TODO: self.projection을 활용해 patch embedding을 수행하세요.

        cls_token = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_token, x), dim=1)    # Add CLS token
        # TODO: batch 크기에 맞게 cls_token을 확장하고 입력에 연결하세요.

        x = x + self.positions  # Add positional encoding
        # TODO: positional encoding을 더하세요.

        return x

# Test
if __name__ == "__main__":
    x = torch.randn(8, 3, 224, 224)
    patch_emb = PatchEmbedding()
    out = patch_emb(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Expected: (8, 197, 768)') # 196 patches + 1 CLS token

Input shape: torch.Size([8, 3, 224, 224])
Output shape: torch.Size([8, 197, 768])
Expected: (8, 197, 768)


# 3. Multi Head Attention (MHA)

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, dropout=0.1):
        super().__init__()
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads
        self.scale = self.head_dim ** -0.5

        assert emb_size % num_heads == 0

        # Q, K, V projections
        self.qkv = nn.Linear(emb_size, emb_size * 3, bias=False)
        self.proj = nn.Linear(emb_size, emb_size, bias=True)
        # TODO: Q, K, V를 한 번에 만드는 선형레이어와
        #       출력 투영 선형레이어를 선언하세요.
        #       bias는 qkv에선 False로, proj에선 True(기본)로 둡니다.
        self.dropout = nn.Dropout(dropout)
        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.xavier_uniform_(self.proj.weight)
        nn.init.constant_(self.proj.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape

        # Generate Q, K, V
        x = self.qkv(x)
        x = x.reshape(B, N, 3, self.num_heads, self.head_dim)
        x = x.permute(2, 0, 3, 1, 4)
        q, k, v = torch.chunk(x, 3, dim=0)
        q = q.squeeze(0)
        k = k.squeeze(0)
        v = v.squeeze(0)

        # TODO: qkv 선형레이어로 Q,K,V 생성 후 (B, N, 3, num_heads, head_dim)으로 reshape 하고,
        #       (3, B, num_heads, N, head_dim)으로 permute하여 q,k,v로 분리하세요.

        # Attention computation
        x = (q @ k.transpose(-2,-1)) * self.scale
        x = x.softmax(dim=-1)
        x = self.dropout(x)
        # TODO: scaled dot-product attention 계산 후 softmax, dropout 적용

        # Apply attention to values
        x = x @ v   # (B, num_heads, N, head_dim)
        x = x.transpose(1, 2)
        x = x.reshape(B, N, C)
        x = self.proj(x)
        x = self.dropout(x)
        # TODO: attention 결과에 v를 곱하고,
        #       (B, N, C) 형태로 reshape 후 proj와 dropout 적용

        return x


# Test
if __name__ == "__main__":
    x = torch.randn(8, 197, 768)  # (batch, patches+cls, emb_size)
    mha = MultiHeadAttention()
    out = mha(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in mha.parameters()):,}')

Input shape: torch.Size([8, 197, 768])
Output shape: torch.Size([8, 197, 768])
Parameters: 2,360,064


# 4. Transformer Encoder Block

In [10]:
class MLP(nn.Module):
    def __init__(self, emb_size=768, mlp_ratio=4, dropout=0.1):
        super().__init__()
        hidden_size = int(emb_size * mlp_ratio)

        self.net = nn.Sequential(
            nn.Linear(emb_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),

            nn.Linear(hidden_size, emb_size)
            # TODO: nn.Sequential을 활용해
            #       emb_size -> hidden_size -> emb_size 순서의 MLP를 만드세요.
            #       중간에 GELU와 Dropout 포함
        )

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        return self.net(x)


class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, dropout=0.1):
        super().__init__()
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads
        self.scale = self.head_dim ** -0.5

        assert emb_size % num_heads == 0

        # Q, K, V projections
        self.qkv = nn.Linear(emb_size, emb_size * 3, bias=False)
        self.proj = nn.Linear(emb_size, emb_size, bias=True)
        # TODO: Q, K, V를 한 번에 만드는 선형레이어와
        #       출력 투영 선형레이어를 선언하세요.
        #       bias는 qkv에선 False로, proj에선 True(기본)로 둡니다.
        self.dropout = nn.Dropout(dropout)
        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.xavier_uniform_(self.proj.weight)
        nn.init.constant_(self.proj.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape

        # Generate Q, K, V
        x = self.qkv(x)
        x = x.reshape(B, N, 3, self.num_heads, self.head_dim)
        x = x.permute(2, 0, 3, 1, 4)
        q, k, v = torch.chunk(x, 3, dim=0)
        q = q.squeeze(0)
        k = k.squeeze(0)
        v = v.squeeze(0)

        # TODO: qkv 선형레이어로 Q,K,V 생성 후 (B, N, 3, num_heads, head_dim)으로 reshape 하고,
        #       (3, B, num_heads, N, head_dim)으로 permute하여 q,k,v로 분리하세요.

        # Attention computation
        x = (q @ k.transpose(-2,-1)) * self.scale
        x = x.softmax(dim=-1)
        x = self.dropout(x)
        # TODO: scaled dot-product attention 계산 후 softmax, dropout 적용

        # Apply attention to values
        x = x @ v   # (B, num_heads, N, head_dim)
        x = x.transpose(1, 2)
        x = x.reshape(B, N, C)
        x = self.proj(x)
        x = self.dropout(x)
        # TODO: attention 결과에 v를 곱하고,
        #       (B, N, C) 형태로 reshape 후 proj와 dropout 적용

        return x



class TransformerEncoderBlock(nn.Module):
    def __init__(self, emb_size=768, num_heads=12, mlp_ratio=4, dropout=0.1):
        super().__init__()

        self.norm1 = nn.LayerNorm(emb_size)
        self.norm2 = nn.LayerNorm(emb_size)

        self.attention = MultiHeadAttention(emb_size, num_heads, dropout)
        self.mlp = MLP(emb_size, mlp_ratio, dropout)

        # TODO: LayerNorm 2개, MultiHeadAttention, MLP 선언

    def forward(self, x: Tensor) -> Tensor:
        # Pre-norm + residual connection for attention
        x = x + self.attention(self.norm1(x))

        # Pre-norm + residual connection for MLP
        x = x + self.mlp(self.norm2(x))

        return x


# Test
if __name__ == "__main__":
    x = torch.randn(8, 197, 768)
    block = TransformerEncoderBlock()
    out = block(x)
    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in block.parameters()):,}')

Input shape: torch.Size([8, 197, 768])
Output shape: torch.Size([8, 197, 768])
Parameters: 7,085,568


# 5. Complete ViT

In [12]:
class VisionTransformer(nn.Module):
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        emb_size=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        dropout=0.1,
        drop_path=0.0
    ):
        super().__init__()

        # Patch embedding (from Stage 2)
        self.patch_embed = PatchEmbedding(in_channels, patch_size, emb_size, img_size)

        # Transformer encoder blocks (from Stage 4)
        self.blocks = nn.ModuleList([
            TransformerEncoderBlock(emb_size, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])

        # Classification head
        self.norm = nn.LayerNorm(emb_size)
        self.head = nn.Linear(emb_size, num_classes)

        self._init_weights()

    def _init_weights(self):
        nn.init.trunc_normal_(self.head.weight, std=0.02)
        nn.init.constant_(self.head.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        x = self.patch_embed(x)
        # TODO: patch embedding 적용

        for block in self.blocks :
            x = block(x)
        # TODO: Transformer encoder blocks 순차 적용

        # Classification head (use CLS token)
        x = self.norm(x)
        cls_token = x[:, 0]  # Extract CLS token
        x = self.head(cls_token)

        return x


# Test
if __name__ == "__main__":
    # ViT-Base configuration
    model = VisionTransformer(
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        emb_size=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        dropout=0.1
    )

    x = torch.randn(2, 3, 224, 224)
    out = model(x)

    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Total parameters: {sum(p.numel() for p in model.parameters()):,}')

    # Different ViT configurations
    print('\n=== ViT Configurations ===')
    configs = {
        'ViT-Tiny': {'emb_size': 192, 'depth': 12, 'num_heads': 3},
        'ViT-Small': {'emb_size': 384, 'depth': 12, 'num_heads': 6},
        'ViT-Base': {'emb_size': 768, 'depth': 12, 'num_heads': 12},
        'ViT-Large': {'emb_size': 1024, 'depth': 24, 'num_heads': 16},
    }

    for name, config in configs.items():
        model = VisionTransformer(**config, num_classes=1000)
        params = sum(p.numel() for p in model.parameters())
        print(f'{name}: {params:,} parameters')

Input shape: torch.Size([2, 3, 224, 224])
Output shape: torch.Size([2, 1000])
Total parameters: 86,540,008

=== ViT Configurations ===
ViT-Tiny: 5,710,504 parameters
ViT-Small: 22,036,840 parameters
ViT-Base: 86,540,008 parameters
ViT-Large: 304,252,904 parameters


# 6. ViT for CIFAR-10

위의 코드를 완성했다면, 아래 코드를 실행하여 전체 모델을 테스트할 수 있습니다.

In [21]:
class ViTCIFAR10(nn.Module):
    def __init__(
        self,
        img_size=32,
        patch_size=4,
        in_channels=3,
        num_classes=10,
        emb_size=256,
        depth=6,
        num_heads=8,
        mlp_ratio=4,
        dropout=0.1
    ):
        super().__init__()

        self.patch_embed = PatchEmbedding(in_channels, patch_size, emb_size, img_size)

        self.blocks = nn.ModuleList([
            TransformerEncoderBlock(emb_size, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])

        self.norm = nn.LayerNorm(emb_size)
        self.head = nn.Linear(emb_size, num_classes)

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)
            elif isinstance(m, nn.Conv2d):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        if hasattr(self.patch_embed, 'cls_token'):
            nn.init.trunc_normal_(self.patch_embed.cls_token, std=0.02)
        if hasattr(self.patch_embed, 'positions'):
            nn.init.trunc_normal_(self.patch_embed.positions, std=0.02)

    def forward(self, x: Tensor) -> Tensor:
        x = self.patch_embed(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)
        cls_token = x[:, 0]
        x = self.head(cls_token)

        return x


def train_one_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        running_loss += loss.item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

        if batch_idx % 100 == 0:
            print(f'Batch {batch_idx}: Loss {loss.item():.4f}, Acc {100.*correct/total:.2f}%')

    return running_loss / len(dataloader), 100. * correct / total


def test(model, dataloader, criterion, device):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()

            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

    test_loss /= len(dataloader)
    accuracy = 100. * correct / total

    return test_loss, accuracy


def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transform_train = transforms.Compose([
        transforms.RandomResizedCrop(32, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])

    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=4, pin_memory=True)

    model = ViTCIFAR10(
        img_size=32,
        patch_size=4,
        num_classes=10,
        emb_size=256,
        depth=6,
        num_heads=4,
        dropout=0.1
    ).to(device)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(
        model.parameters(),
        lr=5e-4,
        weight_decay=0.03,
        betas=(0.9, 0.999)
    )

    num_epochs = 30
    warmup_epochs = 1
    total_steps = len(train_loader) * num_epochs
    warmup_steps = len(train_loader) * warmup_epochs
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    best_acc = 0

    for epoch in tqdm(range(num_epochs)):
        print(f'\nEpoch {epoch+1}/{num_epochs}')

        start_time = time.time()
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = test(model, test_loader, criterion, device)

        for _ in range(len(train_loader)):
            scheduler.step()

        epoch_time = time.time() - start_time
        current_lr = optimizer.param_groups[0]['lr']

        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
        print(f'LR: {current_lr:.6f}, Epoch time: {epoch_time:.2f}s')

        if test_acc > best_acc:
            best_acc = test_acc
            torch.save(model.state_dict(), 'vit_cifar10_best.pth')

        if test_acc > 90.0:
            print(f"Reached target accuracy!")
            break

    print(f'\nBest Test Accuracy: {best_acc:.2f}%')


if __name__ == "__main__":
    model = ViTCIFAR10(emb_size=256, depth=6, num_heads=4)
    x = torch.randn(4, 3, 32, 32)
    out = model(x)

    print(f'Input shape: {x.shape}')
    print(f'Output shape: {out.shape}')
    print(f'Parameters: {sum(p.numel() for p in model.parameters()):,}')

    main()

Input shape: torch.Size([4, 3, 32, 32])
Output shape: torch.Size([4, 10])
Parameters: 4,766,474


  0%|          | 0/30 [00:00<?, ?it/s]


Epoch 1/30
Batch 0: Loss 2.3533, Acc 9.77%
Batch 100: Loss 2.3398, Acc 11.17%


  3%|▎         | 1/30 [00:44<21:21, 44.20s/it]

Train Loss: 2.3290, Train Acc: 11.34%
Test Loss: 2.3277, Test Acc: 11.89%
LR: 0.000500, Epoch time: 44.15s

Epoch 2/30
Batch 0: Loss 2.3341, Acc 10.94%
Batch 100: Loss 1.9605, Acc 26.14%


  7%|▋         | 2/30 [01:26<20:09, 43.20s/it]

Train Loss: 1.9489, Train Acc: 30.16%
Test Loss: 1.7733, Test Acc: 39.28%
LR: 0.000499, Epoch time: 42.45s

Epoch 3/30
Batch 0: Loss 1.7866, Acc 38.67%
Batch 100: Loss 1.7769, Acc 39.26%


 10%|█         | 3/30 [02:10<19:38, 43.65s/it]

Train Loss: 1.7381, Train Acc: 41.43%
Test Loss: 1.6473, Test Acc: 46.47%
LR: 0.000494, Epoch time: 44.13s

Epoch 4/30
Batch 0: Loss 1.7577, Acc 46.88%
Batch 100: Loss 1.5907, Acc 46.93%


 13%|█▎        | 4/30 [02:54<18:52, 43.57s/it]

Train Loss: 1.6079, Train Acc: 48.44%
Test Loss: 1.5705, Test Acc: 50.38%
LR: 0.000487, Epoch time: 43.41s

Epoch 5/30
Batch 0: Loss 1.5572, Acc 50.00%
Batch 100: Loss 1.5119, Acc 50.89%


 17%|█▋        | 5/30 [03:37<18:09, 43.58s/it]

Train Loss: 1.5401, Train Acc: 51.72%
Test Loss: 1.5192, Test Acc: 52.65%
LR: 0.000477, Epoch time: 43.52s

Epoch 6/30
Batch 0: Loss 1.4863, Acc 54.30%
Batch 100: Loss 1.5363, Acc 53.62%


 20%|██        | 6/30 [04:21<17:26, 43.61s/it]

Train Loss: 1.5004, Train Acc: 53.57%
Test Loss: 1.4643, Test Acc: 55.04%
LR: 0.000464, Epoch time: 43.63s

Epoch 7/30
Batch 0: Loss 1.4249, Acc 58.98%
Batch 100: Loss 1.3975, Acc 55.34%


 23%|██▎       | 7/30 [05:05<16:41, 43.56s/it]

Train Loss: 1.4678, Train Acc: 55.27%
Test Loss: 1.4641, Test Acc: 55.82%
LR: 0.000449, Epoch time: 43.42s

Epoch 8/30
Batch 0: Loss 1.4750, Acc 55.08%
Batch 100: Loss 1.4033, Acc 56.40%


 27%|██▋       | 8/30 [05:49<16:02, 43.73s/it]

Train Loss: 1.4338, Train Acc: 57.13%
Test Loss: 1.4693, Test Acc: 56.28%
LR: 0.000431, Epoch time: 44.05s

Epoch 9/30
Batch 0: Loss 1.4133, Acc 59.77%
Batch 100: Loss 1.4163, Acc 58.28%


 30%|███       | 9/30 [06:32<15:16, 43.66s/it]

Train Loss: 1.4101, Train Acc: 57.97%
Test Loss: 1.4211, Test Acc: 57.28%
LR: 0.000412, Epoch time: 43.44s

Epoch 10/30
Batch 0: Loss 1.3806, Acc 57.03%
Batch 100: Loss 1.4415, Acc 59.13%


 33%|███▎      | 10/30 [07:16<14:32, 43.65s/it]

Train Loss: 1.3839, Train Acc: 59.33%
Test Loss: 1.3950, Test Acc: 58.62%
LR: 0.000390, Epoch time: 43.58s

Epoch 11/30
Batch 0: Loss 1.3819, Acc 56.25%
Batch 100: Loss 1.3042, Acc 60.39%


 37%|███▋      | 11/30 [08:00<13:50, 43.72s/it]

Train Loss: 1.3558, Train Acc: 60.82%
Test Loss: 1.3792, Test Acc: 59.56%
LR: 0.000367, Epoch time: 43.84s

Epoch 12/30
Batch 0: Loss 1.3044, Acc 64.06%
Batch 100: Loss 1.3803, Acc 61.51%


 40%|████      | 12/30 [08:43<13:04, 43.59s/it]

Train Loss: 1.3385, Train Acc: 61.60%
Test Loss: 1.3880, Test Acc: 60.13%
LR: 0.000343, Epoch time: 43.23s

Epoch 13/30
Batch 0: Loss 1.3345, Acc 62.89%
Batch 100: Loss 1.4197, Acc 62.30%


 43%|████▎     | 13/30 [09:27<12:21, 43.60s/it]

Train Loss: 1.3164, Train Acc: 62.61%
Test Loss: 1.3621, Test Acc: 60.60%
LR: 0.000317, Epoch time: 43.58s

Epoch 14/30
Batch 0: Loss 1.3009, Acc 64.84%
Batch 100: Loss 1.2477, Acc 63.74%


 47%|████▋     | 14/30 [10:10<11:37, 43.61s/it]

Train Loss: 1.2869, Train Acc: 63.95%
Test Loss: 1.2910, Test Acc: 63.54%
LR: 0.000290, Epoch time: 43.57s

Epoch 15/30
Batch 0: Loss 1.3124, Acc 62.89%
Batch 100: Loss 1.3135, Acc 65.05%


 50%|█████     | 15/30 [10:54<10:52, 43.52s/it]

Train Loss: 1.2616, Train Acc: 65.04%
Test Loss: 1.3362, Test Acc: 62.02%
LR: 0.000264, Epoch time: 43.30s

Epoch 16/30
Batch 0: Loss 1.1430, Acc 69.92%
Batch 100: Loss 1.2600, Acc 65.69%


 53%|█████▎    | 16/30 [11:37<10:11, 43.65s/it]

Train Loss: 1.2465, Train Acc: 65.79%
Test Loss: 1.2913, Test Acc: 63.73%
LR: 0.000236, Epoch time: 43.93s

Epoch 17/30
Batch 0: Loss 1.1527, Acc 67.19%
Batch 100: Loss 1.1159, Acc 66.49%


 57%|█████▋    | 17/30 [12:21<09:25, 43.52s/it]

Train Loss: 1.2247, Train Acc: 66.66%
Test Loss: 1.2332, Test Acc: 66.86%
LR: 0.000210, Epoch time: 43.15s

Epoch 18/30
Batch 0: Loss 1.1589, Acc 70.70%
Batch 100: Loss 1.2698, Acc 68.08%


 60%|██████    | 18/30 [13:04<08:41, 43.49s/it]

Train Loss: 1.2008, Train Acc: 68.01%
Test Loss: 1.2283, Test Acc: 66.44%
LR: 0.000183, Epoch time: 43.41s

Epoch 19/30
Batch 0: Loss 1.1592, Acc 68.36%
Batch 100: Loss 1.2030, Acc 68.92%


 63%|██████▎   | 19/30 [13:48<08:00, 43.69s/it]

Train Loss: 1.1789, Train Acc: 69.02%
Test Loss: 1.2133, Test Acc: 67.73%
LR: 0.000157, Epoch time: 44.12s

Epoch 20/30
Batch 0: Loss 1.0872, Acc 72.66%
Batch 100: Loss 1.1561, Acc 69.79%


 67%|██████▋   | 20/30 [14:32<07:15, 43.58s/it]

Train Loss: 1.1536, Train Acc: 70.17%
Test Loss: 1.1927, Test Acc: 68.16%
LR: 0.000133, Epoch time: 43.27s

Epoch 21/30
Batch 0: Loss 1.1287, Acc 69.92%
Batch 100: Loss 1.0940, Acc 71.16%


 70%|███████   | 21/30 [15:15<06:31, 43.48s/it]

Train Loss: 1.1371, Train Acc: 71.01%
Test Loss: 1.2071, Test Acc: 67.83%
LR: 0.000110, Epoch time: 43.26s

Epoch 22/30
Batch 0: Loss 1.0613, Acc 75.78%
Batch 100: Loss 1.2173, Acc 72.10%


 73%|███████▎  | 22/30 [15:59<05:48, 43.61s/it]

Train Loss: 1.1178, Train Acc: 72.01%
Test Loss: 1.2079, Test Acc: 68.10%
LR: 0.000088, Epoch time: 43.90s

Epoch 23/30
Batch 0: Loss 1.0977, Acc 69.92%
Batch 100: Loss 1.1111, Acc 73.11%


 77%|███████▋  | 23/30 [16:42<05:05, 43.59s/it]

Train Loss: 1.0977, Train Acc: 72.84%
Test Loss: 1.1663, Test Acc: 69.37%
LR: 0.000069, Epoch time: 43.50s

Epoch 24/30
Batch 0: Loss 1.0518, Acc 76.17%
Batch 100: Loss 1.1036, Acc 74.04%


 80%|████████  | 24/30 [17:26<04:22, 43.67s/it]

Train Loss: 1.0801, Train Acc: 73.87%
Test Loss: 1.1588, Test Acc: 70.24%
LR: 0.000051, Epoch time: 43.80s

Epoch 25/30
Batch 0: Loss 1.0344, Acc 76.17%
Batch 100: Loss 1.1219, Acc 74.44%


 83%|████████▎ | 25/30 [18:10<03:38, 43.63s/it]

Train Loss: 1.0632, Train Acc: 74.32%
Test Loss: 1.1619, Test Acc: 70.00%
LR: 0.000036, Epoch time: 43.53s

Epoch 26/30
Batch 0: Loss 1.0024, Acc 77.34%
Batch 100: Loss 1.0354, Acc 75.07%


 87%|████████▋ | 26/30 [18:53<02:54, 43.53s/it]

Train Loss: 1.0524, Train Acc: 74.89%
Test Loss: 1.1527, Test Acc: 70.73%
LR: 0.000023, Epoch time: 43.24s

Epoch 27/30
Batch 0: Loss 1.0178, Acc 76.56%
Batch 100: Loss 0.9869, Acc 75.80%


 90%|█████████ | 27/30 [19:37<02:11, 43.75s/it]

Train Loss: 1.0381, Train Acc: 75.77%
Test Loss: 1.1442, Test Acc: 70.98%
LR: 0.000013, Epoch time: 44.22s

Epoch 28/30
Batch 0: Loss 0.9816, Acc 78.12%
Batch 100: Loss 1.0609, Acc 76.18%


 93%|█████████▎| 28/30 [20:21<01:27, 43.64s/it]

Train Loss: 1.0302, Train Acc: 75.94%
Test Loss: 1.1435, Test Acc: 71.11%
LR: 0.000006, Epoch time: 43.33s

Epoch 29/30
Batch 0: Loss 1.0391, Acc 75.39%
Batch 100: Loss 1.0250, Acc 76.42%


 97%|█████████▋| 29/30 [21:04<00:43, 43.62s/it]

Train Loss: 1.0241, Train Acc: 76.33%
Test Loss: 1.1398, Test Acc: 71.25%
LR: 0.000001, Epoch time: 43.51s

Epoch 30/30
Batch 0: Loss 0.9441, Acc 80.08%
Batch 100: Loss 1.0368, Acc 76.24%


100%|██████████| 30/30 [21:48<00:00, 43.61s/it]

Train Loss: 1.0248, Train Acc: 76.41%
Test Loss: 1.1433, Test Acc: 71.04%
LR: 0.000000, Epoch time: 43.59s

Best Test Accuracy: 71.25%





ViT는 일반적으로 대규모 데이터셋에서 사전 학습된(pretrained) 모델을 활용하는 경우가 많기 때문에, 하이퍼파라미터를 조정하거나 학습 epoch을 늘리면 성능이 개선될 수는 있지만, 소규모 데이터셋에서 처음부터 학습한 ViT의 성능이 낮은 것은 구조적 한계에 가깝습니다.