In [None]:
# === Cell 1: Shared Cache Bootstrap ===
import os, pathlib, torch

# Shared cache root; the AI_CACHE_ROOT environment variable overrides the default.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "/mnt/ai/cache")

# Environment variable -> directory (under the cache root) it is pointed at.
cache_dirs = {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}
for env_name in cache_dirs:
    cache_dir = cache_dirs[env_name]
    # Export the variable first, then make sure the directory exists.
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

print(f"[Cache] Root: {AI_CACHE_ROOT}")

# Report GPU visibility and, when a GPU is present, the name and total memory of device 0.
gpu_available = torch.cuda.is_available()
print(f"[GPU] Available: {gpu_available}")
if gpu_available:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"[GPU] Device: {torch.cuda.get_device_name(0)}")
    print(f"[GPU] Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# === Cell 2: Import Dependencies ===
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from tqdm import tqdm
import time

# Seed the torch and NumPy global RNGs so that Restart & Run All starts from the
# same random state every time (GPU kernels may still introduce small
# run-to-run differences).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use CUDA when a GPU is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# === Cell 3: CIFAR-10 Dataset Loading & Preprocessing ===
# Class names, indexed by the integer label the dataset returns.
classes = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")

# Per-channel mean / std applied by the Normalize step of both pipelines.
cifar_mean = (0.4914, 0.4822, 0.4465)
cifar_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: random flip + small rotation, then tensor conversion and normalization.
transform_train = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize(cifar_mean, cifar_std),
    ]
)

# Test pipeline: no augmentation, only tensor conversion and normalization.
transform_test = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(cifar_mean, cifar_std)]
)

# Both splits are downloaded (if missing) into the same folder under AI_CACHE_ROOT.
data_root = f"{AI_CACHE_ROOT}/data"
trainset = torchvision.datasets.CIFAR10(
    root=data_root, train=True, download=True, transform=transform_train
)
testset = torchvision.datasets.CIFAR10(
    root=data_root, train=False, download=True, transform=transform_test
)

# Smaller batches on CPU, larger when a GPU is available.
if torch.cuda.is_available():
    batch_size = 64
else:
    batch_size = 32

# Only the training loader shuffles; both use two worker processes.
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

for caption, value in (
    ("Training samples", len(trainset)),
    ("Test samples", len(testset)),
    ("Batch size", batch_size),
):
    print(f"{caption}: {value}")


# Visualize sample images
def show_sample_images(dataloader, num_images=8):
    """Display the first ``num_images`` images of one batch with their class names.

    Args:
        dataloader: Iterable yielding ``(images, labels)`` batches. Images are
            expected to be normalized with the mean/std used by the notebook's
            transforms; they are denormalized here before display.
        num_images: Maximum number of images to show. It is capped at the
            batch size, and the grid is sized to fit (up to 4 columns).

    Returns:
        None. The figure is rendered with ``plt.show()``. Nothing is drawn
        when there is no image to show.
    """
    images, labels = next(iter(dataloader))

    # Never index past the end of the batch.
    num_images = min(num_images, len(images))
    if num_images <= 0:
        return

    # Shape the statistics as (3, 1, 1) so they broadcast over a (3, H, W) image.
    std = torch.tensor([0.2023, 0.1994, 0.2010]).view(3, 1, 1)
    mean = torch.tensor([0.4914, 0.4822, 0.4465]).view(3, 1, 1)

    # Size the grid from num_images instead of assuming exactly 2 x 4.
    n_cols = min(4, num_images)
    n_rows = -(-num_images // n_cols)  # ceiling division
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(3 * n_cols, 3 * n_rows), squeeze=False
    )

    for i, ax in enumerate(axes.flat):
        ax.axis("off")
        if i >= num_images:
            continue  # trailing grid cell with no image: leave it blank

        # Undo the normalization and clamp into the displayable [0, 1] range.
        img = torch.clamp(images[i] * std + mean, 0, 1)

        # imshow expects channels last.
        ax.imshow(img.permute(1, 2, 0))
        ax.set_title(f"{classes[int(labels[i])]}")
    plt.tight_layout()
    plt.show()


show_sample_images(trainloader)

In [None]:
# === Cell 4: CNN Model Architecture ===
class SimpleCNN(nn.Module):
    """
    Compact convolutional classifier for CIFAR-10-sized images.

    The feature extractor is three stages of (Conv-BatchNorm-ReLU) x 2 followed
    by 2x2 max pooling and channel dropout; the head applies global average
    pooling and a small two-layer MLP that outputs ``num_classes`` logits.
    """

    def __init__(self, num_classes=10, dropout_rate=0.3):
        super().__init__()

        # (in_channels, out_channels) per stage; each stage halves the spatial
        # size (32x32 -> 16x16 -> 8x8 -> 4x4 for 32x32 inputs).
        stage_channels = ((3, 32), (32, 64), (64, 128))
        feature_layers = []
        for in_ch, out_ch in stage_channels:
            feature_layers.extend(self._conv_stage(in_ch, out_ch, dropout_rate))
        self.features = nn.Sequential(*feature_layers)

        # Head: global average pooling, then 128 -> 64 -> num_classes.
        head_layers = [
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
            nn.Linear(64, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    @staticmethod
    def _conv_stage(in_ch, out_ch, dropout_rate):
        """Return the layer list for one stage: two conv units, pooling, dropout."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Dropout2d(dropout_rate),
        ]

    def forward(self, x):
        """Map a batch of images to per-class logits."""
        return self.classifier(self.features(x))


# Build the network with 10 output classes and move its parameters/buffers to `device`
model = SimpleCNN(num_classes=10).to(device)


# Model summary
def count_parameters(model):
    """Return the total element count of all parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue  # frozen parameters are not counted
        trainable_total += param.numel()
    return trainable_total


# Count once, then report the parameter count and its fp32 footprint (4 bytes each).
n_trainable = count_parameters(model)
print(f"Model parameters: {n_trainable:,}")
print(f"Model size: {n_trainable * 4 / 1e6:.2f} MB (fp32)")

In [None]:
# === Cell 5: Training Configuration ===
# Training hyperparameters (low-VRAM friendly)
num_epochs, learning_rate, weight_decay = 10, 0.001, 1e-4

# Loss, optimizer, and a step schedule that halves the learning rate every 5 scheduler steps
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# Per-epoch history lists, filled in by the training cell
train_losses, train_accuracies, val_accuracies = [], [], []


def evaluate_model(model, dataloader, device, loss_fn=None):
    """Compute accuracy and mean loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and left in that mode; gradients are
    not tracked.

    Args:
        model: Module mapping a batch of inputs to per-class logits.
        dataloader: Iterable yielding ``(data, targets)`` batches.
        device: Device the batches are moved to before the forward pass.
        loss_fn: Callable ``loss_fn(outputs, targets)`` expected to return the
            batch-mean loss (the default reduction of ``nn.CrossEntropyLoss``).
            When omitted, the notebook-level ``criterion`` is used.

    Returns:
        Tuple ``(accuracy, avg_loss)``: accuracy in percent and the mean loss
        per sample.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    if loss_fn is None:
        loss_fn = criterion  # notebook-level loss defined with the optimizer

    model.eval()
    correct = 0
    total = 0
    loss_sum = 0.0

    with torch.no_grad():
        for data, targets in dataloader:
            data, targets = data.to(device), targets.to(device)
            outputs = model(data)
            n_samples = targets.size(0)

            # Weight the batch-mean loss by the batch size so that a smaller
            # final batch does not count as much as a full one.
            loss_sum += loss_fn(outputs, targets).item() * n_samples

            predicted = outputs.argmax(dim=1)
            total += n_samples
            correct += (predicted == targets).sum().item()

    if total == 0:
        raise ValueError("evaluate_model: dataloader yielded no samples")

    accuracy = 100 * correct / total
    avg_loss = loss_sum / total
    return accuracy, avg_loss

In [None]:
# === Cell 6: Training Loop ===
print("Starting training...")
start_time = time.time()

# Learning rate in effect during each epoch of this run; recorded as training
# proceeds so the schedule plot shows what was really used.
lr_history = []

for epoch in range(num_epochs):
    model.train()
    lr_history.append(optimizer.param_groups[0]["lr"])

    running_loss = 0.0  # summed batch losses for the current 100-batch display window
    epoch_loss = 0.0  # sample-weighted loss summed over the whole epoch
    correct = 0
    total = 0

    # Training loop with progress bar
    pbar = tqdm(trainloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for i, (inputs, labels) in enumerate(pbar):
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Statistics
        batch_loss = loss.item()
        n_samples = labels.size(0)
        running_loss += batch_loss
        epoch_loss += batch_loss * n_samples
        predicted = outputs.argmax(dim=1)
        total += n_samples
        correct += (predicted == labels).sum().item()

        # Refresh the progress bar every 100 batches with the window-average
        # loss and the accuracy so far in this epoch.
        if i % 100 == 99:
            avg_loss = running_loss / 100
            accuracy = 100 * correct / total
            pbar.set_postfix({"Loss": f"{avg_loss:.3f}", "Acc": f"{accuracy:.1f}%"})
            running_loss = 0.0

    # Epoch statistics (the test loader doubles as the validation set here)
    train_acc = 100 * correct / total
    val_acc, val_loss = evaluate_model(model, testloader, device)

    train_losses.append(epoch_loss / total)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(
        f"Epoch [{epoch+1}/{num_epochs}] - "
        f"Train Acc: {train_acc:.2f}% - "
        f"Val Acc: {val_acc:.2f}% - "
        f"Val Loss: {val_loss:.3f}"
    )

    # Step scheduler
    scheduler.step()

training_time = time.time() - start_time
print(f"\nTraining completed in {training_time:.1f} seconds")

# Plot training curves
fig, (ax_acc, ax_lr) = plt.subplots(1, 2, figsize=(12, 4))

# Derive the x-axis from the history length so the plot stays consistent even
# if this cell is run more than once and the history lists keep growing.
acc_epochs = range(1, len(train_accuracies) + 1)
ax_acc.plot(acc_epochs, train_accuracies, "b-", label="Training")
ax_acc.plot(acc_epochs, val_accuracies, "r-", label="Validation")
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy (%)")
ax_acc.set_title("Training & Validation Accuracy")
ax_acc.legend()
ax_acc.grid(True)

# Plot the recorded learning rates rather than re-deriving them from the
# scheduler's final state, which has already been decayed.
ax_lr.plot(range(1, len(lr_history) + 1), lr_history)
ax_lr.set_xlabel("Epoch")
ax_lr.set_ylabel("Learning Rate")
ax_lr.set_title("Learning Rate Schedule")
ax_lr.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# === Cell 7: Model Evaluation & Visualization ===
# Final evaluation on test set
model.eval()
all_predicted = []
all_labels = []
all_outputs = []  # per-sample softmax probabilities

print("Evaluating on test set...")
with torch.no_grad():
    for data, targets in tqdm(testloader):
        data, targets = data.to(device), targets.to(device)
        outputs = model(data)
        predicted = outputs.argmax(dim=1)

        all_predicted.extend(predicted.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())
        all_outputs.extend(F.softmax(outputs, dim=1).cpu().numpy())

# Pin the label set so the report and the confusion matrix always have one
# row/column per class, even if a class never shows up in labels or predictions.
label_ids = list(range(len(classes)))

# Classification report
print("\n=== Classification Report ===")
print(
    classification_report(
        all_labels, all_predicted, labels=label_ids, target_names=classes
    )
)

# Confusion matrix
cm = confusion_matrix(all_labels, all_predicted, labels=label_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Per-class accuracy: correct predictions divided by the number of true samples
# of that class. Classes with no true samples report 0 instead of dividing by zero.
row_totals = cm.sum(axis=1)
class_accuracy = np.divide(
    cm.diagonal(),
    row_totals,
    out=np.zeros(len(label_ids), dtype=float),
    where=row_totals > 0,
)
for i, acc in enumerate(class_accuracy):
    print(f"{classes[i]}: {acc:.3f}")


# Visualize predictions
def show_predictions(model, testloader, device, num_images=8):
    """Plot the first images of one test batch with true and predicted labels.

    Each title shows the true class, the predicted class and the softmax
    probability of the prediction; it is green when they match, red otherwise.

    Args:
        model: Trained classifier; it is put into eval mode.
        testloader: Iterable yielding ``(images, labels)`` batches of images
            normalized with the notebook's mean/std.
        device: Device on which the forward pass runs.
        num_images: Maximum number of images to show. It is capped at the
            batch size, and the grid is sized to fit (up to 4 columns).

    Returns:
        None. The figure is rendered with ``plt.show()``. Nothing is drawn
        when there is no image to show.
    """
    model.eval()
    images, labels = next(iter(testloader))

    # Never index past the end of the batch.
    num_images = min(num_images, len(images))
    if num_images <= 0:
        return

    with torch.no_grad():
        outputs = model(images.to(device))
        predicted = outputs.argmax(dim=1).cpu()
        probabilities = F.softmax(outputs, dim=1).cpu()

    # Keep everything used for plotting on the CPU.
    images = images.cpu()
    labels = labels.cpu()

    # Shape the statistics as (3, 1, 1) so they broadcast over a (3, H, W) image.
    std = torch.tensor([0.2023, 0.1994, 0.2010]).view(3, 1, 1)
    mean = torch.tensor([0.4914, 0.4822, 0.4465]).view(3, 1, 1)

    # Size the grid from num_images instead of assuming exactly 2 x 4.
    n_cols = min(4, num_images)
    n_rows = -(-num_images // n_cols)  # ceiling division
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(3.75 * n_cols, 4 * n_rows), squeeze=False
    )

    for i, ax in enumerate(axes.flat):
        ax.axis("off")
        if i >= num_images:
            continue  # trailing grid cell with no image: leave it blank

        # Denormalize image for display
        img = torch.clamp(images[i] * std + mean, 0, 1)
        ax.imshow(img.permute(1, 2, 0))

        true_idx = int(labels[i])
        pred_idx = int(predicted[i])
        true_label = classes[true_idx]
        pred_label = classes[pred_idx]
        confidence = probabilities[i][pred_idx].item()

        color = "green" if true_idx == pred_idx else "red"
        ax.set_title(
            f"True: {true_label}\nPred: {pred_label} ({confidence:.2f})",
            color=color,
            fontsize=10,
        )

    plt.tight_layout()
    plt.show()


show_predictions(model, testloader, device)

# Save model checkpoint
model_path = f"{AI_CACHE_ROOT}/models"
pathlib.Path(model_path).mkdir(parents=True, exist_ok=True)

# Bundle the weights, optimizer state, epoch count, last validation accuracy
# and the constructor arguments needed to rebuild the model.
checkpoint_state = {
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "epoch": num_epochs,
    "val_accuracy": val_accuracies[-1],
    "model_config": {"num_classes": 10, "dropout_rate": 0.3},
}
torch.save(checkpoint_state, f"{model_path}/cifar10_cnn_checkpoint.pth")

print(f"Model saved to {model_path}/cifar10_cnn_checkpoint.pth")

In [None]:
# === Cell 8: Smoke Test ===
print("=== Smoke Test: Model Loading & Inference ===")

# Reload the checkpoint, rebuild the network from its saved config, restore the weights
checkpoint = torch.load(f"{model_path}/cifar10_cnn_checkpoint.pth", map_location=device)
test_model = SimpleCNN(**checkpoint["model_config"])
test_model = test_model.to(device)
test_model.load_state_dict(checkpoint["model_state_dict"])
test_model.eval()

# Forward one random 1x3x32x32 tensor to confirm that inference runs
test_input = torch.randn(1, 3, 32, 32).to(device)
with torch.no_grad():
    test_output = test_model(test_input)
test_prediction = test_output.argmax(dim=1)

print(f"✅ Model loading: SUCCESS")
print(f"✅ Inference shape: {test_output.shape}")
print(f"✅ Prediction class: {classes[test_prediction.item()]}")

# Peak GPU memory allocated so far (GPU runs only)
if torch.cuda.is_available():
    memory_used = torch.cuda.max_memory_allocated() / 1e9
    print(f"✅ Peak GPU memory: {memory_used:.2f} GB")

print(f"✅ Final validation accuracy: {val_accuracies[-1]:.2f}%")
print("✅ All tests passed!")

In [None]:
# === Cell 9: Usage Notes & Extensions ===
# Prints a static, bilingual (Chinese/English) cheat sheet: when to use a CNN,
# which parameters to tune, performance tips and common issues.
# Purely informational; nothing is computed and no notebook state changes.
print(
    """
=== 何時使用 CNN (When to use CNN) ===

🎯 適用場景 (Use Cases):
• 影像分類、物件偵測、影像分割
• 任何具有空間結構的資料 (圖片、醫學影像、衛星圖)
• 需要平移不變性 (translation invariance) 的任務

⚙️ 關鍵參數調整 (Key Parameters):
• batch_size: 根據 GPU 記憶體調整 (4GB: 32, 8GB: 64, 16GB+: 128)
• learning_rate: 0.001 (Adam) 或 0.01 (SGD)
• dropout_rate: 0.2-0.5 防止過擬合
• data_augmentation: 小資料集必用

🚀 效能優化 (Performance Tips):
• 使用 BatchNorm 加速收斂
• Global Average Pooling 減少參數
• Mixed precision training (torch.cuda.amp)
• 梯度累積處理大 batch size

🔧 常見問題 (Common Issues):
• 過擬合: 增加 Dropout、Data Augmentation
• 收斂慢: 檢查學習率、使用預訓練模型
• 記憶體不足: 減少 batch_size、使用 gradient checkpointing
"""
)

In [None]:
# === Smoke Test Cell ===
# No device assertion on purpose: the notebook is meant to run on GPU and CPU alike.
assert not model.training, "model should be in eval mode after the evaluation cell"
assert val_accuracies, "no validation accuracy recorded - run the training cell first"
assert (
    val_accuracies[-1] > 50.0
), f"validation accuracy too low: {val_accuracies[-1]:.2f}%"  # reasonable accuracy threshold
assert os.path.exists(
    f"{AI_CACHE_ROOT}/models/cifar10_cnn_checkpoint.pth"
), "checkpoint file was not written"
print("✅ All smoke tests passed!")



## 6. 本章小結

### ✅ 完成項目 (Completed Items)
- **CNN 架構實作**：包含卷積、批次正規化、池化、Dropout 的完整模型
- **CIFAR-10 訓練流程**：資料載入、增強、訓練、評估的端對端流程
- **低 VRAM 優化**：支援 4GB+ GPU，包含 CPU 後備方案
- **模型評估**：準確率、混淆矩陣、分類報告、預測視覺化
- **模型持久化**：檢查點儲存與載入機制

### 🧠 核心原理 (Core Concepts)
- **平移等變與不變性 (Translation Equivariance / Invariance)**：卷積層透過權重共享，在影像的每個位置偵測相同的局部特徵（等變性）；池化層再提供近似的平移不變性
- **階層式特徵學習**：淺層學習邊緣，深層學習複雜模式
- **正規化技術**：BatchNorm 加速訓練，Dropout 防止過擬合
- **資料增強**：RandomHorizontalFlip、RandomRotation 提升模型泛化能力

### ⚠️ 常見陷阱 (Common Pitfalls)
- **記憶體爆炸**：batch_size 過大導致 CUDA OOM
- **梯度消失**：網路過深時使用 ResNet 或 BatchNorm
- **過擬合**：小資料集時必須使用 Dropout 和 Data Augmentation
- **學習率設定**：過大導致震盪，過小導致收斂慢

### 🚀 下一步建議 (Next Steps)
1. **轉移學習**：使用預訓練 ResNet/EfficientNet 提升效果
2. **進階增強**：CutMix、MixUp、AutoAugment 技術
3. **架構搜索**：嘗試 MobileNet、ShuffleNet 等輕量化模型
4. **多 GPU 訓練**：DataParallel 或 DistributedDataParallel

**準備進入 nb05_lstm_text_generation.ipynb (LSTM 文字生成)，或您想優先學習其他主題？**