# 06 組裝 LeNet 網路

## 學習目標

1. 將所有組件組裝成完整的 LeNet-like CNN
2. 實作完整的訓練迴圈
3. 在簡單資料集上訓練並驗證 loss 下降
4. 理解各層的維度變化

## LeNet 架構回顧

LeNet-5 (LeCun et al., 1998) 的架構如下（此處以現代常見的 ReLU / MaxPool / Softmax 寫法表示；原始論文使用的是 tanh 類激活函數、平均池化式的 subsampling 與 RBF 輸出層）：

```
Input (1, 32, 32)
    ↓
Conv2D(6, 5x5) → ReLU → MaxPool(2x2)
    ↓ (6, 14, 14)
Conv2D(16, 5x5) → ReLU → MaxPool(2x2)
    ↓ (16, 5, 5)
Flatten
    ↓ (400,)
FC(120) → ReLU
    ↓ (120,)
FC(84) → ReLU
    ↓ (84,)
FC(10) → Softmax
    ↓ (10,)
Output
```

我們將實作一個簡化版本，適用於 28x28 輸入（如 MNIST 風格的資料）。

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so that the random draws made later in this
# notebook are repeatable from run to run.
np.random.seed(42)
print("LeNet module loaded!")

## 第一部分：收集所有組件

將前面實作的所有層整合在一起。

In [None]:
# ================== 層定義 ==================

class Conv2D:
    """Naive 2-D convolution layer (cross-correlation, explicit Python loops).

    Parameters
    ----------
    in_channels : int
        Number of channels of the input tensor.
    out_channels : int
        Number of filters, i.e. channels of the output tensor.
    kernel_size : int or tuple (kH, kW)
        An int is expanded to a square kernel.
    stride : int
        Step between consecutive windows, shared by both spatial axes.
    padding : int
        Zero padding added on every side of both spatial axes.

    Attributes
    ----------
    W : ndarray (out_channels, in_channels, kH, kW)
    b : ndarray (out_channels,)
    dW, db : gradients of W and b, populated by ``backward``.
    cache : (X, X_pad) saved by ``forward`` for use in ``backward``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = (
            (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
        )
        self.stride = stride
        self.padding = padding

        k_h, k_w = self.kernel_size
        # Weights ~ N(0, 2 / fan_in) with fan_in = in_channels * kH * kW.
        fan_in = in_channels * k_h * k_w
        std = np.sqrt(2.0 / fan_in)
        self.W = np.random.randn(out_channels, in_channels, k_h, k_w) * std
        self.b = np.zeros(out_channels)
        self.dW = None
        self.db = None
        self.cache = None

    def forward(self, X):
        """Convolve a batch ``X`` of shape (N, C_in, H, W).

        Returns an array of shape (N, out_channels, H_out, W_out) and stores
        the input together with its padded copy for the backward pass.
        """
        batch, _, _, _ = X.shape
        k_h, k_w = self.kernel_size
        step, pad = self.stride, self.padding

        padded = X
        if pad > 0:
            padded = np.pad(X, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')

        _, _, padded_h, padded_w = padded.shape
        out_h = (padded_h - k_h) // step + 1
        out_w = (padded_w - k_w) // step + 1
        Y = np.zeros((batch, self.out_channels, out_h, out_w))

        # np.ndindex walks (n, filter, row, col) in the same order as four
        # nested range() loops would.
        for n, f, i, j in np.ndindex(batch, self.out_channels, out_h, out_w):
            top, left = i * step, j * step
            patch = padded[n, :, top:top + k_h, left:left + k_w]
            Y[n, f, i, j] = np.sum(patch * self.W[f]) + self.b[f]

        self.cache = (X, padded)
        return Y

    def backward(self, dY):
        """Back-propagate ``dY`` (same shape as the forward output).

        Fills ``self.dW`` and ``self.db`` and returns the gradient with
        respect to the (unpadded) forward input.
        """
        X, padded = self.cache
        batch, _, _, _ = X.shape
        k_h, k_w = self.kernel_size
        step, pad = self.stride, self.padding
        _, _, out_h, out_w = dY.shape

        grad_padded = np.zeros_like(padded)
        self.dW = np.zeros_like(self.W)
        self.db = np.sum(dY, axis=(0, 2, 3))

        for n, f, i, j in np.ndindex(batch, self.out_channels, out_h, out_w):
            top, left = i * step, j * step
            rows = slice(top, top + k_h)
            cols = slice(left, left + k_w)
            upstream = dY[n, f, i, j]
            # Each output element touches one input window and one filter.
            self.dW[f] += upstream * padded[n, :, rows, cols]
            grad_padded[n, :, rows, cols] += upstream * self.W[f]

        if pad > 0:
            # Drop the border that only existed because of zero padding.
            return grad_padded[:, :, pad:-pad, pad:-pad]
        return grad_padded


class MaxPool2D:
    """Max-pooling layer implemented with explicit loops.

    Parameters
    ----------
    pool_size : int or tuple (kH, kW)
        Anything that is not a tuple is expanded to a square window.
    stride : int, optional
        Step between windows on both axes; falls back to ``pool_size[0]``
        when omitted (or falsy).
    """

    def __init__(self, pool_size, stride=None):
        if isinstance(pool_size, tuple):
            self.pool_size = pool_size
        else:
            self.pool_size = (pool_size, pool_size)
        self.stride = stride or self.pool_size[0]
        self.cache = None

    def forward(self, X):
        """Pool ``X`` of shape (N, C, H, W).

        Remembers, for every output element, the absolute (row, col) of the
        winning input element so that ``backward`` can route gradients.
        """
        N, C, H, W = X.shape
        win_h, win_w = self.pool_size
        step = self.stride
        out_shape = (N, C, (H - win_h) // step + 1, (W - win_w) // step + 1)

        Y = np.zeros(out_shape)
        max_idx = np.zeros(out_shape + (2,), dtype=int)

        for n, c, i, j in np.ndindex(*out_shape):
            top, left = i * step, j * step
            window = X[n, c, top:top + win_h, left:left + win_w]
            Y[n, c, i, j] = np.max(window)
            # argmax works on the flattened window; convert back to 2-D.
            row, col = np.unravel_index(np.argmax(window), window.shape)
            max_idx[n, c, i, j] = [top + row, left + col]

        self.cache = (X.shape, max_idx)
        return Y

    def backward(self, dY):
        """Send each upstream gradient to the input position that was the max."""
        X_shape, max_idx = self.cache
        N, C, out_h, out_w = dY.shape
        dX = np.zeros(X_shape)

        for n, c, i, j in np.ndindex(N, C, out_h, out_w):
            row, col = max_idx[n, c, i, j]
            # "+=" because overlapping windows may share the same winner.
            dX[n, c, row, col] += dY[n, c, i, j]
        return dX


class ReLU:
    """Element-wise rectified linear unit: ``max(0, x)``."""

    def __init__(self):
        # Input of the most recent forward call, needed for the gradient mask.
        self.cache = None

    def forward(self, x):
        """Return ``x`` with negative entries replaced by zero."""
        self.cache = x
        activated = np.maximum(0, x)
        return activated

    def backward(self, dout):
        """Pass ``dout`` through where the cached input was strictly positive."""
        positive_mask = self.cache > 0
        return dout * positive_mask


class Flatten:
    """Reshape (N, ...) into (N, features) and back again."""

    def __init__(self):
        # Shape of the most recent forward input.
        self.cache = None

    def forward(self, X):
        """Collapse every axis after the first into a single one."""
        original_shape = X.shape
        self.cache = original_shape
        batch = original_shape[0]
        return X.reshape(batch, -1)

    def backward(self, dY):
        """Restore the shape that was seen in ``forward``."""
        original_shape = self.cache
        return dY.reshape(original_shape)


class FullyConnected:
    """Dense (affine) layer computing ``X @ W + b``.

    Parameters
    ----------
    in_features : int
        Size of each input row.
    out_features : int
        Size of each output row.
    """

    def __init__(self, in_features, out_features):
        # Weights ~ N(0, 2 / in_features); biases start at zero.
        scale = np.sqrt(2.0 / in_features)
        self.W = scale * np.random.randn(in_features, out_features)
        self.b = np.zeros(out_features)
        self.dW = None
        self.db = None
        self.cache = None

    def forward(self, X):
        """Apply the affine map and remember ``X`` for the backward pass."""
        self.cache = X
        projected = np.matmul(X, self.W)
        return projected + self.b

    def backward(self, dY):
        """Store ``dW``/``db`` and return the gradient w.r.t. the input."""
        inputs = self.cache
        self.dW = np.matmul(inputs.T, dY)
        self.db = np.sum(dY, axis=0)
        return np.matmul(dY, self.W.T)


def softmax(z):
    """Row-wise softmax of a 2-D array, computed in a numerically stable way.

    The per-row maximum is subtracted before exponentiating so that large
    logits do not overflow; the result is unchanged mathematically.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    numerators = np.exp(z - row_max)
    denominators = np.sum(numerators, axis=1, keepdims=True)
    return numerators / denominators


class SoftmaxCrossEntropy:
    """Softmax followed by mean cross-entropy over integer class labels."""

    def __init__(self):
        # (probabilities, labels) from the most recent forward call.
        self.cache = None

    def forward(self, z, y):
        """Return the mean negative log-likelihood of labels ``y`` under logits ``z``.

        ``z`` is indexed as (sample, class) and ``y`` holds one class index
        per sample. A small epsilon keeps ``log`` away from zero.
        """
        batch = z.shape[0]
        probs = softmax(z)
        eps = 1e-10
        picked = probs[np.arange(batch), y]
        loss = -np.mean(np.log(picked + eps))
        self.cache = (probs, y)
        return loss

    def backward(self):
        """Gradient of the mean loss w.r.t. the logits: ``(p - onehot(y)) / N``."""
        probs, y = self.cache
        batch = probs.shape[0]
        grad = probs.copy()
        grad[np.arange(batch), y] -= 1
        return grad / batch

# Status message ("all components have been defined") shown once the cell has run.
print("所有組件已定義完成！")

## 第二部分：組裝 LeNet

In [None]:
class LeNet:
    """Simplified LeNet-style CNN assembled from the layers defined in this file.

    Architecture (spatial sizes shown for the default 28x28 input):
    - Conv(6, 5x5, padding=2) -> ReLU -> MaxPool(2x2)   # 28 -> 28 -> 14
    - Conv(16, 5x5)           -> ReLU -> MaxPool(2x2)   # 14 -> 10 -> 5
    - Flatten -> FC(120) -> ReLU -> FC(84) -> ReLU -> FC(num_classes)
    """

    def __init__(self, input_shape=(1, 28, 28), num_classes=10):
        """
        Parameters
        ----------
        input_shape : tuple (C, H, W)
            Shape of one input image. H and W may differ.
        num_classes : int
            Number of output classes (size of the final FC layer).

        Raises
        ------
        ValueError
            If H or W is too small to survive both conv/pool stages
            (each must be at least 12).
        """
        C_in, H, W = input_shape

        # Size the first FC layer from the real feature-map height AND width,
        # so that non-square inputs get a matching fc1.
        h_out = self._feature_map_size(H)
        w_out = self._feature_map_size(W)
        if h_out < 1 or w_out < 1:
            raise ValueError(
                f"input_shape {tuple(input_shape)} is too small: "
                "height and width must each be at least 12"
            )
        flatten_size = 16 * h_out * w_out

        # Convolutional stage 1: 'same' convolution, then halve H and W.
        self.conv1 = Conv2D(C_in, 6, 5, padding=2)
        self.relu1 = ReLU()
        self.pool1 = MaxPool2D(2, stride=2)

        # Convolutional stage 2: unpadded 5x5 (shrinks by 4), then halve again.
        self.conv2 = Conv2D(6, 16, 5, padding=0)
        self.relu2 = ReLU()
        self.pool2 = MaxPool2D(2, stride=2)

        self.flatten = Flatten()

        # Fully connected classifier head.
        self.fc1 = FullyConnected(flatten_size, 120)
        self.relu3 = ReLU()
        self.fc2 = FullyConnected(120, 84)
        self.relu4 = ReLU()
        self.fc3 = FullyConnected(84, num_classes)

        self.loss_fn = SoftmaxCrossEntropy()

        # Ordered list of layers, used by forward/backward traversal.
        self.layers = [
            self.conv1, self.relu1, self.pool1,
            self.conv2, self.relu2, self.pool2,
            self.flatten,
            self.fc1, self.relu3,
            self.fc2, self.relu4,
            self.fc3
        ]

    @staticmethod
    def _feature_map_size(size):
        """Return the length of one spatial axis after conv1/pool1/conv2/pool2."""
        size = (size + 2 * 2 - 5) // 1 + 1   # conv1: 5x5, padding=2, stride=1
        size = size // 2                      # pool1: 2x2, stride=2
        size = (size - 5) // 1 + 1            # conv2: 5x5, no padding, stride=1
        return size // 2                      # pool2: 2x2, stride=2

    def forward(self, X):
        """Run ``X`` through every layer in order and return the logits."""
        out = X
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def loss(self, X, y):
        """Forward ``X`` and return the softmax cross-entropy against labels ``y``."""
        logits = self.forward(X)
        return self.loss_fn.forward(logits, y)

    def backward(self):
        """Back-propagate from the most recent ``loss`` call.

        Each layer's ``backward`` stores its own parameter gradients; nothing
        is returned.
        """
        dout = self.loss_fn.backward()
        for layer in reversed(self.layers):
            dout = layer.backward(dout)

    def get_params_and_grads(self):
        """Return ``[(param, grad), ...]`` for every layer that has weights.

        For each such layer the weight pair comes first, then the bias pair.
        The parameter arrays are the live objects, so in-place updates on
        them modify the model.
        """
        params_and_grads = []
        for layer in self.layers:
            if hasattr(layer, 'W'):
                params_and_grads.append((layer.W, layer.dW))
                params_and_grads.append((layer.b, layer.db))
        return params_and_grads

    def predict(self, X):
        """Return the arg-max class index for every sample in ``X``."""
        logits = self.forward(X)
        return np.argmax(logits, axis=1)

# Smoke-test the network structure with a 10-class model.
net = LeNet(input_shape=(1, 28, 28), num_classes=10)

# Forward-pass check on a random batch of two single-channel 28x28 images.
X_test = np.random.randn(2, 1, 28, 28)
logits = net.forward(X_test)

print("=== LeNet 網路結構 ===")
print(f"輸入: (batch, 1, 28, 28)")
print(f"\n各層輸出形狀:")

# Re-run the batch layer by layer so each intermediate shape can be printed.
# The names below are listed in the same order as net.layers.
out = X_test
layer_names = [
    "Conv1 (6, 5x5, p=2)", "ReLU1", "MaxPool1 (2x2)",
    "Conv2 (16, 5x5)", "ReLU2", "MaxPool2 (2x2)",
    "Flatten",
    "FC1 (120)", "ReLU3",
    "FC2 (84)", "ReLU4",
    "FC3 (10)"
]

for layer, name in zip(net.layers, layer_names):
    out = layer.forward(out)
    print(f"  {name:25s} → {str(out.shape)}")

print(f"\n輸出 (logits) 形狀: {logits.shape}")

## 第三部分：產生訓練資料

為了測試，我們產生一些簡單的合成資料。

In [None]:
def create_synthetic_dataset(n_samples=500, img_size=28, num_classes=4):
    """Generate a toy line-orientation classification dataset.

    Classes:
    0: horizontal line
    1: vertical line
    2: diagonal (top-left to bottom-right)
    3: anti-diagonal (top-right to bottom-left)

    Horizontal and vertical lines get a random position and a thickness of
    2 or 3 pixels. The two diagonal classes are always drawn through the
    image centre with a thickness of 2 pixels; only the noise differs
    between samples. Gaussian noise (std 0.1) is added to every image and
    the samples are shuffled before being returned.

    Parameters
    ----------
    n_samples : int
        Number of images; labels are assigned round-robin (``i % num_classes``).
    img_size : int
        Side length of the square images. Must be at least 11 whenever
        samples are generated, otherwise the position draw has an empty range.
    num_classes : int
        How many of the four classes to use (1 to 4).

    Returns
    -------
    X : ndarray (n_samples, 1, img_size, img_size)
    y : ndarray (n_samples,) of int labels

    Raises
    ------
    ValueError
        If ``num_classes`` is outside 1..4. Only four patterns exist, so any
        extra label would silently be drawn as an anti-diagonal.
    """
    if not 1 <= num_classes <= 4:
        raise ValueError(
            f"num_classes must be between 1 and 4, got {num_classes}"
        )

    X = np.zeros((n_samples, 1, img_size, img_size))
    y = np.zeros(n_samples, dtype=int)

    # Row indices covered by the diagonal patterns (3-pixel margin each side).
    diag = np.arange(3, img_size - 3)

    for i in range(n_samples):
        label = i % num_classes
        y[i] = label

        # Always draw both values, even for the diagonal classes that do not
        # use them, so the random stream stays the same for every sample.
        pos = np.random.randint(5, img_size - 5)
        width = np.random.randint(2, 4)

        if label == 0:  # horizontal line
            X[i, 0, pos:pos+width, 3:-3] = 1
        elif label == 1:  # vertical line
            X[i, 0, 3:-3, pos:pos+width] = 1
        elif label == 2:  # diagonal, two pixels thick
            X[i, 0, diag, diag] = 1
            X[i, 0, diag, diag + 1] = 1
        else:  # anti-diagonal, two pixels thick
            X[i, 0, diag, img_size - 1 - diag] = 1
            X[i, 0, diag, img_size - 2 - diag] = 1

        # Additive Gaussian noise.
        X[i, 0] += np.random.randn(img_size, img_size) * 0.1

    # Shuffle so that the round-robin label order is not visible.
    perm = np.random.permutation(n_samples)
    return X[perm], y[perm]

# Generate the data. The RNG is seeded once, so the training and validation
# sets are consecutive draws from the same random stream.
np.random.seed(42)
X_train, y_train = create_synthetic_dataset(n_samples=400, num_classes=4)
X_val, y_val = create_synthetic_dataset(n_samples=100, num_classes=4)

print(f"訓練資料: X={X_train.shape}, y={y_train.shape}")
print(f"驗證資料: X={X_val.shape}, y={y_val.shape}")
print(f"類別分佈: {np.bincount(y_train)}")

# Visualise two samples per class (one column per class).
fig, axes = plt.subplots(2, 4, figsize=(12, 6))
# Display names, in label order: horizontal, vertical, diagonal, anti-diagonal.
class_names = ['水平線', '垂直線', '對角線', '反對角線']

for i in range(4):
    # First training sample of this class.
    idx = np.where(y_train == i)[0][0]
    axes[0, i].imshow(X_train[idx, 0], cmap='gray')
    axes[0, i].set_title(f'Class {i}: {class_names[i]}')
    axes[0, i].axis('off')
    
    # Second training sample of this class.
    idx = np.where(y_train == i)[0][1]
    axes[1, i].imshow(X_train[idx, 0], cmap='gray')
    axes[1, i].axis('off')

plt.suptitle('Synthetic Dataset Samples', fontsize=14)
plt.tight_layout()
plt.show()

## 第四部分：訓練迴圈

In [None]:
def train_lenet(model, X_train, y_train, X_val, y_val, 
                epochs=50, batch_size=32, learning_rate=0.01):
    """Train a model with plain mini-batch SGD.

    Parameters
    ----------
    model : LeNet
        Any object exposing ``loss(X, y)``, ``backward()``,
        ``get_params_and_grads()`` and ``predict(X)``.
    X_train, y_train : ndarray
        Training images and integer labels.
    X_val, y_val : ndarray
        Validation images and integer labels.
    epochs : int
        Number of passes over the training set.
    batch_size : int
        Mini-batch size; the last batch of an epoch may be smaller.
    learning_rate : float
        SGD step size.

    Returns
    -------
    history : dict
        Lists with one entry per epoch under the keys ``'train_loss'``,
        ``'train_acc'``, ``'val_loss'`` and ``'val_acc'``.

    Raises
    ------
    ValueError
        If the training set is empty or ``batch_size`` is not positive.
    """
    n_samples = len(y_train)
    if n_samples == 0:
        raise ValueError("training set is empty")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }

    for epoch in range(epochs):
        # Reshuffle the training data every epoch.
        perm = np.random.permutation(n_samples)
        X_shuffled = X_train[perm]
        y_shuffled = y_train[perm]

        loss_sum = 0.0

        for start in range(0, n_samples, batch_size):
            X_batch = X_shuffled[start:start+batch_size]
            y_batch = y_shuffled[start:start+batch_size]

            # Forward. model.loss returns the batch MEAN, so weight it by the
            # batch size; otherwise a short final batch counts as much as a
            # full one in the epoch average.
            loss = model.loss(X_batch, y_batch)
            loss_sum += loss * len(y_batch)

            # Backward
            model.backward()

            # SGD step; "-=" updates the model's arrays in place.
            for param, grad in model.get_params_and_grads():
                param -= learning_rate * grad

        # Training metrics: running mean of the per-sample loss during the
        # epoch, and accuracy of the end-of-epoch weights on the full set.
        train_loss = loss_sum / n_samples
        train_pred = model.predict(X_train)
        train_acc = np.mean(train_pred == y_train)

        # Validation metrics
        val_loss = model.loss(X_val, y_val)
        val_pred = model.predict(X_val)
        val_acc = np.mean(val_pred == y_val)

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Log every 10th epoch and the final one.
        if epoch % 10 == 0 or epoch == epochs - 1:
            print(f"Epoch {epoch:3d}: "
                  f"Train Loss={train_loss:.4f}, Acc={train_acc:.4f} | "
                  f"Val Loss={val_loss:.4f}, Acc={val_acc:.4f}")

    return history

# Train (on the small synthetic dataset, to keep the run time down).
print("開始訓練 LeNet...\n")

# Re-seed so that weight initialisation and batch shuffling are repeatable.
np.random.seed(42)
model = LeNet(input_shape=(1, 28, 28), num_classes=4)

history = train_lenet(
    model, X_train, y_train, X_val, y_val,
    epochs=50, batch_size=16, learning_rate=0.01
)

In [None]:
# Visualise the training run: loss on the left, accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Loss
ax = axes[0]
ax.plot(history['train_loss'], label='Train')
ax.plot(history['val_loss'], label='Validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training & Validation Loss')
ax.legend()
ax.grid(True, alpha=0.3)

# Accuracy
ax = axes[1]
ax.plot(history['train_acc'], label='Train')
ax.plot(history['val_acc'], label='Validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Training & Validation Accuracy')
ax.set_ylim(0, 1.05)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Report the validation accuracy of the last epoch.
print(f"\n最終驗證準確率: {history['val_acc'][-1]:.4f}")

## 第五部分：視覺化學到的特徵

In [None]:
# Visualise the first-layer convolution kernels (2 x 3 grid = 6 filters).
fig, axes = plt.subplots(2, 3, figsize=(10, 6))

for i, ax in enumerate(axes.flat):
    kernel = model.conv1.W[i, 0]  # first input channel of the i-th filter
    # Symmetric colour limits so that zero maps to the centre of the colormap.
    ax.imshow(kernel, cmap='RdBu_r', vmin=-np.abs(kernel).max(), vmax=np.abs(kernel).max())
    ax.set_title(f'Conv1 Filter {i}')
    ax.axis('off')

plt.suptitle('Learned Convolutional Filters (Layer 1)', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# 視覺化特徵圖
def visualize_feature_maps(model, X_sample):
    """Plot the input and the feature maps after the first and third layers.

    ``X_sample`` is run through the first six entries of ``model.layers``;
    the output of entry 0 is shown as "Conv1" and the output of entry 2 as
    "Pool1", six channels each. Only the first sample of the batch is drawn.
    """
    # Collect a copy of every intermediate activation.
    activations = []
    current = X_sample
    for layer in model.layers[:6]:  # conv / pool part of the network only
        current = layer.forward(current)
        activations.append(current.copy())

    fig, axes = plt.subplots(3, 6, figsize=(15, 8))

    # Row 0: the raw input in the first cell, the remaining cells left blank.
    input_ax = axes[0, 0]
    input_ax.imshow(X_sample[0, 0], cmap='gray')
    input_ax.set_title('Input')
    for ax in axes[0]:
        ax.axis('off')

    # Row 1: feature maps produced by the first layer.
    conv1_maps = activations[0]
    for j, ax in enumerate(axes[1]):
        ax.imshow(conv1_maps[0, j], cmap='viridis')
        ax.set_title(f'Conv1 ch{j}')
        ax.axis('off')

    # Row 2: feature maps produced by the third layer.
    pool1_maps = activations[2]
    for j, ax in enumerate(axes[2]):
        ax.imshow(pool1_maps[0, j], cmap='viridis')
        ax.set_title(f'Pool1 ch{j}')
        ax.axis('off')

    plt.suptitle('Feature Maps at Different Layers', fontsize=14)
    plt.tight_layout()
    plt.show()

# Visualise the feature maps for one validation sample of each class.
for class_idx in range(4):
    # First validation sample with this label; slicing keeps the batch axis.
    sample_idx = np.where(y_val == class_idx)[0][0]
    X_sample = X_val[sample_idx:sample_idx+1]
    print(f"Class {class_idx}: {class_names[class_idx]}")
    visualize_feature_maps(model, X_sample)

## 練習題

### 練習 1：加入 Momentum SGD

實作 Momentum SGD 優化器，通常能比普通 SGD 更快收斂。

$$v_t = \beta \cdot v_{t-1} + \nabla L$$
$$\theta_t = \theta_{t-1} - \alpha \cdot v_t$$

In [None]:
class SGDMomentum:
    """SGD optimizer with momentum.

    One velocity buffer is kept per parameter, keyed by the parameter's
    position in the list handed to ``update``:

        v     <- momentum * v + grad
        param <- param - lr * v
    """

    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.momentum = momentum
        self.velocities = {}  # position in the param list -> velocity array

    def update(self, params_and_grads):
        """Apply one momentum step to every parameter, in place.

        Parameters
        ----------
        params_and_grads : list of (param, grad) tuples
            Must list the parameters in the same order on every call, since
            velocities are looked up by position.
        """
        for idx, pair in enumerate(params_and_grads):
            param, grad = pair

            # Lazily create a zero velocity the first time a slot is seen.
            velocity = self.velocities.get(idx)
            if velocity is None:
                velocity = self.velocities[idx] = np.zeros_like(param)

            velocity = self.momentum * velocity + grad
            self.velocities[idx] = velocity

            # In-place so the caller's parameter array is what changes.
            param -= self.lr * velocity

def train_with_momentum(model, X_train, y_train, X_val, y_val,
                        epochs=50, batch_size=32, learning_rate=0.01, momentum=0.9):
    """Train a model with mini-batch SGD plus momentum.

    Parameters
    ----------
    model : LeNet
        Any object exposing ``loss(X, y)``, ``backward()``,
        ``get_params_and_grads()`` and ``predict(X)``.
    X_train, y_train : ndarray
        Training images and integer labels.
    X_val, y_val : ndarray
        Validation images and integer labels.
    epochs, batch_size : int
        Number of passes over the data and mini-batch size.
    learning_rate, momentum : float
        Passed to ``SGDMomentum``.

    Returns
    -------
    history : dict
        Lists with one entry per epoch under ``'train_loss'`` and ``'val_acc'``.

    Raises
    ------
    ValueError
        If the training set is empty or ``batch_size`` is not positive.
    """
    n_samples = len(y_train)
    if n_samples == 0:
        raise ValueError("training set is empty")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    optimizer = SGDMomentum(learning_rate, momentum)
    history = {'train_loss': [], 'val_acc': []}

    for epoch in range(epochs):
        # Reshuffle the training data every epoch.
        perm = np.random.permutation(n_samples)
        X_shuffled = X_train[perm]
        y_shuffled = y_train[perm]

        loss_sum = 0.0

        for start in range(0, n_samples, batch_size):
            X_batch = X_shuffled[start:start+batch_size]
            y_batch = y_shuffled[start:start+batch_size]

            # model.loss returns the batch MEAN; weight it by the batch size
            # so a short final batch does not skew the epoch average.
            loss = model.loss(X_batch, y_batch)
            loss_sum += loss * len(y_batch)

            model.backward()
            optimizer.update(model.get_params_and_grads())

        train_loss = loss_sum / n_samples
        val_pred = model.predict(X_val)
        val_acc = np.mean(val_pred == y_val)

        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)

        # Log every 10th epoch and the final one.
        if epoch % 10 == 0 or epoch == epochs - 1:
            print(f"Epoch {epoch:3d}: Loss={train_loss:.4f}, Val Acc={val_acc:.4f}")

    return history

# Training comparison: a fresh model trained with Momentum SGD, using the
# same seed, epochs, batch size and learning rate as the plain-SGD run.
print("=== 使用 Momentum SGD 訓練 ===")
np.random.seed(42)
model_momentum = LeNet(input_shape=(1, 28, 28), num_classes=4)
history_momentum = train_with_momentum(
    model_momentum, X_train, y_train, X_val, y_val,
    epochs=50, batch_size=16, learning_rate=0.01, momentum=0.9
)

In [None]:
# Compare SGD vs Momentum SGD: training loss (left), validation accuracy (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Training loss per epoch for both optimizers.
ax = axes[0]
ax.plot(history['train_loss'], label='SGD')
ax.plot(history_momentum['train_loss'], label='Momentum SGD')
ax.set_xlabel('Epoch')
ax.set_ylabel('Training Loss')
ax.set_title('Loss Comparison')
ax.legend()
ax.grid(True, alpha=0.3)

# Validation accuracy per epoch for both optimizers.
ax = axes[1]
ax.plot(history['val_acc'], label='SGD')
ax.plot(history_momentum['val_acc'], label='Momentum SGD')
ax.set_xlabel('Epoch')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Accuracy Comparison')
ax.set_ylim(0, 1.05)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 練習 2：計算混淆矩陣

In [None]:
def compute_confusion_matrix(y_true, y_pred, num_classes):
    """Count (true label, predicted label) pairs.

    Parameters
    ----------
    y_true, y_pred : array-like of int
        True and predicted class indices; both must have the same shape.
    num_classes : int
        Number of classes; valid labels are ``0 .. num_classes - 1``.

    Returns
    -------
    cm : ndarray (num_classes, num_classes) of int
        ``cm[t, p]`` is the number of samples with true label ``t`` that
        were predicted as ``p``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in shape (a plain ``zip`` would
        silently drop the surplus entries).
    IndexError
        If a label is not an integer or lies outside ``0 .. num_classes - 1``
        (negative labels would otherwise wrap around to the last classes).
    """
    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)
    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_labels.shape} and {pred_labels.shape}"
        )

    cm = np.zeros((num_classes, num_classes), dtype=int)
    if true_labels.size == 0:
        return cm

    for labels in (true_labels, pred_labels):
        if labels.dtype.kind not in "iu":
            raise IndexError("class labels must be integers")
        if labels.min() < 0 or labels.max() >= num_classes:
            raise IndexError(
                f"class labels must lie in [0, {num_classes - 1}]"
            )

    # Unbuffered add: repeated (t, p) pairs are each counted, unlike
    # "cm[t, p] += 1" with index arrays.
    np.add.at(cm, (true_labels, pred_labels), 1)
    return cm

# Predict on the validation set with `model` (the plain-SGD model trained above).
y_pred = model.predict(X_val)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = compute_confusion_matrix(y_val, y_pred, num_classes=4)

# Visualise
fig, ax = plt.subplots(figsize=(6, 6))
im = ax.imshow(cm, cmap='Blues')

# Tick labels
ax.set_xticks(range(4))
ax.set_yticks(range(4))
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Annotate every cell with its count; white text on the darker cells.
for i in range(4):
    for j in range(4):
        ax.text(j, i, str(cm[i, j]), ha='center', va='center',
                color='white' if cm[i, j] > cm.max()/2 else 'black')

ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')

plt.tight_layout()
plt.show()

# Per-class accuracy: diagonal count divided by the row total (i.e. recall).
print("\n每類別的準確率:")
for i in range(4):
    class_acc = cm[i, i] / np.sum(cm[i, :])
    print(f"  {class_names[i]}: {class_acc:.4f}")

## 總結

在這個 notebook 中，我們成功地：

1. **組裝了完整的 LeNet CNN**：
   - 2 個卷積層 + ReLU + MaxPool
   - 3 個全連接層
   - Softmax + Cross-Entropy 損失

2. **實作了完整的訓練流程**：
   - Mini-batch 訓練
   - 前向傳播 → 計算損失 → 反向傳播 → 更新參數
   - 訓練/驗證指標追蹤

3. **驗證了網路能夠學習**：
   - Loss 下降
   - Accuracy 上升
   - 學到了有意義的特徵

4. **實作了優化技巧**：
   - Momentum SGD

### LeNet 架構總結

```
Input (1, 28, 28)
    ↓ Conv(6, 5x5, p=2)
(6, 28, 28)
    ↓ ReLU → MaxPool(2)
(6, 14, 14)
    ↓ Conv(16, 5x5)
(16, 10, 10)
    ↓ ReLU → MaxPool(2)
(16, 5, 5)
    ↓ Flatten
(400,)
    ↓ FC(120) → ReLU
(120,)
    ↓ FC(84) → ReLU
(84,)
    ↓ FC(num_classes)
(num_classes,)
```

### 下一步

接下來我們將學習更多優化技巧，包括：
- Weight Initialization 的重要性
- Learning Rate Schedule
- 更進階的優化器（Adam 等）