# Module 0.4: 梯度下降實作

這個 notebook 會讓你完整實作梯度下降，並理解各種變體。

## 學習目標

1. 實作基本梯度下降
2. 理解學習率的影響
3. 實作 Momentum
4. 用梯度下降做線性迴歸

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)

---
## Part 1: 基本梯度下降

### 算法

$$\mathbf{x}_{t+1} = \mathbf{x}_t - \eta \nabla f(\mathbf{x}_t)$$

- $\eta$：學習率 (learning rate)
- $\nabla f$：梯度（指向函數增長最快的方向）

**直覺**：不斷往「下坡」方向走一小步。

In [None]:
def gradient_descent(f, grad_f, x0, lr=0.1, n_iter=100, tol=1e-6):
    """
    Minimize ``f`` with plain, fixed-step gradient descent.

    Each iteration applies ``x <- x - lr * grad_f(x)``. The loop stops early
    (and prints a message) once the Euclidean norm of the gradient drops
    below ``tol``.

    Parameters
    ----------
    f : callable
        Objective function, called as ``f(x)``.
    grad_f : callable
        Gradient of the objective, called as ``grad_f(x)``.
    x0 : array_like
        Starting point. Besides ``np.ndarray``, plain lists, tuples and
        scalars are accepted. The input is copied, never modified.
    lr : float
        Learning rate (step size).
    n_iter : int
        Maximum number of iterations.
    tol : float
        Convergence tolerance: stop when the gradient norm is below it.

    Returns
    -------
    x : np.ndarray
        Final iterate.
    history : dict
        ``'x'`` and ``'f'`` hold every iterate and its objective value,
        starting point included. ``'grad_norm'`` holds the gradient norm
        measured at the start of each iteration that was entered.
    """
    # np.asarray lets plain sequences through; astype(float) always returns
    # a fresh float array, so the caller's x0 is left untouched.
    x = np.asarray(x0).astype(float)
    history = {
        'x': [x.copy()],
        'f': [f(x)],
        'grad_norm': []
    }

    for i in range(n_iter):
        grad = grad_f(x)
        grad_norm = np.linalg.norm(grad)
        history['grad_norm'].append(grad_norm)

        # Convergence check happens before the step, so a converged point
        # is returned as-is without a further update.
        if grad_norm < tol:
            print(f"Converged at iteration {i}")
            break

        # Gradient step
        x = x - lr * grad

        history['x'].append(x.copy())
        history['f'].append(f(x))

    # Convert the per-iteration lists to arrays for easy slicing/plotting.
    history['x'] = np.array(history['x'])
    history['f'] = np.array(history['f'])
    history['grad_norm'] = np.array(history['grad_norm'])

    return x, history

In [None]:
# 測試：最小化 f(x, y) = x² + y²
def f_simple(x):
    """Evaluate the bowl-shaped test objective ``x[0]**2 + x[1]**2``."""
    first, second = x[0], x[1]
    return first**2 + second**2

def grad_f_simple(x):
    """Return the gradient of ``x[0]**2 + x[1]**2`` as a length-2 array."""
    d_first = 2*x[0]
    d_second = 2*x[1]
    return np.array([d_first, d_second])

# Run plain gradient descent from (5, 5) and summarize the outcome.
x0 = np.array([5.0, 5.0])
x_opt, history = gradient_descent(
    f_simple, grad_f_simple, x0, lr=0.1, n_iter=50
)

# history['f'] also stores the value at the starting point, hence the "- 1"
# when reporting how many updates were made.
report = (
    f"起點: {x0}",
    f"終點: {x_opt}",
    f"f(終點) = {f_simple(x_opt):.6f}",
    f"迭代次數: {len(history['f']) - 1}",
)
print("\n".join(report))

In [None]:
# Visualize the convergence of the run stored in `history`:
# trajectory on the contour plot, objective value, and gradient norm.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# 1. Contour plot with the optimization trajectory
ax = axes[0]
x = np.linspace(-6, 6, 100)
y = np.linspace(-6, 6, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + Y**2  # same function as f_simple, evaluated on the grid
ax.contour(X, Y, Z, levels=20, cmap='viridis')
ax.plot(history['x'][:, 0], history['x'][:, 1], 'ro-', markersize=4)
ax.plot(history['x'][0, 0], history['x'][0, 1], 'go', markersize=10, label='Start')
ax.plot(0, 0, 'b*', markersize=15, label='Minimum')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('梯度下降軌跡')
ax.legend()

# 2. Loss curve (log scale on the y axis)
ax = axes[1]
ax.plot(history['f'], 'b-', linewidth=2)
ax.set_xlabel('Iteration')
ax.set_ylabel('f(x)')
ax.set_title('Loss 曲線')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)

# 3. Gradient norm (log scale on the y axis)
ax = axes[2]
ax.plot(history['grad_norm'], 'r-', linewidth=2)
ax.set_xlabel('Iteration')
ax.set_ylabel('||∇f||')
ax.set_title('梯度大小')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---
## Part 2: 學習率的影響

學習率太大或太小都有問題：

- **太小**：收斂太慢
- **太大**：震盪或發散
- **剛好**：快速收斂

In [None]:
# Compare different learning rates on f_simple, one subplot per rate.
# For this objective each step multiplies x by (1 - 2*lr): lr=0.5 lands on
# the minimum in one step, while lr=0.95 gives a factor of -0.9, so the
# iterate flips sign every step while shrinking slowly.
learning_rates = [0.01, 0.1, 0.5, 0.95]
x0 = np.array([5.0, 5.0])

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

# Contour grid (shared by all subplots)
x = np.linspace(-7, 7, 100)
y = np.linspace(-7, 7, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + Y**2

for ax, lr in zip(axes, learning_rates):
    _, history = gradient_descent(f_simple, grad_f_simple, x0, lr=lr, n_iter=30)
    
    ax.contour(X, Y, Z, levels=20, cmap='viridis', alpha=0.5)
    ax.plot(history['x'][:, 0], history['x'][:, 1], 'ro-', markersize=4)
    ax.plot(history['x'][0, 0], history['x'][0, 1], 'go', markersize=10)
    ax.set_xlim(-7, 7)
    ax.set_ylim(-7, 7)
    # The title reports the objective value at the last recorded iterate.
    ax.set_title(f'Learning Rate = {lr}\nFinal f = {history["f"][-1]:.4f}')
    ax.set_aspect('equal')

plt.tight_layout()
plt.show()

---
## Part 3: Momentum

加入「動量」可以：
1. 加速收斂（尤其在有窄谷的情況）
2. 幫助跳出局部最小值

### 算法

$$\mathbf{v}_{t+1} = \beta \mathbf{v}_t - \eta \nabla f(\mathbf{x}_t)$$
$$\mathbf{x}_{t+1} = \mathbf{x}_t + \mathbf{v}_{t+1}$$

- $\beta$：動量係數（通常 0.9）
- $\mathbf{v}$：速度

In [None]:
def gradient_descent_momentum(f, grad_f, x0, lr=0.1, momentum=0.9, n_iter=100):
    """
    Minimize ``f`` with gradient descent plus (heavy-ball) momentum.

    Each iteration updates a velocity ``v <- momentum * v - lr * grad_f(x)``
    and then moves the iterate by it, ``x <- x + v``. There is no early
    stopping: exactly ``n_iter`` updates are performed.

    Parameters
    ----------
    f : callable
        Objective function, called as ``f(x)``.
    grad_f : callable
        Gradient of the objective, called as ``grad_f(x)``.
    x0 : array_like
        Starting point. Besides ``np.ndarray``, plain lists, tuples and
        scalars are accepted. The input is copied, never modified.
    lr : float
        Learning rate (step size).
    momentum : float
        Momentum coefficient applied to the previous velocity.
    n_iter : int
        Number of iterations.

    Returns
    -------
    x : np.ndarray
        Final iterate.
    history : dict
        ``'x'`` and ``'f'`` hold every iterate and its objective value,
        starting point included.
    """
    # np.asarray lets plain sequences through; astype(float) always returns
    # a fresh float array, so the caller's x0 is left untouched.
    x = np.asarray(x0).astype(float)
    v = np.zeros_like(x)  # the velocity starts at zero

    history = {
        'x': [x.copy()],
        'f': [f(x)]
    }

    for _ in range(n_iter):
        grad = grad_f(x)

        # Velocity update: decayed previous velocity plus the new step
        v = momentum * v - lr * grad

        # Position update
        x = x + v

        history['x'].append(x.copy())
        history['f'].append(f(x))

    history['x'] = np.array(history['x'])
    history['f'] = np.array(history['f'])

    return x, history

In [None]:
# 在「窄谷」函數上比較 GD vs Momentum
# f(x, y) = x² + 10y² （y 方向很「陡」）

def f_valley(x):
    """Evaluate the narrow-valley objective ``x[0]**2 + 10*x[1]**2``."""
    shallow_term = x[0]**2
    steep_term = 10*x[1]**2
    return shallow_term + steep_term

def grad_f_valley(x):
    """Return the gradient of ``x[0]**2 + 10*x[1]**2`` as a length-2 array."""
    d_shallow = 2*x[0]
    d_steep = 20*x[1]
    return np.array([d_shallow, d_steep])

# Run plain GD and momentum GD from the same start on the valley function,
# then plot both trajectories and their loss curves.
x0 = np.array([5.0, 2.0])

# NOTE(review): with lr=0.05 the plain-GD update multiplies the y component
# by 1 - 0.05*20 = 0, so GD removes the steep-direction error in a single
# step and cannot zig-zag. This setting may therefore not showcase the
# advantage of momentum -- check the printed results below.

# Plain GD
_, history_gd = gradient_descent(f_valley, grad_f_valley, x0, lr=0.05, n_iter=50)

# Momentum
_, history_mom = gradient_descent_momentum(f_valley, grad_f_valley, x0, lr=0.05, momentum=0.9, n_iter=50)

# Visualization
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Contour grid
x = np.linspace(-6, 6, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + 10*Y**2  # same function as f_valley, evaluated on the grid

# Left: both trajectories on top of the contours
ax = axes[0]
ax.contour(X, Y, Z, levels=30, cmap='viridis', alpha=0.5)
ax.plot(history_gd['x'][:, 0], history_gd['x'][:, 1], 'ro-', markersize=3, label='GD')
ax.plot(history_mom['x'][:, 0], history_mom['x'][:, 1], 'bs-', markersize=3, label='Momentum')
ax.plot(x0[0], x0[1], 'go', markersize=10)
ax.set_title('軌跡比較\n（窄谷函數 f = x² + 10y²）')
ax.legend()

# Middle: full loss curves on a log scale
ax = axes[1]
ax.plot(history_gd['f'], 'r-', label='GD')
ax.plot(history_mom['f'], 'b-', label='Momentum')
ax.set_xlabel('Iteration')
ax.set_ylabel('f(x)')
ax.set_title('Loss 曲線')
ax.set_yscale('log')
ax.legend()
ax.grid(True, alpha=0.3)

# Right: first 20 recorded values on a linear scale
ax = axes[2]
ax.plot(history_gd['f'][:20], 'r-', label='GD')
ax.plot(history_mom['f'][:20], 'b-', label='Momentum')
ax.set_xlabel('Iteration')
ax.set_ylabel('f(x)')
ax.set_title('前 20 次迭代（線性尺度）')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Report the last recorded objective value of each run.
print(f"GD 50 次後: f = {history_gd['f'][-1]:.6f}")
print(f"Momentum 50 次後: f = {history_mom['f'][-1]:.6f}")

---
## Part 4: 用梯度下降做線性迴歸

把學到的梯度下降應用到實際問題！

### 線性迴歸

模型：$\hat{y} = \mathbf{w}^T \mathbf{x} + b$

Loss：$L = \frac{1}{N} \sum_{i=1}^N (\hat{y}_i - y_i)^2$ (MSE)

梯度：
- $\frac{\partial L}{\partial \mathbf{w}} = \frac{2}{N} X^T (X\mathbf{w} + b\mathbf{1} - \mathbf{y}) = \frac{2}{N} X^T (\hat{\mathbf{y}} - \mathbf{y})$
- $\frac{\partial L}{\partial b} = \frac{2}{N} \sum_i (\hat{y}_i - y_i)$

In [None]:
class LinearRegressionGD:
    """
    Linear regression trained with batch gradient descent on the MSE loss.

    The model is ``y_hat = X @ w + b``. After ``fit``, ``w`` holds the
    weight vector of shape (D,), ``b`` the scalar bias, and
    ``history['loss']`` the MSE recorded before each parameter update of
    the most recent ``fit`` call.
    """

    def __init__(self, lr=0.01, n_iter=1000):
        """
        Parameters
        ----------
        lr : float
            Learning rate.
        n_iter : int
            Number of gradient-descent iterations run by ``fit``.
        """
        self.lr = lr
        self.n_iter = n_iter
        self.w = None
        self.b = None
        self.history = {'loss': []}

    def fit(self, X, y):
        """
        Train the model from scratch.

        Parameters
        ----------
        X : array_like, shape (N, D)
            Training data.
        y : array_like, shape (N,) or (N, 1)
            Target values; a column vector is flattened to (N,).

        Returns
        -------
        self : LinearRegressionGD
            The fitted model, to allow call chaining.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly one target per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        N, D = X.shape

        # A column vector (N, 1) would broadcast against the (N,) predictions
        # into an (N, N) error matrix and silently train on garbage, so the
        # targets are flattened first.
        y = np.asarray(y, dtype=float).reshape(-1)
        if y.shape[0] != N:
            raise ValueError(
                f"y must contain {N} targets (one per row of X), got {y.shape[0]}"
            )

        # Every call restarts training: the parameters and the loss history
        # are reset together so the history always matches the current fit.
        self.w = np.zeros(D)
        self.b = 0.0
        self.history = {'loss': []}

        for _ in range(self.n_iter):
            # Forward pass and residuals, shape (N,)
            y_pred = X @ self.w + self.b
            error = y_pred - y

            # Loss is recorded before the parameters are updated.
            self.history['loss'].append(np.mean(error ** 2))

            # Gradients of the MSE with respect to w and b
            grad_w = (2 / N) * (X.T @ error)
            grad_b = (2 / N) * np.sum(error)

            # Gradient step
            self.w = self.w - self.lr * grad_w
            self.b = self.b - self.lr * grad_b

        return self

    def predict(self, X):
        """Return ``X @ w + b`` for an array-like ``X`` of shape (N, D)."""
        return np.asarray(X) @ self.w + self.b

In [None]:
# Generate synthetic data (seeded so the run is reproducible)
np.random.seed(42)
N = 100
X = np.random.randn(N, 1) * 2
y_true = 3 * X[:, 0] + 2  # ground-truth relation: y = 3x + 2
y = y_true + np.random.randn(N) * 0.5  # add Gaussian noise (std 0.5)

# Train
model = LinearRegressionGD(lr=0.1, n_iter=100)
model.fit(X, y)

# Compare the learned parameters with the ones used to generate the data.
print(f"真實參數: w = 3, b = 2")
print(f"學到的參數: w = {model.w[0]:.4f}, b = {model.b:.4f}")

In [None]:
# Visualization: fitted line against the data, and the training loss.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# 1. Data and fitted line
ax = axes[0]
ax.scatter(X, y, alpha=0.5, label='Data')
# predict() needs a 2-D input, hence the reshape to a single column.
x_line = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_line = model.predict(x_line)
ax.plot(x_line, y_line, 'r-', linewidth=2, label=f'Fit: y = {model.w[0]:.2f}x + {model.b:.2f}')
ax.plot(x_line, 3*x_line + 2, 'g--', linewidth=2, label='True: y = 3x + 2')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('線性迴歸結果')
ax.legend()
ax.grid(True, alpha=0.3)

# 2. Loss curve
ax = axes[1]
ax.plot(model.history['loss'], 'b-', linewidth=2)
ax.set_xlabel('Iteration')
ax.set_ylabel('MSE Loss')
ax.set_title('訓練過程')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---
## 練習題

### 練習 1：實作 Nesterov Momentum

Nesterov Accelerated Gradient (NAG) 是 Momentum 的改進版，先「向前看」再計算梯度：

$$\mathbf{v}_{t+1} = \beta \mathbf{v}_t - \eta \nabla f(\mathbf{x}_t + \beta \mathbf{v}_t)$$
$$\mathbf{x}_{t+1} = \mathbf{x}_t + \mathbf{v}_{t+1}$$

**任務**：實作 `gradient_descent_nesterov` 函數

In [None]:
# 練習 1 解答：Nesterov Momentum

def gradient_descent_nesterov(f, grad_f, x0, lr=0.1, momentum=0.9, n_iter=100):
    """
    Minimize ``f`` with Nesterov Accelerated Gradient (NAG).

    Unlike classical momentum, the gradient is evaluated at the look-ahead
    point ``x + momentum * v`` rather than at ``x``. The velocity and the
    position are then updated as ``v <- momentum * v - lr * grad`` and
    ``x <- x + v``. Exactly ``n_iter`` updates are performed.

    Parameters
    ----------
    f : callable
        Objective function, called as ``f(x)``.
    grad_f : callable
        Gradient of the objective, called as ``grad_f(x)``.
    x0 : array_like
        Starting point. Besides ``np.ndarray``, plain lists, tuples and
        scalars are accepted. The input is copied, never modified.
    lr : float
        Learning rate (step size).
    momentum : float
        Momentum coefficient applied to the previous velocity.
    n_iter : int
        Number of iterations.

    Returns
    -------
    x : np.ndarray
        Final iterate.
    history : dict
        ``'x'`` and ``'f'`` hold every iterate and its objective value,
        starting point included.
    """
    # np.asarray lets plain sequences through; astype(float) always returns
    # a fresh float array, so the caller's x0 is left untouched.
    x = np.asarray(x0).astype(float)
    v = np.zeros_like(x)

    history = {
        'x': [x.copy()],
        'f': [f(x)]
    }

    for _ in range(n_iter):
        # Key difference from classical momentum: the gradient is taken at
        # the position the current velocity is about to carry us to.
        x_lookahead = x + momentum * v
        grad = grad_f(x_lookahead)

        # Velocity update
        v = momentum * v - lr * grad

        # Position update
        x = x + v

        history['x'].append(x.copy())
        history['f'].append(f(x))

    history['x'] = np.array(history['x'])
    history['f'] = np.array(history['f'])

    return x, history

# Test and compare the three methods on the valley function, using the
# same start point, learning rate and iteration budget for each.
x0 = np.array([5.0, 2.0])

_, hist_gd = gradient_descent(f_valley, grad_f_valley, x0, lr=0.05, n_iter=50)
_, hist_mom = gradient_descent_momentum(f_valley, grad_f_valley, x0, lr=0.05, momentum=0.9, n_iter=50)
_, hist_nag = gradient_descent_nesterov(f_valley, grad_f_valley, x0, lr=0.05, momentum=0.9, n_iter=50)

# Loss curves of all three runs on a shared log-scale axis
plt.figure(figsize=(10, 4))
plt.plot(hist_gd['f'], 'r-', label='GD')
plt.plot(hist_mom['f'], 'b-', label='Momentum')
plt.plot(hist_nag['f'], 'g-', label='Nesterov')
plt.xlabel('Iteration')
plt.ylabel('f(x)')
plt.title('三種優化方法比較')
plt.legend()
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.show()

# Report the last recorded objective value of each run.
print(f"GD:       f = {hist_gd['f'][-1]:.6f}")
print(f"Momentum: f = {hist_mom['f'][-1]:.6f}")
print(f"Nesterov: f = {hist_nag['f'][-1]:.6f}")

### 練習 2：實作 Learning Rate Decay

隨著訓練進行，逐漸減小學習率可以幫助收斂到更好的解。

常見的 decay 策略：
- Step decay: $\eta_t = \eta_0 \cdot \gamma^{\lfloor t/k \rfloor}$（每 k 步乘以 γ）
- Exponential decay: $\eta_t = \eta_0 \cdot e^{-\lambda t}$

**任務**：在梯度下降中加入 learning rate decay

In [None]:
# 練習 2 解答：Learning Rate Decay

def gradient_descent_with_decay(f, grad_f, x0, lr=0.1, n_iter=100, 
                                 decay_type='step', decay_rate=0.5, decay_steps=20):
    """
    Minimize ``f`` with gradient descent and a decaying learning rate.

    At iteration ``i`` (counted from 0) the step size is

    - ``'step'``: ``lr * decay_rate ** (i // decay_steps)``
    - ``'exponential'``: ``lr * exp(-decay_rate * i)``
    - any other value: the constant ``lr``

    Exactly ``n_iter`` updates are performed; there is no early stopping.

    Parameters
    ----------
    f : callable
        Objective function, called as ``f(x)``.
    grad_f : callable
        Gradient of the objective, called as ``grad_f(x)``.
    x0 : array_like
        Starting point. Besides ``np.ndarray``, plain lists, tuples and
        scalars are accepted. The input is copied, never modified.
    lr : float
        Initial learning rate.
    n_iter : int
        Number of iterations.
    decay_type : str
        ``'step'`` or ``'exponential'``; other values disable the decay.
    decay_rate : float
        Decay rate (gamma for step decay, lambda for exponential decay).
    decay_steps : int
        Number of iterations between two step-decay reductions.

    Returns
    -------
    x : np.ndarray
        Final iterate.
    history : dict
        ``'x'`` and ``'f'`` hold every iterate and its objective value,
        starting point included; ``'lr'`` holds the learning rate used at
        each iteration.
    """
    # np.asarray lets plain sequences through; astype(float) always returns
    # a fresh float array, so the caller's x0 is left untouched.
    x = np.asarray(x0).astype(float)

    history = {
        'x': [x.copy()],
        'f': [f(x)],
        'lr': []
    }

    for i in range(n_iter):
        # Learning rate for this iteration
        if decay_type == 'step':
            current_lr = lr * (decay_rate ** (i // decay_steps))
        elif decay_type == 'exponential':
            current_lr = lr * np.exp(-decay_rate * i)
        else:
            # Unrecognized schedule names fall back to a constant rate.
            current_lr = lr

        history['lr'].append(current_lr)

        grad = grad_f(x)
        x = x - current_lr * grad

        history['x'].append(x.copy())
        history['f'].append(f(x))

    history['x'] = np.array(history['x'])
    history['f'] = np.array(history['f'])
    history['lr'] = np.array(history['lr'])

    return x, history

# Test: compare a constant learning rate with step and exponential decay
# on f_simple, all starting from the same point with lr=0.3.
x0 = np.array([5.0, 5.0])

_, hist_const = gradient_descent(f_simple, grad_f_simple, x0, lr=0.3, n_iter=50)
_, hist_step = gradient_descent_with_decay(f_simple, grad_f_simple, x0, lr=0.3, n_iter=50, 
                                            decay_type='step', decay_rate=0.5, decay_steps=15)
_, hist_exp = gradient_descent_with_decay(f_simple, grad_f_simple, x0, lr=0.3, n_iter=50,
                                           decay_type='exponential', decay_rate=0.05)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left: loss curves on a log scale. gradient_descent can stop early once
# its tolerance is met, so the constant-LR curve may be shorter than the
# other two.
ax = axes[0]
ax.plot(hist_const['f'], 'r-', label='Constant LR')
ax.plot(hist_step['f'], 'b-', label='Step Decay')
ax.plot(hist_exp['f'], 'g-', label='Exponential Decay')
ax.set_xlabel('Iteration')
ax.set_ylabel('f(x)')
ax.set_title('Loss 曲線')
ax.legend()
ax.set_yscale('log')
ax.grid(True, alpha=0.3)

# Right: learning-rate schedules. The constant reference line hard-codes
# lr=0.3 and n_iter=50 from the calls above; keep them in sync if changed.
ax = axes[1]
ax.plot([0.3]*50, 'r-', label='Constant')
ax.plot(hist_step['lr'], 'b-', label='Step Decay')
ax.plot(hist_exp['lr'], 'g-', label='Exponential Decay')
ax.set_xlabel('Iteration')
ax.set_ylabel('Learning Rate')
ax.set_title('學習率變化')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()