In [None]:
import numpy as np

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
np.random.seed(42)

### 使用 Batch Gradient Descent

In [None]:
# Synthetic rental dataset with three features (area, bedrooms, age).
n_samples = 500

# NOTE: the order of the np.random calls below determines the generated data
# (the global seed is set in the imports cell), so they must not be reordered.
area = np.random.uniform(10, 50, n_samples)
bedrooms = np.random.randint(1, 5, n_samples)        # integers 1..4 (upper bound is exclusive)
age = np.random.uniform(0, 30, n_samples)

# Target: a known linear function of the features plus Gaussian noise (std = 2).
noise = np.random.normal(0, 2, n_samples)
rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

X = np.column_stack((area, bedrooms, age))           # stack the features column-wise -> shape (n_samples, 3)
y = rent

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Fit the scaler on the training split only and reuse its statistics on the test split.
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

# Random initial parameters drawn from [0, 1).
w = np.random.rand(3)
b = np.random.rand()

learning_rate = 0.003
iterations = 5000
lambda_reg = 0.01       # coefficient of the L2 penalty on w
batch_size = 64         # not read by the full-batch loop in the following cell

# Per-iteration loss traces filled in by the training loop.
loss_history_train = []
loss_history_test = []


In [None]:
# Full-batch gradient descent on the L2-regularised MSE objective
#     J(w, b) = mean((y - (X w + b))^2) + lambda_reg * sum(w^2)
# Every iteration uses the whole training split for the gradient.
n_train = len(X_train_standardized)

for i in range(iterations):
    # Residuals at the current parameters drive the gradient.
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train

    w_gradient = (-2/n_train) * np.dot(X_train_standardized.T, error_train) + 2 * lambda_reg * w
    b_gradient = (-2/n_train) * np.sum(error_train)      # the bias is not regularised

    w -= learning_rate * w_gradient
    b -= learning_rate * b_gradient

    # Re-evaluate the training residuals AFTER the step so that the MSE term and
    # the penalty term refer to the same parameters, and so that the train loss
    # is measured at the same (w, b) as the test loss below.
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w**2)
    loss_history_train.append(loss_train)

    # Held-out loss (plain MSE, no penalty term).
    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)
    loss_history_test.append(loss_test)

    if i % 500 == 0:
        print(f"Iteration {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train_Loss = {loss_train:.4f}, Test_Loss = {loss_test:.4f}")

# Loss curves for both splits. Later cells also rely on this `sns` import.
import seaborn as sns
sns.set(style = 'whitegrid')
plt.plot(loss_history_train, label = 'Train Loss', color = 'blue')
plt.plot(loss_history_test, label = 'Test Loss', color = 'red', linestyle = 'dashed')
plt.title('Loss vs Iterations', fontsize = 14)
plt.xlabel('Iterations', fontsize = 12)
plt.ylabel('Loss', fontsize = 12)
plt.grid(True)
plt.legend()
plt.show()

### 使用 Mini-Batch Gradient Descent

In [None]:
# Mini-batch gradient descent: each outer pass reshuffles the training split
# and takes one gradient step per slice of `batch_size` rows.
w = np.random.rand(3)
b = np.random.rand()

learning_rate = 0.003
iterations = 10000      # number of full passes over the training split
lambda_reg = 0.01
batch_size = 128

loss_history_train2 = []
loss_history_test2 = []

n_train = len(X_train_standardized)

for i in range(iterations):
    # Fresh random ordering for this pass.
    permutation = np.random.permutation(n_train)
    X_shuffled, y_shuffled = X_train_standardized[permutation], y_train[permutation]

    for start_idx in range(0, n_train, batch_size):
        rows = slice(start_idx, start_idx + batch_size)   # the last slice may be shorter
        X_batch, y_batch = X_shuffled[rows], y_shuffled[rows]

        error = y_batch - (np.dot(X_batch ,w) + b)

        # Gradient of the batch MSE plus the L2 penalty (the bias is not penalised).
        batch_scale = -2/len(X_batch)
        w_gradient = batch_scale * np.dot(X_batch.T ,error) + 2 * lambda_reg * w
        b_gradient = batch_scale * np.sum(error)

        w -= learning_rate * w_gradient
        b -= learning_rate * b_gradient

    # End-of-pass evaluation on the complete splits.
    error_train = y_train - (np.dot(X_train_standardized, w) + b)
    loss_train2 = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train2.append(loss_train2)

    error_test = y_test - (np.dot(X_test_standardized, w) + b)
    loss_test2 = np.mean(error_test ** 2)
    loss_history_test2.append(loss_test2)

    if i % 500 == 0:
        print(f"iteration {i}: w = {np.round(w, 4)}, b = {b:.4f}, Train_Loss = {loss_train2:.4f}, Test_Loss = {loss_test2:.4f}")

import seaborn as sns
sns.set(style = 'whitegrid')
# Draw the two traces in the same order as before so the legend order is unchanged.
for history, label, line_style in (
    (loss_history_train2, 'Train Loss', dict(color = 'blue')),
    (loss_history_test2, 'Test Loss', dict(color = 'red', linestyle = 'dashed')),
):
    plt.plot(history, label = label, **line_style)
plt.title('Loss vs Iterations', fontsize = 14)
plt.xlabel('Iterations', fontsize = 12)
plt.ylabel('Loss', fontsize = 12)
plt.grid(True)
plt.legend()
plt.show()


### 使用三階段策略
<ul>
<li>初期 : Mini-Batch + 固定學習率</li>
<li>中期 : Mini-Batch + 學習率衰退</li>
<li>後期 : Batch + 繼續衰退學習率</li>
</ul>

In [None]:
# Three-stage schedule:
#   stage 1  (i < early_stage)                  mini-batch, constant learning rate
#   stage 2  (early_stage <= i < middle_stage)  mini-batch, inverse-time decay
#   stage 3  (i >= middle_stage)                full batch, decay keeps going
w = np.random.rand(3)
b = np.random.rand()

initial_lr = 0.001
iterations = 10000
lambda_reg = 0.001
batch_size = 256

early_stage = 3000      # first iteration of stage 2
middle_stage = 7000     # first iteration of stage 3
decay_rate = 0.005


loss_history_train3 = []
loss_history_test3 = []

for i in range(iterations):
    # Learning rate: constant during stage 1, then inverse-time decay measured
    # from the start of stage 2. The decay continues through stage 3 instead of
    # being frozen at its value at `middle_stage`.
    if i < early_stage:
        learning_rate = initial_lr
    else:
        learning_rate = initial_lr / (1 + decay_rate * (i - early_stage))

    # Full-batch updates only in the last stage.
    use_batch = i >= middle_stage

    # Select the rows used for this step.
    if use_batch:
        X_batch = X_train_standardized
        y_batch = y_train
    else:
        # np.random.choice samples WITH replacement by default, so a mini-batch
        # may contain repeated rows.
        permutation = np.random.choice(len(X_train_standardized), size = batch_size)
        X_batch = X_train_standardized[permutation]
        y_batch = y_train[permutation]

    # Forward pass and gradient on the selected rows.
    y_pred = np.dot(X_batch, w) + b
    error = y_batch - y_pred

    w_gradient = (-2/len(X_batch)) * np.dot(X_batch.T, error) + 2 * lambda_reg * w
    b_gradient = (-2/len(X_batch)) * np.sum(error)

    w -= learning_rate * w_gradient
    b -= learning_rate * b_gradient

    # Train loss on the full training split at the updated parameters, so it is
    # comparable across stages and with the test loss (previously it reused the
    # pre-update mini-batch residuals together with the post-update w).
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train3 = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train3.append(loss_train3)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test3 = np.mean(error_test ** 2)
    loss_history_test3.append(loss_test3)

    if i % 500 == 0 or i == (iterations-1):
        print(f"iterations {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train_Loss = {loss_train3:.4f}, Test_Loss = {loss_test3:.4f}")


# `sns` is imported by an earlier plotting cell.
sns.set(style = 'whitegrid')
plt.plot(loss_history_train3, label = 'Train Loss', color = 'blue')
plt.plot(loss_history_test3, label = 'Test Loss', color = 'red', linestyle = 'dashed')
plt.title('Loss vs Iterations', fontsize = 14)
plt.xlabel('Iterations', fontsize = 12)
plt.ylabel('Loss', fontsize = 12)
plt.grid(True)
plt.legend()
plt.show()






In [None]:
# Per-iteration difference between the recorded train and test losses of the
# three-stage run.
loss_gap = np.asarray(loss_history_train3) - np.asarray(loss_history_test3)

plt.plot(loss_gap, color = 'purple')
plt.title('Train Loss - Test Loss', fontsize = 14)
plt.xlabel('Iterations', fontsize = 12)
plt.ylabel('Loss Gap', fontsize = 12)
plt.grid(True)
plt.show()

### 使用 sklearn.linear_model.Ridge 套件

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


# Reference fit with scikit-learn's Ridge on the same standardised features.
# NOTE(review): scikit-learn documents the Ridge objective as
# ||y - Xw||^2 + alpha * ||w||^2 (a SUM over samples), while the hand-written
# losses in this notebook use mean(error^2) + lambda_reg * sum(w^2). Hence
# alpha = 0.001 is not the same penalty strength as lambda_reg = 0.001;
# confirm whether alpha = lambda_reg * n_train was intended.
ridge_model = Ridge(alpha = 0.001, fit_intercept = True).fit(X_train_standardized, y_train)

# Learned parameters.
w_ridge, b_ridge = ridge_model.coef_, ridge_model.intercept_

# Predictions and plain MSE on both splits (no penalty term in the reported loss).
y_train_pred, y_test_pred = (
    ridge_model.predict(features)
    for features in (X_train_standardized, X_test_standardized)
)

train_loss_ridge = mean_squared_error(y_train, y_train_pred)
test_loss_ridge = mean_squared_error(y_test, y_test_pred)

print(f"Ridge Regression 結果 : w = {np.round(w_ridge, 4)}, b = {b_ridge:.4f}, Train_Loss = {train_loss_ridge:.4f}, Test_Loss = {test_loss_ridge:.4f}")


### 實驗 Ex1-9 / Ex1-10 / Ex1-11  Stochastic Gradient Descent

In [None]:
# --- Data generation -------------------------------------------------------
# Re-seed so this experiment is reproducible independently of earlier cells.
# The order of the np.random calls below fixes the dataset; do not reorder.
np.random.seed(42)
n_samples = 500
area = np.random.uniform(10, 50, n_samples)
# Bedroom count now depends on area: centred on area/15 (clipped to [0, 4]),
# perturbed with Gaussian noise (std 0.5), rounded to an integer and clipped again.
expected_bedrooms = np.clip((area/15), 0, 4)
bedrooms = np.random.normal(expected_bedrooms, 0.5)
bedrooms = np.round(bedrooms).astype(int)
bedrooms = np.clip(bedrooms, 0, 4)
age = np.random.uniform(0, 30, n_samples)
noise = np.random.normal(0, 2, n_samples)
rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

X = np.column_stack((area, bedrooms, age))
y = rent

# 80/20 split; the scaler is fitted on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

# --- SGD configuration -----------------------------------------------------
w = np.random.rand(3)
b = np.random.rand()

learning_rate = 0.001   # constant for the whole run
iterations = 10000
lambda_reg = 0.001

# Per-iteration traces.
lr_history = []
w_history = []
b_history = []
# The ex1_9 / ex1_10 lists are created here but only the ex1_11 lists are
# filled by the loop below.
loss_history_train_ex1_9 = []
loss_history_test_ex1_9 = []

loss_history_train_ex1_10 = []
loss_history_test_ex1_10 = []

loss_history_train_ex1_11 = []
loss_history_test_ex1_11 = []

# --- Stochastic gradient descent: one randomly chosen sample per step --------
for i in range(iterations):
    # Draw a single training row (with replacement across iterations).
    idx = np.random.randint(0, len(X_train_standardized))
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Gradient of the single-sample squared error plus the L2 penalty on w.
    w_gradient = -2 * x_i * error_i + 2 * lambda_reg * w
    b_gradient = -2 * error_i

    lr_history.append(learning_rate)

    w -= learning_rate * w_gradient
    b -= learning_rate * b_gradient

    # Store a copy: w is updated in place, so appending w itself would alias it.
    w_history.append(w.copy())
    b_history.append(b)

    # Full-split losses at the updated parameters (train includes the penalty).
    y_train_pred = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_train_pred
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train_ex1_11.append(loss_train)

    y_test_pred = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_test_pred
    loss_test = np.mean(error_test ** 2)
    loss_history_test_ex1_11.append(loss_test)

    if i % 50 == 0 or i == (iterations-1):
        print(f'iteration {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train Loss = {loss_train:.4f}, Test Loss = {loss_test:.4f}')

# Locate the iteration with the lowest test loss and the parameters stored for it.
min_index = np.argmin(loss_history_test_ex1_11)
min_test_loss = loss_history_test_ex1_11[min_index]
best_w, best_b = w_history[min_index], b_history[min_index]

# One-row summary table: final-iteration losses next to the best-test-loss snapshot.
final_index = iterations - 1
summary_data = {
    'Final Train Loss' : [np.round(loss_history_train_ex1_11[final_index], 4)],
    'Final Test Loss' : [np.round(loss_history_test_ex1_11[final_index], 4)],
    'Best Iteration' : [min_index],
    'Train Loss @ Best Test' : [np.round(loss_history_train_ex1_11[min_index], 4)],
    'Best Test Loss' : [np.round(min_test_loss, 4)],
    'w (params)' : [np.round(best_w, 4)],
    'b (bias)' : [np.round(best_b, 4)],
}

df_summary = pd.DataFrame(summary_data)
display(df_summary)


plt.style.use('seaborn-v0_8-darkgrid')

fig, ax1 = plt.subplots(figsize = (8, 5.5))
fig.text(0.5, 0.893, f'Learning Rate = {learning_rate}', ha = 'center', fontsize = 12, style = 'italic')

# Styling shared by several calls, collected once.
train_color, test_color, marker_color = '#1f77b4', '#ff7f0e', '#003366'
axis_label_font = dict(fontsize = 14, fontweight = 'bold')
curve_style = dict(linewidth = 3, alpha = 0.85)

# Primary axis: both loss curves plus a marker and annotation at the minimum test loss.
ax1.set_xlabel('Iterations', **axis_label_font)
ax1.set_ylabel('Loss', color = train_color, **axis_label_font)
ax1.plot(loss_history_train_ex1_11, label = 'Train Loss', color = train_color, **curve_style)
ax1.plot(loss_history_test_ex1_11, label = 'Test Loss', color = test_color, linestyle = 'dashed', **curve_style)
ax1.tick_params(axis = 'y', labelcolor = train_color)
ax1.plot(min_index, min_test_loss, 'o', markersize = 5, label = 'Min Test Loss', color = marker_color)
bbox_props = dict(boxstyle = 'round,pad = 0.4', fc = 'white', lw = 0.8, alpha = 0.85)
ax1.annotate(
    f'Min: {min_test_loss:.4f} @ Iter {min_index}',
    xy = (min_index, min_test_loss),
    xytext = (min_index - 4000, min_test_loss + 300),   # fixed offset in data coordinates
    textcoords = 'data',
    arrowprops = dict(arrowstyle = '-|>', color = marker_color, lw = 1.2),
    fontsize = 16,
    color = marker_color,
    bbox = bbox_props,
)
ax1.legend(loc = 'upper right', fontsize = 12, framealpha = 0.9)
ax1.set_title('Stochastic Gradient Descent : Loss vs Learning Rate', fontsize = 16, fontweight = 'bold', pad = 24)
ax1.grid(True, linestyle = 'dashed', linewidth = 0.5, alpha = 0.4)

# Secondary axis: the learning rate used at each iteration.
ax2 = ax1.twinx()
lr_color = '#2ca02c'
ax2.set_ylabel('Learning Rate', color = lr_color, **axis_label_font)
ax2.plot(lr_history, label = 'Learning Rate', color = lr_color, linestyle = 'dotted', linewidth = 4)
ax2.tick_params(axis = 'y', labelcolor = lr_color)

# Let matplotlib fit the layout.
fig.tight_layout()
#plt.savefig('ex1_11.png', dpi = 720, bbox_inches = 'tight')
plt.show()


### 實驗 Ex1-12 / Ex1-13 / Ex1-14  SGD + Inverse Time Decay

In [None]:
# --- Data generation (same recipe and seed as the other experiment cells) ----
# The order of the np.random calls fixes the dataset; do not reorder them.
np.random.seed(42)
n_samples = 500
area = np.random.uniform(10, 50, n_samples)
# Bedroom count is tied to area: area/15 clipped to [0, 4], plus Gaussian noise
# (std 0.5), rounded to an integer and clipped again.
expected_bedrooms = np.clip((area/15), 0, 4)
bedrooms = np.random.normal(expected_bedrooms, 0.5)
bedrooms = np.round(bedrooms).astype(int)
bedrooms = np.clip(bedrooms, 0, 4)
age = np.random.uniform(0, 30, n_samples)
noise = np.random.normal(0, 2, n_samples)
rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

X = np.column_stack((area, bedrooms, age))
y = rent

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Standardise using training-split statistics only.
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

w = np.random.rand(3)
b = np.random.rand()

initial_lr = 0.001
iterations = 25000
lambda_reg = 0.001
decay_rate = 0.00005    # inverse-time decay coefficient

# Per-iteration traces.
w_history = []
b_history = []
loss_history_train_ex1_12 = []
loss_history_test_ex1_12 = []
lr_history_inverse_time = []

# --- SGD with inverse-time learning-rate decay ------------------------------
for i in range(iterations):
    # One random training row per step.
    idx = np.random.randint(0, len(X_train_standardized))
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Single-sample squared-error gradient plus the L2 penalty on w.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # lr_i = initial_lr / (1 + decay_rate * i)
    learning_rate = initial_lr / (1 + decay_rate * i)
    lr_history_inverse_time.append(learning_rate)

    w -= w_gradient * learning_rate
    b -= b_gradient * learning_rate

    # Copy w because it is updated in place.
    w_history.append(w.copy())
    b_history.append(b)

    # Full-split losses at the updated parameters (train includes the penalty).
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train_ex1_12.append(loss_train)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)
    loss_history_test_ex1_12.append(loss_test)

    if i % 50 == 0 or i == (iterations - 1):
        print(f'iteration {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train Loss = {loss_train:.4f}, Test Loss = {loss_test:.4f}')


# Locate the iteration with the lowest test loss and the state recorded for it.
min_index = np.argmin(loss_history_test_ex1_12)
min_test_loss = loss_history_test_ex1_12[min_index]
best_w = w_history[min_index]
best_b = b_history[min_index]
lr_at_test_loss = lr_history_inverse_time[min_index]


# One-row summary table: final-iteration losses next to the best-test-loss snapshot.
summary_data = {
    'Final Train Loss' : [np.round(loss_history_train_ex1_12[iterations-1], 4)],
    'Final Test Loss' : [np.round(loss_history_test_ex1_12[iterations-1], 4)],
    'Best Iteration' : [min_index],
    'Train Loss @ Best Test' : [np.round(loss_history_train_ex1_12[min_index], 4)],
    'Best Test Loss' : [np.round(loss_history_test_ex1_12[min_index], 4)],
    'w (params)' : [np.round(best_w, 4)],
    'b (bias)' : [np.round(best_b, 4)],
    # The decayed learning rate lies between roughly 4e-4 and 1e-3 here, so four
    # decimals would leave a single significant digit; keep six.
    'Learning Rate @ Test Loss' : [np.round(lr_at_test_loss, 6)]
}

df_summary = pd.DataFrame(summary_data)

display(df_summary)

# Figure style.
plt.style.use('seaborn-v0_8-darkgrid')

fig, ax1 = plt.subplots(figsize = (8, 5.5))
# Subtitle reads lambda_reg instead of repeating a hard-coded 0.001, so it
# cannot drift from the value actually used for training.
fig.text(0.5, 0.893, f'SGD with decay rate = {decay_rate}, λ = {lambda_reg}', ha = 'center', fontsize = 12, style = 'italic')


# Primary axis: loss curves.
color1 = '#1f77b4'       # blue
color2 = '#ff7f0e'       # orange
color3 = '#003366'       # dark blue, used for the minimum marker and annotation
ax1.set_xlabel('Iterations', fontsize = 14, fontweight = 'bold')
ax1.set_ylabel('Loss', color = color1, fontsize = 14, fontweight = 'bold')
ax1.plot(loss_history_train_ex1_12, label = 'Train Loss', color = color1, linewidth = 3, alpha = 0.85, zorder = 3)
ax1.plot(loss_history_test_ex1_12, label = 'Test Loss', color = color2, linestyle = 'dashed', linewidth = 3, alpha = 0.85, zorder = 3)
ax1.tick_params(axis = 'y', labelcolor = color1)
ax1.plot(min_index, min_test_loss, 'o', markersize = 5, label = 'Min Test Loss', color = color3, zorder = 4)
bbox_props = dict(boxstyle = 'round,pad = 0.4', fc = 'white', lw = 0.8, alpha = 0.85)
# The annotation text sits at a fixed offset (in data coordinates) from the marker.
ax1.annotate(f'Min: {min_test_loss:.4f} @ Iter {min_index}', xy = (min_index, min_test_loss), xytext = (min_index - 8000, min_test_loss + 700), textcoords = 'data', arrowprops = dict(arrowstyle = '-|>', color = color3, lw = 1.2, zorder = 4), fontsize = 16, color = color3, bbox = bbox_props, zorder = 4)
ax1.legend(loc = 'upper right', fontsize = 12, framealpha = 0.9)
ax1.set_title('SGD +  Inverse Time Decay : Loss & Learning Rate', fontsize = 16, fontweight = 'bold', pad = 24)
ax1.grid(True, linestyle = 'dashed', linewidth = 0.5, alpha = 0.4, zorder = 0)

# Secondary axis: learning-rate curve.
ax2 = ax1.twinx()
color3 = '#2ca02c'      # green
ax2.set_ylabel('Learning Rate', color = color3, fontsize = 14, fontweight = 'bold')
ax2.plot(lr_history_inverse_time, label = 'Learning Rate', color = color3, linestyle = 'dotted', linewidth = 4)
ax2.tick_params(axis = 'y', labelcolor = color3)

# Let matplotlib fit the layout.
fig.tight_layout()
#plt.savefig('ex1_12.png', dpi = 720, bbox_inches = 'tight')
plt.show()

In [None]:
# SGD with inverse-time learning-rate decay, reusing the standardised split
# built by the preceding experiment cell. Losses are evaluated only every
# `record_every` iterations, and the parameters at iteration 2800 are kept.
np.random.seed(42)
w = np.random.rand(3)
b = np.random.rand()

initial_lr = 0.001
iterations = 10000
lambda_reg = 0.001
decay_rate = 0.002

record_every = 10       # store the full-split losses every this many iterations
print_every = 500       # progress print interval

loss_history_train4 = []
loss_history_test4 = []

w_at_2800 = None
b_at_2800 = None

for i in range(iterations):
    # Draw one random training row.
    idx = np.random.randint(0, len(X_train_standardized))
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    # Prediction and residual for that row.
    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Single-sample squared-error gradient plus the L2 penalty on w.
    w_gradient = -2 * x_i * error_i + 2 * lambda_reg * w
    b_gradient = -2 * error_i

    # Learning rate shrinks with the iteration count.
    learning_rate = initial_lr / (1 + decay_rate * i)

    # Parameter update.
    w -= learning_rate * w_gradient
    b -= learning_rate * b_gradient

    should_record = (i % record_every == 0)
    should_print = (i % print_every == 0) or (i == (iterations-1))

    # Evaluate whenever the value is stored OR printed. Previously the final
    # iteration (9999, not a multiple of 10) printed the loss computed at
    # iteration 9990 next to the current w and b.
    if should_record or should_print:
        y_pred_train = np.dot(X_train_standardized, w) + b
        y_pred_test = np.dot(X_test_standardized, w) + b

        loss_train4 = np.mean((y_train - y_pred_train) ** 2) + lambda_reg * np.sum(w ** 2)
        loss_test4 = np.mean((y_test - y_pred_test) ** 2)

    if should_record:
        loss_history_train4.append(loss_train4)
        loss_history_test4.append(loss_test4)

    if should_print:
        print(f"iteration {i} : w = {np.round(w, 4)}, b = {np.round(b, 4)}, Train_Loss = {loss_train4:.4f}, Test_Loss = {loss_test4:.4f}")

    # Snapshot of the model at a hand-picked iteration.
    if i == 2800:
        w_at_2800 = w.copy()
        b_at_2800 = b
        print(f"==> Saved model at iteration {i}: w = {w_at_2800}, b = {b_at_2800}")


# The histories hold one point per `record_every` iterations, so plot them
# against the real iteration numbers to keep the 'Iterations' axis truthful.
recorded_iterations = np.arange(len(loss_history_train4)) * record_every

# `sns` is imported by an earlier plotting cell.
sns.set(style = 'whitegrid')
plt.figure(figsize = (8, 6))
plt.plot(recorded_iterations, loss_history_train4, label = 'Train Loss', color = "blue")
plt.plot(recorded_iterations, loss_history_test4, label = 'Test Loss', color = 'red', linestyle = 'dashed')
plt.title('SGD: Loss vs Iterations', fontsize = 14)
plt.xlabel('Iterations', fontsize = 12)
plt.ylabel('Loss', fontsize = 12)
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# np.savez('model_at_2800.npz', w = w_at_2800, b = b_at_2800)

### 實驗 Ex1-16 / Ex1-17 / Ex1-18  SGD + Momentum + Inverse Time Decay

In [None]:
# --- Data generation (same recipe and seed as the other experiment cells) ----
# The order of the np.random calls fixes the dataset; do not reorder them.
np.random.seed(42)
n_samples = 500
area = np.random.uniform(10, 50, n_samples)
# Bedroom count is tied to area: area/15 clipped to [0, 4], plus Gaussian noise
# (std 0.5), rounded to an integer and clipped again.
expected_bedrooms = np.clip((area/15), 0, 4)
bedrooms = np.random.normal(expected_bedrooms, 0.5)
bedrooms = np.round(bedrooms).astype(int)
bedrooms = np.clip(bedrooms, 0, 4)
age = np.random.uniform(0, 30, n_samples)
noise = np.random.normal(0, 2, n_samples)
rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

X = np.column_stack((area, bedrooms, age))
y = rent

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Standardise using training-split statistics only.
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

w = np.random.rand(3)  
b = np.random.rand()

initial_lr = 0.001
iterations = 25000
decay_rate = 0.00005    # inverse-time decay coefficient
lambda_reg = 0.001
beta = 0.9              # weight given to the previous velocity
# Velocity terms start at zero.
v_w = np.zeros_like(w)
v_b = 0

# Per-iteration traces.
lr_history_inverse_time = []
w_history = []
b_history = []
loss_history_train_ex1_16 = []
loss_history_test_ex1_16 = []

# The ex1_17 / ex1_18 lists are created here but are not filled by the loop below.
loss_history_train_ex1_17 = []
loss_history_test_ex1_17 = []

loss_history_train_ex1_18 = []
loss_history_test_ex1_18 = []

# Sanity check of the training array shapes.
print(X_train_standardized.shape)
print(y_train.shape)

# --- SGD with momentum and inverse-time learning-rate decay -------------------
for i in range(iterations):
    # One random training row per step.
    idx = np.random.randint(0, len(X_train_standardized))
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Single-sample squared-error gradient plus the L2 penalty on w.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # lr_i = initial_lr / (1 + decay_rate * i)
    learning_rate = initial_lr / (1 + decay_rate * i)
    lr_history_inverse_time.append(learning_rate)

    # Velocity is an exponential moving average of the gradients.
    v_w = beta * v_w + (1 - beta) * w_gradient
    v_b = beta * v_b + (1 - beta) * b_gradient

    # Step along the velocity rather than the raw gradient.
    w -= learning_rate * v_w
    b -= learning_rate * v_b

    # Copy w because it is updated in place.
    w_history.append(w.copy())
    b_history.append(b)

    # Full-split losses at the updated parameters (train includes the penalty).
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train_ex1_16.append(loss_train)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)
    loss_history_test_ex1_16.append(loss_test)

    if i % 50 == 0 or i == (iterations - 1):
        print(f'iterations {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train Loss = {loss_train:.4f}, Test Loss = {loss_test:.4f}')

# Locate the iteration with the lowest test loss and the state recorded for it.
min_index = np.argmin(loss_history_test_ex1_16)
min_test_loss = loss_history_test_ex1_16[min_index]
best_w = w_history[min_index]
best_b = b_history[min_index]
lr_at_test_loss = lr_history_inverse_time[min_index]

# One-row summary table: final-iteration losses next to the best-test-loss snapshot.
summary_data = {
    'Final Train Loss' : [np.round(loss_history_train_ex1_16[iterations-1], 4)],
    'Final Test Loss' : [np.round(loss_history_test_ex1_16[iterations-1], 4)],
    'Best Iteration' : [min_index],
    # Column label aligned with the other experiment summaries in this notebook
    # (it previously read 'Train Loss @ Best Loss').
    'Train Loss @ Best Test' : [np.round(loss_history_train_ex1_16[min_index], 4)],
    'Best Test Loss' : [np.round(loss_history_test_ex1_16[min_index], 4)],
    'w (params)' : [np.round(best_w, 4)],
    'b (bias)' : [np.round(best_b, 4)],
    # The decayed learning rate lies between roughly 4e-4 and 1e-3 here, so four
    # decimals would leave a single significant digit; keep six.
    'Learning Rate @ Test Loss' : [np.round(lr_at_test_loss, 6)]
}

df_summary = pd.DataFrame(summary_data)
display(df_summary)


# Figure style.
plt.style.use('seaborn-v0_8-darkgrid')

fig, ax1 = plt.subplots(figsize = (8, 5.5))
fig.text(0.5, 0.893, f'SGD with Momentum | decay rate = {decay_rate}, λ = {lambda_reg}, β= {beta}', ha = 'center', fontsize = 12, style = 'italic')

# Styling shared by several calls, collected once.
train_color = '#1f77b4'      # blue
test_color = '#ff7f0e'       # orange
marker_color = '#003366'     # dark blue
bold_label = dict(fontsize = 14, fontweight = 'bold')
loss_curve = dict(linewidth = 3, alpha = 0.85, zorder = 3)

# Primary axis: loss curves plus a marker and annotation at the minimum test loss.
ax1.set_xlabel('Iterations', **bold_label)
ax1.set_ylabel('Loss', color = train_color, **bold_label)
ax1.plot(loss_history_train_ex1_16, label = 'Train Loss', color = train_color, **loss_curve)
ax1.plot(loss_history_test_ex1_16, label = 'Test Loss', color = test_color, linestyle = 'dashed', **loss_curve)
ax1.tick_params(axis = 'y', labelcolor = train_color)
ax1.plot(min_index, min_test_loss, 'o', markersize = 5, label = 'Min Test Loss', color = marker_color, zorder = 4)
bbox_props = dict(boxstyle = 'round,pad = 0.4', fc = 'white', lw = 0.8, alpha = 0.85)
ax1.annotate(
    f'Min: {min_test_loss:.4f} @ Iter {min_index}',
    xy = (min_index, min_test_loss),
    xytext = (min_index - 8000, min_test_loss + 700),   # fixed offset in data coordinates
    textcoords = 'data',
    arrowprops = dict(arrowstyle = '-|>', color = marker_color, lw = 1.2, zorder = 4),
    fontsize = 16,
    color = marker_color,
    bbox = bbox_props,
    zorder = 4,
)
ax1.legend(loc = 'upper right', fontsize = 12, framealpha = 0.9)
ax1.set_title('SGD + Momentum + Inverse Time Decay : Loss & Learning Rate', fontsize = 16, fontweight = 'bold', pad = 24)
ax1.grid(True, linestyle = 'dashed', linewidth = 0.5, alpha = 0.4, zorder = 0)

# Secondary axis: learning-rate curve.
ax2 = ax1.twinx()
lr_color = '#2ca02c'         # green
ax2.set_ylabel('Learning Rate', color = lr_color, **bold_label)
ax2.plot(lr_history_inverse_time, label = 'Learning Rate', color = lr_color, linestyle = 'dotted', linewidth = 4)
ax2.tick_params(axis = 'y', labelcolor = lr_color)

# Let matplotlib fit the layout.
fig.tight_layout()
#plt.savefig('ex1_16.png', dpi = 720, bbox_inches = 'tight')
plt.show()


### 實驗 Ex1-21 / Ex1-22 / Ex1-23 Exponential Decay

In [None]:
# --- Data generation (same recipe and seed as the other experiment cells) ----
# The order of the np.random calls fixes the dataset; do not reorder them.
np.random.seed(42)
n_samples = 500
area = np.random.uniform(10, 50, n_samples)
# Bedroom count is tied to area: area/15 clipped to [0, 4], plus Gaussian noise
# (std 0.5), rounded to an integer and clipped again.
expected_bedrooms = np.clip((area/15), 0, 4)
bedrooms = np.random.normal(expected_bedrooms, 0.5)
bedrooms = np.round(bedrooms).astype(int)
bedrooms = np.clip(bedrooms, 0, 4)
age = np.random.uniform(0, 30, n_samples)
noise = np.random.normal(0, 2, n_samples)
rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

X = np.column_stack((area, bedrooms, age))
y = rent

# 80/20 split; the scaler is fitted on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

# Show the per-feature statistics the scaler learned from the training split.
mean = scaler.mean_
scale = scaler.scale_
print('mean = ', np.round(mean, 4))
print('scale = ', np.round(scale, 4))

w = np.random.rand(3)
b = np.random.rand()

initial_lr = 0.001
iterations = 25000
lambda_reg = 0.001
decay_rate = 0.0001     # exponent coefficient of the exponential decay
beta = 0.9              # weight given to the previous velocity
# Velocity terms start at zero.
v_w = np.zeros_like(w)
v_b = 0

# Per-iteration traces.
lr_history_exponential = []
loss_history_train_ex1_21 = []
loss_history_test_ex1_21 = []
w_history = []
b_history = []

# --- SGD with momentum and exponential learning-rate decay --------------------
for i in range(iterations):
    # One random training row per step.
    idx = np.random.randint(0, len(X_train_standardized))
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Single-sample squared-error gradient plus the L2 penalty on w.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # Velocity is an exponential moving average of the gradients.
    v_w = beta * v_w + (1 - beta) * w_gradient
    v_b = beta * v_b + (1 - beta) * b_gradient

    # lr_i = initial_lr * exp(-decay_rate * i)
    learning_rate = initial_lr * math.exp(-decay_rate * i)
    lr_history_exponential.append(learning_rate)

    w -= learning_rate * v_w
    b -= learning_rate * v_b

    # Copy w because it is updated in place.
    w_history.append(w.copy())
    b_history.append(b)

    # Full-split losses at the updated parameters (train includes the penalty).
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train_ex1_21.append(loss_train)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)
    loss_history_test_ex1_21.append(loss_test)

    if i % 50 == 0 or i == (iterations-1):
        print(f'iteration {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train_Loss = {loss_train:.4f}, Test_Loss = {loss_test:.4f}')

# Locate the iteration with the lowest test loss and the state recorded for it.
min_index = np.argmin(loss_history_test_ex1_21)
min_test_loss = loss_history_test_ex1_21[min_index]
best_w = w_history[min_index]
best_b = b_history[min_index]
lr_at_test_loss = lr_history_exponential[min_index]

# One-row summary table: final-iteration losses next to the best-test-loss snapshot.
summary_data = {
    'Final Train Loss' : [np.round(loss_history_train_ex1_21[iterations-1], 4)],
    'Final Test Loss' : [np.round(loss_history_test_ex1_21[iterations-1], 4)],
    'Best Iteration' : [min_index],
    'Train Loss @ Best Test' : [np.round(loss_history_train_ex1_21[min_index], 4)],
    'Best Test Loss' : [np.round(min_test_loss, 4)],
    'w (params)' : [np.round(best_w, 4)],
    'b (bias)' : [np.round(best_b, 4)],
    # With exponential decay the rate falls below 1e-4 late in the run, where
    # four decimals would collapse it to 0.0001 or 0.0; keep six. The label
    # capitalisation now matches the 'Train Loss @ Best Test' column.
    'Learning Rate @ Best Test' : [np.round(lr_at_test_loss, 6)]
}

df_summary = pd.DataFrame(summary_data)
display(df_summary)

# Figure style.
plt.style.use('seaborn-v0_8-darkgrid')

fig, ax1 = plt.subplots(figsize = (8, 5.5))
fig.text(0.5, 0.893, f'SGD with Momentum | decay rate = {decay_rate}, λ = {lambda_reg}, β= {beta}', ha = 'center', fontsize = 12, style = 'italic')

# Primary axis: loss curves.
color1 = '#1f77b4'       # blue; was '#1f77bc', which differs from the blue used by the earlier experiment plots
color2 = '#ff7f0e'       # orange
color3 = '#003366'       # dark blue, used for the minimum marker and annotation
ax1.set_xlabel('Iterations', fontsize = 14, fontweight = 'bold')
ax1.set_ylabel('Loss', color = color1, fontsize = 14, fontweight = 'bold')
ax1.plot(loss_history_train_ex1_21, label = 'Train Loss', color = color1, linewidth = 3, alpha = 0.85, zorder = 3)
ax1.plot(loss_history_test_ex1_21, label = 'Test Loss', color = color2, linestyle = 'dashed', linewidth = 3, alpha = 0.85, zorder = 3)
ax1.tick_params(axis = 'y', labelcolor = color1)
ax1.plot(min_index, min_test_loss, 'o', markersize = 5, label = 'Min Test Loss', color = color3, zorder = 4)
bbox_props = dict(boxstyle = 'round,pad = 0.4', fc = 'white', lw = 0.8, alpha = 0.85)
# The annotation text sits at a fixed offset (in data coordinates) from the marker.
ax1.annotate(f'Min: {min_test_loss:.4f} @ Iter {min_index}', xy = (min_index, min_test_loss), xytext = (min_index - 10000, min_test_loss + 500), textcoords = 'data', arrowprops = dict(arrowstyle = '-|>', color = color3, lw = 1.2, zorder = 4), fontsize = 16, color = color3, bbox = bbox_props, zorder = 4)
ax1.legend(loc = 'upper right', fontsize = 12, framealpha = 0.9)
ax1.set_title('SGD + Momentum + Exponential Decay : Loss & Learning Rate', fontsize = 16, fontweight = 'bold', pad = 24)
ax1.grid(True, linestyle = 'dashed', linewidth = 0.5, alpha = 0.4, zorder = 0)

# Secondary axis: learning-rate curve.
ax2 = ax1.twinx()
color3 = '#2ca02c'       # green
ax2.set_ylabel('Learning Rate', color = color3, fontsize = 14, fontweight = 'bold')
ax2.plot(lr_history_exponential, label = 'Learning Rate', color = color3, linestyle = 'dotted', linewidth = 4)
ax2.tick_params(axis = 'y', labelcolor = color3)

# Let matplotlib fit the layout.
fig.tight_layout()
#fig.savefig('ex1_22.png', dpi = 720, bbox_inches = 'tight')
# plt.show() matches the other plotting cells; Figure.show() is meant for GUI
# backends and may only emit a warning under a non-GUI (inline) backend.
plt.show()




### 實驗 Ex1-24 / Ex1-25 / Ex1-26 Step Decay

In [None]:
# --- Data generation (same recipe and seed as the other experiment cells) ----
# The order of the np.random calls fixes the dataset; do not reorder them.
np.random.seed(42)
n_samples = 500
area = np.random.uniform(10, 50, n_samples)
# Bedroom count is tied to area: area/15 clipped to [0, 4], plus Gaussian noise
# (std 0.5), rounded to an integer and clipped again.
expected_bedrooms = np.clip((area/15), 0, 4)
bedrooms = np.random.normal(expected_bedrooms, 0.5)
bedrooms = np.round(bedrooms).astype(int)
bedrooms = np.clip(bedrooms, 0, 4)
age = np.random.uniform(0, 30, n_samples)
noise = np.random.normal(0, 2, n_samples)
rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

X = np.column_stack((area, bedrooms, age))
y = rent

# 80/20 split; the scaler is fitted on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

w = np.random.rand(3)
b = np.random.rand()

initial_lr = 0.05       # note: 50x the initial rate used by the other decay experiments
iterations = 25000
lambda_reg = 0.001
drop_rate = 0.5         # multiplicative factor applied at every step boundary
step_size = 2500        # number of iterations between drops
beta = 0.9              # weight given to the previous velocity
# Velocity terms start at zero.
v_w = np.zeros_like(w)
v_b = 0

# Per-iteration traces.
lr_history_step = []
loss_history_train_ex1_24 = []
loss_history_test_ex1_24 = []
w_history = []
b_history = []

# --- SGD with momentum and step learning-rate decay ---------------------------
for i in range(iterations):
    # One random training row per step.
    idx = np.random.randint(0, len(X_train_standardized))
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Single-sample squared-error gradient plus the L2 penalty on w.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # Velocity is an exponential moving average of the gradients.
    v_w = beta * v_w + (1-beta) * w_gradient
    v_b = beta * v_b + (1-beta) * b_gradient

    # lr_i = initial_lr * drop_rate ** floor(i / step_size)
    learning_rate = initial_lr * (drop_rate ** (i//step_size))
    lr_history_step.append(learning_rate)

    w -= learning_rate * v_w
    b -= learning_rate * v_b

    # Copy w because it is updated in place.
    w_history.append(w.copy())
    b_history.append(b)

    # Full-split losses at the updated parameters (train includes the penalty).
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w**2)
    loss_history_train_ex1_24.append(loss_train)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)
    loss_history_test_ex1_24.append(loss_test)

    if i % 50 == 0 or i == (iterations-1):
        print(f'iteration {i}: w = {np.round(w,4)}, b = {b:.4f}, Train Loss = {loss_train:.4f}, Test Loss = {loss_test:.4f}')

# Locate the iteration with the lowest test loss and the state recorded for it.
min_index = np.argmin(loss_history_test_ex1_24)
min_test_loss = loss_history_test_ex1_24[min_index]
best_w = w_history[min_index]
best_b = b_history[min_index]
lr_at_test_loss = lr_history_step[min_index]

# One-row summary table: final-iteration losses next to the best-test-loss snapshot.
summary_data = {
    'Final Train Loss' : [np.round(loss_history_train_ex1_24[iterations-1], 4)],
    'Final Test Loss' : [np.round(loss_history_test_ex1_24[iterations-1], 4)],
    # Capitalisation aligned with the other experiment summaries
    # (it previously read 'Best iteration').
    'Best Iteration' : [min_index],
    'Train Loss @ Best Test' : [np.round(loss_history_train_ex1_24[min_index],4)],
    'Best Test Loss' : [np.round(min_test_loss, 4)],
    'w (params)' : [np.round(best_w, 4)],
    'b (bias)' : [np.round(best_b, 4)],
    # The step schedule halves the rate every `step_size` iterations down to
    # about 1e-4; four decimals cannot tell the late steps apart, so keep six.
    'Learning Rate @ Best Test' : [np.round(lr_at_test_loss, 6)]
}

df_summary = pd.DataFrame(summary_data)
display(df_summary)

# Figure style.
plt.style.use('seaborn-v0_8-darkgrid')
fig, ax1 = plt.subplots(figsize = (8, 5.5))
fig.text(0.5, 0.893, f'SGD with Momentum | drop rate = {drop_rate}, step size = {step_size}, λ = {lambda_reg}, β= {beta}', ha = 'center', fontsize = 12, style = 'italic')

# Primary axis: loss curves.
color1 = '#1f77b4'       # blue; was '#1f77bc', which differs from the blue used by the earlier experiment plots
color2 = '#ff7f0e'       # orange
color3 = '#003366'       # dark blue, used for the minimum marker and annotation
ax1.set_xlabel('Iterations', fontsize = 14, fontweight = 'bold')
ax1.set_ylabel('Loss', color = color1, fontsize = 14, fontweight = 'bold')
ax1.plot(loss_history_train_ex1_24, label = 'Train Loss', color = color1, linewidth = 3, alpha = 0.85, zorder = 3)
ax1.plot(loss_history_test_ex1_24, label = 'Test Loss', color = color2, linestyle = 'dashed', linewidth = 3, alpha = 0.85, zorder = 3)
ax1.tick_params(axis = 'y', labelcolor = color1)
ax1.plot(min_index, min_test_loss, 'o', markersize = 5, label = 'Min Test Loss', color = color3, zorder = 4)
bbox_props = dict(boxstyle = 'round,pad = 0.4', fc = 'white', lw = 0.8, alpha = 0.85)
# The annotation text sits at a fixed offset (in data coordinates) from the marker.
ax1.annotate(f'Min: {min_test_loss:.4f} @ Iter {min_index}', xy = (min_index, min_test_loss), xytext = (min_index + 4000, min_test_loss + 600), textcoords = 'data', arrowprops = dict(arrowstyle = '-|>', color = color3, lw = 1.2, zorder = 4), fontsize = 16, color = color3, bbox = bbox_props, zorder = 4)
ax1.legend(loc = 'upper right', fontsize = 12, framealpha = 0.9)
ax1.set_title('SGD + Momentum + Step Decay : Loss vs Learning Rate', fontsize = 16, fontweight = 'bold', pad = 24)
ax1.grid(True, linestyle = 'dashed', linewidth = 0.5, alpha = 0.4, zorder = 0)

# Secondary axis: learning-rate curve.
ax2 = ax1.twinx()
color3 = '#2ca02c'       # green
ax2.set_ylabel('Learning Rate', color = color3, fontsize = 14, fontweight = 'bold')
ax2.plot(lr_history_step, label = 'Learning Rate', color = color3, linestyle = 'dotted', linewidth = 4)
ax2.tick_params(axis = 'y', labelcolor = color3)

# Let matplotlib fit the layout.
fig.tight_layout()
#fig.savefig('ex1_24.png', dpi = 720, bbox_inches = 'tight')
# plt.show() matches the other plotting cells; Figure.show() is meant for GUI
# backends and may only emit a warning under a non-GUI (inline) backend.
plt.show()




### 實驗 Ex1-27 / Ex1-28 / Ex1-29 SGD + Momentum + Polynomial Decay

In [None]:
# Ex1-27: single-sample SGD with momentum and a polynomial learning-rate decay.
# The cell regenerates the synthetic rent data, trains for `iterations` steps
# while recording losses / parameters / learning rate at every step, then
# summarises the run in a one-row table and plots it.

# --- Synthetic data -----------------------------------------------------------
np.random.seed(42)
n_samples = 500
area = np.random.uniform(10, 50, n_samples)
# Bedroom count follows area: mean area/15 (clipped to 0..4), Gaussian jitter,
# then rounded to an integer and clipped to 0..4 again.
expected_bedrooms = np.clip((area/15), 0, 4)
bedrooms = np.random.normal(expected_bedrooms, 0.5)
bedrooms = np.round(bedrooms).astype(int)
bedrooms = np.clip(bedrooms, 0, 4)
age = np.random.uniform(0, 30, n_samples)
noise = np.random.normal(0, 2, n_samples)
rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

X = np.column_stack((area, bedrooms, age))
y = rent

# 80/20 split; the scaler is fitted on the training split only and then
# applied unchanged to the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

# --- Parameters and hyperparameters -------------------------------------------
w = np.random.rand(3)
b = np.random.rand()

initial_lr = 0.001
iterations = 25000
lambda_reg = 0.001        # L2 penalty coefficient (applied to w only)
beta = 0.9                # momentum coefficient
v_w = np.zeros_like(w)    # running average of the w gradient
v_b = 0                   # running average of the b gradient
power = 2                 # exponent of the polynomial decay
log_every = 500           # progress is printed every `log_every` iterations

lr_history_polynomial = []
loss_history_train_ex1_27 = []
loss_history_test_ex1_27 = []
w_history = []
b_history = []

# --- Training loop ------------------------------------------------------------
for i in range(iterations):
    # Draw one training sample uniformly at random (with replacement).
    idx = np.random.randint(0, len(X_train_standardized))
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Gradient of the squared error on this sample plus the L2 term.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # Momentum as an exponential moving average of the gradients.
    v_w = beta * v_w + (1 - beta) * w_gradient
    v_b = beta * v_b + (1 - beta) * b_gradient

    # Polynomial decay: initial_lr at i = 0, approaching 0 as i -> iterations.
    learning_rate = initial_lr * (1 - (i / iterations)) ** power
    lr_history_polynomial.append(learning_rate)

    w -= learning_rate * v_w
    b -= learning_rate * v_b

    # w is updated in place, so a copy is stored to keep each snapshot distinct.
    w_history.append(w.copy())
    b_history.append(b)

    # Full-split losses, evaluated after the update of this iteration.
    y_train_pred = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_train_pred
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train_ex1_27.append(loss_train)

    y_test_pred = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_test_pred
    loss_test = np.mean(error_test ** 2)
    loss_history_test_ex1_27.append(loss_test)

    if i % log_every == 0 or i == (iterations-1):
        print(f'iteration {i}: w = {np.round(w, 4)}, b = {b:.4f}, Train Loss = {loss_train:.4f}, Test Loss = {loss_test:.4f}')

# --- Summary ------------------------------------------------------------------
# NOTE: the "best" iteration is picked on the test split, so the best test loss
# reported below is selected on the same data it is measured on.
min_index = np.argmin(loss_history_test_ex1_27)
min_test_loss = loss_history_test_ex1_27[min_index]
best_w = w_history[min_index]
best_b = b_history[min_index]
lr_at_test_loss = lr_history_polynomial[min_index]

summary_data = {
    'Final Train' : [np.round(loss_history_train_ex1_27[-1], 4)],
    'Final Test' : [np.round(loss_history_test_ex1_27[-1], 4)],
    'Best iteration' : [min_index],
    'Train Loss @ Best Test' : [np.round(loss_history_train_ex1_27[min_index], 4)],
    'Best Test Loss' : [np.round(min_test_loss, 4)],
    'w (params)' : [np.round(best_w, 4)],
    'b (bias)' : [np.round(best_b, 4)],
    # Learning rates here are <= 1e-3, so rounding to 4 decimals would leave at
    # most one significant digit (or 0.0); keep 4 significant figures instead.
    'Learning Rate @ Best Test' : [float(f'{lr_at_test_loss:.4g}')]
}

df_summary = pd.DataFrame(summary_data)
display(df_summary)

# --- Plot: losses (left axis) and learning rate (right axis) --------------------
plt.style.use('seaborn-v0_8-darkgrid')
fig, ax1 = plt.subplots(figsize = (8, 5.5))
fig.text(0.5, 0.893, f'SGD with Momentum | power = {power}, λ = {lambda_reg}, β= {beta}', ha = 'center', fontsize = 12, style = 'italic')

# Primary axis: loss curves
color1 = '#1f77bc'
color2 = '#ff7f0e'
color3 = '#003366'
ax1.set_xlabel('Iterations', fontsize = 14, fontweight = 'bold')
ax1.set_ylabel('Loss', color = color1, fontsize = 14, fontweight = 'bold')
ax1.plot(loss_history_train_ex1_27, label = 'Train Loss', color = color1, linewidth = 3, alpha = 0.85, zorder = 3)
ax1.plot(loss_history_test_ex1_27, label = 'Test Loss', color = color2, linestyle = 'dashed', linewidth = 3, alpha = 0.85, zorder = 3)
ax1.tick_params(axis = 'y', labelcolor = color1)
ax1.plot(min_index, min_test_loss, 'o', markersize = 5, label = 'Min Test Loss', color = color3, zorder = 4)
bbox_props = dict(boxstyle = 'round,pad = 0.4', fc = 'white', lw = 0.8, alpha = 0.85)
# The annotation text sits at a fixed offset (in data coordinates) from the marker.
ax1.annotate(f'Min: {min_test_loss:.4f} @ Iter {min_index}', xy = (min_index, min_test_loss), xytext = (min_index - 11000, min_test_loss + 600), textcoords = 'data', arrowprops = dict(arrowstyle = '-|>', color = color3, lw = 1.2, zorder = 4), fontsize = 16, color = color3, bbox = bbox_props, zorder = 4)
ax1.set_title('SGD + Momentum + Polynomial Decay : Loss vs Learning Rate', fontsize = 16, fontweight = 'bold', pad = 24)
ax1.grid(True, linestyle = 'dashed', linewidth = 0.5, alpha = 0.4, zorder = 0)

# Secondary axis: learning-rate curve (color3 is rebound to the LR colour)
color3 = '#2ca02c'
ax2 = ax1.twinx()
ax2.set_ylabel('Learning Rate', color = color3, fontsize = 14, fontweight = 'bold')
ax2.plot(lr_history_polynomial, label = 'Learning Rate', color = color3, linestyle = 'dotted', linewidth = 4)
ax2.tick_params(axis = 'y', labelcolor = color3)

# One legend for both axes, so the learning-rate curve is listed as well.
# It is attached to ax2, which is drawn on top of ax1.
loss_handles, loss_labels = ax1.get_legend_handles_labels()
lr_handles, lr_labels = ax2.get_legend_handles_labels()
ax2.legend(loss_handles + lr_handles, loss_labels + lr_labels, loc = 'upper right', fontsize = 12, framealpha = 0.9)

# Let matplotlib adjust the layout automatically
fig.tight_layout()
#fig.savefig('ex1_29.png', dpi = 720, bbox_inches = 'tight')
# plt.show() (as used by the later cells) instead of fig.show(), which warns
# when the active backend is a non-GUI one.
plt.show()



### 實驗 Ex1-30 / Ex1-31 / Ex1-32 SGD + Momentum + Cosine Annealing

In [None]:
# Ex1-30: single-sample SGD with momentum and a cosine-annealed learning rate.
# The cell regenerates the synthetic rent data, trains for `iterations` steps
# while recording losses / parameters / learning rate at every step, then
# summarises the run in a one-row table and plots it.

# --- Synthetic data -----------------------------------------------------------
np.random.seed(42)
n_samples = 500
area = np.random.uniform(10, 50, n_samples)
# Bedroom count follows area: mean area/15 (clipped to 0..4), Gaussian jitter,
# then rounded to an integer and clipped to 0..4 again.
expected_bedrooms = np.clip((area/15), 0, 4)
bedrooms = np.random.normal(expected_bedrooms, 0.5)
bedrooms = np.round(bedrooms).astype(int)
bedrooms = np.clip(bedrooms, 0, 4)
age = np.random.uniform(0, 30, n_samples)
noise = np.random.normal(0, 2, n_samples)
rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

X = np.column_stack((area, bedrooms, age))
y = rent

# 80/20 split; the scaler is fitted on the training split only and then
# applied unchanged to the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

# --- Parameters and hyperparameters -------------------------------------------
w = np.random.rand(3)
b = np.random.rand()

initial_lr = 0.001        # learning rate at i = 0 (upper end of the schedule)
iterations = 25000
lambda_reg = 0.001        # L2 penalty coefficient (applied to w only)
min_lr = 0.0005           # lower end of the schedule
beta = 0.9                # momentum coefficient
v_w = np.zeros_like(w)    # running average of the w gradient
v_b = 0                   # running average of the b gradient
log_every = 500           # progress is printed every `log_every` iterations

lr_history_cosine_annealing = []
loss_history_train_ex1_30 = []
loss_history_test_ex1_30 = []
w_history = []
b_history = []

# --- Training loop ------------------------------------------------------------
for i in range(iterations):
    # Draw one training sample uniformly at random (with replacement).
    idx = np.random.randint(0, len(X_train_standardized))
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Gradient of the squared error on this sample plus the L2 term.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # Momentum as an exponential moving average of the gradients.
    v_w = beta * v_w + (1 - beta) * w_gradient
    v_b = beta * v_b + (1 - beta) * b_gradient

    # Cosine annealing over the whole run: half a cosine period that starts at
    # initial_lr and approaches min_lr as i -> iterations.
    learning_rate = min_lr + (1/2) * (initial_lr - min_lr) * (1 + math.cos(i / iterations * math.pi))
    lr_history_cosine_annealing.append(learning_rate)

    w -= learning_rate * v_w
    b -= learning_rate * v_b

    # w is updated in place, so a copy is stored to keep each snapshot distinct.
    w_history.append(w.copy())
    b_history.append(b)

    # Full-split losses, evaluated after the update of this iteration.
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train_ex1_30.append(loss_train)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)
    loss_history_test_ex1_30.append(loss_test)

    if i % log_every == 0 or i == (iterations-1):
        print(f'iteration {i}: w = {np.round(w,4)}, b = {b:.4f}, Train Loss = {loss_train:.4f}, Test Loss = {loss_test:.4f}')

# --- Summary ------------------------------------------------------------------
# NOTE: the "best" iteration is picked on the test split, so the best test loss
# reported below is selected on the same data it is measured on.
min_index = np.argmin(loss_history_test_ex1_30)
min_test_loss = loss_history_test_ex1_30[min_index]
best_w = w_history[min_index]
best_b = b_history[min_index]
lr_at_test_loss = lr_history_cosine_annealing[min_index]

summary_data = {
    'Final Train Loss' : [np.round(loss_history_train_ex1_30[-1], 4)],
    'Final Test Loss' : [np.round(loss_history_test_ex1_30[-1], 4)],
    'Best iteration' : [min_index],
    'Train Loss @ Best Test' : [np.round(loss_history_train_ex1_30[min_index], 4)],
    'Best Test Loss' : [np.round(min_test_loss, 4)],
    'w (params)' : [np.round(best_w, 4)],
    'b (bias)' : [np.round(best_b, 4)],
    # The schedule lives in [min_lr, initial_lr] = [0.0005, 0.001]; rounding to
    # 4 decimals would leave a single significant digit, so keep 4 significant
    # figures instead.
    'Learning Rate @ Best Test' : [float(f'{lr_at_test_loss:.4g}')]
}

df_summary = pd.DataFrame(summary_data)
display(df_summary)

# --- Plot: losses (left axis) and learning rate (right axis) --------------------
plt.style.use('seaborn-v0_8-darkgrid')
fig, ax1 = plt.subplots(figsize = (8, 5.5))
fig.text(0.5, 0.893, f'SGD with Momentum | Max_lr = {initial_lr}, Min_lr = {min_lr}, λ = {lambda_reg}, β= {beta}', ha = 'center', fontsize = 12, style = 'italic')

# Primary axis: loss curves
color1 = '#1f77bc'
color2 = '#ff7f0e'
color3 = '#003366'
ax1.set_xlabel('Iterations', fontsize = 14, fontweight = 'bold')
ax1.set_ylabel('Loss', color = color1, fontsize = 14, fontweight = 'bold')
ax1.plot(loss_history_train_ex1_30, label = 'Train Loss', color = color1, linewidth = 3, alpha = 0.85, zorder = 3)
ax1.plot(loss_history_test_ex1_30, label = 'Test Loss', color = color2, linewidth = 3, linestyle = 'dashed', alpha = 0.85, zorder = 3)
ax1.tick_params(axis = 'y', labelcolor = color1)
ax1.plot(min_index, min_test_loss, 'o', markersize = 5, label = 'Min Test Loss', color = color3, zorder = 4)
bbox_props = dict(boxstyle = 'round,pad = 0.4', fc = 'white', lw = 0.8, alpha = 0.85)
# The annotation text sits at a fixed offset (in data coordinates) from the marker.
ax1.annotate(f'Min: {min_test_loss:.4f} @ Iter {min_index}', xy = (min_index, min_test_loss), xytext = (min_index - 8000, min_test_loss + 1000), textcoords = 'data', arrowprops = dict(arrowstyle = '-|>', color = color3, lw = 1.2, zorder = 4), fontsize = 16, color = color3, bbox = bbox_props, zorder = 4)
ax1.set_title('SGD + Momentum + Cosine Annealing Decay : Loss vs Learning Rate', fontsize = 16, fontweight = 'bold', pad = 24)
ax1.grid(True, linestyle = 'dashed', linewidth = 0.5, alpha = 0.4, zorder = 0)

# Secondary axis: learning-rate curve (color3 is rebound to the LR colour)
color3 = '#2ca02c'
ax2 = ax1.twinx()
ax2.set_ylabel('Learning Rate', color = color3, fontsize = 14, fontweight = 'bold')
ax2.plot(lr_history_cosine_annealing, label = 'Learning Rate', color = color3, linestyle = 'dotted', linewidth = 4)
ax2.tick_params(axis = 'y', labelcolor = color3)

# One legend for both axes, so the learning-rate curve is listed as well.
# It is attached to ax2, which is drawn on top of ax1.
loss_handles, loss_labels = ax1.get_legend_handles_labels()
lr_handles, lr_labels = ax2.get_legend_handles_labels()
ax2.legend(loss_handles + lr_handles, loss_labels + lr_labels, loc = 'upper right', fontsize = 12, framealpha = 0.9)

# Let matplotlib adjust the layout automatically
fig.tight_layout()
#fig.savefig('ex1_32.png', dpi = 720, bbox_inches = 'tight')
# plt.show() (as used by the later cells) instead of fig.show(), which warns
# when the active backend is a non-GUI one.
plt.show()

In [None]:
# SGD with momentum at a constant learning rate.
# This cell does not build its own data: it trains on X_train_standardized /
# y_train and evaluates on X_test_standardized / y_test as they currently exist
# in the kernel.
np.random.seed(42)
w = np.random.rand(3)        # uniform floats in [0, 1)
b = np.random.rand()

learning_rate, iterations = 0.001, 10000
lambda_reg, beta = 0.001, 0.9
v_w, v_b = np.zeros_like(w), 0      # momentum buffers for w and b

loss_history_train5, loss_history_test5 = [], []

# Shapes of the training inputs and targets, one per line.
print(X_train_standardized.shape, y_train.shape, sep = '\n')

n_train = len(X_train_standardized)
last_iteration = iterations - 1

for i in range(iterations):
    # One uniformly drawn training sample per step.
    idx = np.random.randint(0, n_train)
    x_i, y_i = X_train_standardized[idx], y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Per-sample squared-error gradient; w additionally carries the L2 term.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # Exponential moving average of the gradients, then the parameter step.
    v_w = beta * v_w + (1 - beta) * w_gradient
    v_b = beta * v_b + (1 - beta) * b_gradient
    w -= learning_rate * v_w
    b -= learning_rate * v_b

    # Losses over the full splits, measured after the step.
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)

    loss_history_train5.append(loss_train)
    loss_history_test5.append(loss_test)

    if i == last_iteration or not i % 500:
        print(f'iterations {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train_Loss = {loss_train:.4f}, Test_Loss = {loss_test:.4f}')

# Train / test loss against the iteration index.
sns.set(style = 'whitegrid')
loss_ax = plt.figure(figsize = (8, 6)).gca()
loss_ax.plot(loss_history_train5, label = 'Train Loss', color = 'blue')
loss_ax.plot(loss_history_test5, label = 'Test Loss', color = 'red', linestyle = 'dashed')
loss_ax.set_title('SGD + Momentum Loss vs Iterations', fontsize = 14)
loss_ax.set_xlabel('Iterations', fontsize = 12)
loss_ax.set_ylabel('Loss', fontsize = 12)
loss_ax.grid(True)
loss_ax.legend()
plt.show()



In [None]:
# SGD with momentum and an inverse-time learning-rate decay.
# Trains on X_train_standardized / y_train and evaluates on
# X_test_standardized / y_test as they currently exist in the kernel.
np.random.seed(42)
w = np.random.rand(3)
b = np.random.rand()

initial_lr, decay_rate = 0.003, 0.001
iterations = 10000
lambda_reg, beta = 0.01, 0.9
v_w, v_b = np.zeros_like(w), 0      # momentum buffers for w and b

loss_history_train6, loss_history_test6 = [], []

n_train = X_train_standardized.shape[0]
last_iteration = iterations - 1

for i in range(iterations):
    # One uniformly drawn training sample per step.
    idx = np.random.randint(0, n_train)
    x_i, y_i = X_train_standardized[idx], y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Per-sample squared-error gradient; w additionally carries the L2 term.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # Inverse time decay: initial_lr divided by a denominator that grows
    # linearly with the iteration index.
    learning_rate = initial_lr / (1 + decay_rate * i)

    # Exponential moving average of the gradients, then the parameter step.
    v_w = beta * v_w + (1 - beta) * w_gradient
    v_b = beta * v_b + (1 - beta) * b_gradient
    w -= learning_rate * v_w
    b -= learning_rate * v_b

    # Losses over the full splits, measured after the step.
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train
    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)

    loss_history_train6.append(loss_train)
    loss_history_test6.append(loss_test)

    if i == last_iteration or not i % 500:
        print(f'iterations {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train_Loss = {loss_train:.4f}, Test_Loss = {loss_test:.4f}')

# Train / test loss against the iteration index.
sns.set(style = 'whitegrid')
decay_ax = plt.figure(figsize = (8, 6)).gca()
decay_ax.plot(loss_history_train6, label = 'Train Loss', color = 'blue')
decay_ax.plot(loss_history_test6, label = 'Test Loss', color = 'red', linestyle = 'dashed')
decay_ax.set_title('SGD + Momentum + Learning rate decay Loss vs Iterations', fontsize = 14)
decay_ax.set_xlabel('Iterations', fontsize = 12)
decay_ax.set_ylabel('Loss', fontsize = 12)
decay_ax.grid(True)
decay_ax.legend()
plt.show()


In [None]:
# SGD with momentum and a selectable learning-rate schedule (default: SGDR,
# i.e. cosine annealing with warm restarts).
# Trains on X_train_standardized / y_train and evaluates on
# X_test_standardized / y_test as they currently exist in the kernel.
# `math` comes from the notebook's top import cell.
np.random.seed(42)

w = np.random.rand(3)
b = np.random.rand()

initial_lr = 0.001
iterations = 10000
decay_rate = 0.0001
lambda_reg = 0.001
beta = 0.9
log_every = 500           # progress is printed every `log_every` iterations

# Which schedule drives `learning_rate` in the loop below. The keys are the
# accepted values; the mapped text is what appears in the plot title.
schedule = 'sgdr'
schedule_names = {
    'polynomial'   : 'Polynomial Decay',
    'inverse_time' : 'Inverse Time Decay',
    'exponential'  : 'Exponential Decay',
    'cosine'       : 'Cosine Annealing',
    'sgdr'         : 'SGDR',
    'step'         : 'Step Decay',
}
if schedule not in schedule_names:
    raise ValueError(f'unknown schedule {schedule!r}; expected one of {sorted(schedule_names)}')

# Cosine Annealing / Cosine Annealing with Warm Restarts (SGDR)
eta_max = 0.001
eta_min = 1e-6
T_0 = 1000       # SGDR: length of the first cycle
T_i = T_0        # SGDR: length of the current cycle
T_start = 0      # SGDR: iteration at which the current cycle started

# Step Decay
drop_rate = 0.7
step_size = 1000

# Polynomial Decay
#   power = 1 : linear decay
#   power = 2 : the rate drops faster early on and flattens towards the end
#   power < 1 (e.g. 0.5) : the rate drops slowly at first and faster later
power = 1

v_w = np.zeros_like(w)
v_b = 0

loss_history_train7 = []
loss_history_test7 = []

lr_history = []

for i in range(iterations):
    # One uniformly drawn training sample per step.
    idx = np.random.randint(0, X_train_standardized.shape[0])
    x_i = X_train_standardized[idx]
    y_i = y_train[idx]

    y_pred_i = np.dot(x_i, w) + b
    error_i = y_i - y_pred_i

    # Per-sample squared-error gradient; w additionally carries the L2 term.
    w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
    b_gradient = (-2) * error_i

    # Learning rate for this step, according to the selected schedule.
    if schedule == 'sgdr':
        # When the current cycle is over, restart it here and double its length.
        if i - T_start >= T_i:
            T_start = i
            T_i *= 2
        T_cur = i - T_start      # position inside the current cycle
        learning_rate = eta_min + 0.5 * (eta_max - eta_min) * (1 + math.cos(math.pi * T_cur / T_i))
    elif schedule == 'cosine':
        learning_rate = eta_min + 0.5 * (eta_max - eta_min) * (1 + math.cos(math.pi * i / iterations))
    elif schedule == 'polynomial':
        learning_rate = initial_lr * (1 - i/iterations) ** power
    elif schedule == 'inverse_time':
        learning_rate = initial_lr / (1 + decay_rate * i)
    elif schedule == 'exponential':
        learning_rate = initial_lr * math.exp(-decay_rate * i)
    else:  # 'step' (the only remaining key of schedule_names)
        learning_rate = initial_lr * (drop_rate ** (i // step_size))

    lr_history.append(learning_rate)

    # Exponential moving average of the gradients, then the parameter step.
    v_w = beta * v_w + (1 - beta) * w_gradient
    v_b = beta * v_b + (1 - beta) * b_gradient

    w -= learning_rate * v_w
    b -= learning_rate * v_b

    # Losses over the full splits, measured after the step.
    y_pred_train = np.dot(X_train_standardized, w) + b
    error_train = y_train - y_pred_train

    loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
    loss_history_train7.append(loss_train)

    y_pred_test = np.dot(X_test_standardized, w) + b
    error_test = y_test - y_pred_test
    loss_test = np.mean(error_test ** 2)
    loss_history_test7.append(loss_test)

    if i % log_every == 0 or i == (iterations - 1):
        print(f'iterations {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train_Loss = {loss_train:.4f}, Test_Loss = {loss_test:.4f}')

fig, ax1 = plt.subplots(figsize = (10, 6))

# Primary axis: loss curves
color = 'tab:blue'
ax1.set_xlabel('Iterations', fontsize = 12)
ax1.set_ylabel('Loss', color = color , fontsize = 12)
ax1.plot(loss_history_train7, label = 'Train Loss', color = 'blue')
ax1.plot(loss_history_test7, label = 'Test Loss', color = 'red', linestyle = 'dashed')
ax1.tick_params(axis = 'y', labelcolor = color)
ax1.set_title(f'SGD + Momentum + {schedule_names[schedule]}: Loss vs Learning Rate', fontsize = 14)

# Secondary axis: learning-rate curve
ax2 = ax1.twinx()
color = 'tab:green'
ax2.set_ylabel('Learning Rate', color = color, fontsize = 12)
ax2.plot(lr_history, label = 'Learning Rate', color = color, linestyle = 'dotted')
ax2.tick_params(axis = 'y', labelcolor = color)

# One legend for both axes, so the learning-rate curve is listed as well.
# It is attached to ax2, which is drawn on top of ax1.
loss_handles, loss_labels = ax1.get_legend_handles_labels()
lr_handles, lr_labels = ax2.get_legend_handles_labels()
ax2.legend(loss_handles + lr_handles, loss_labels + lr_labels, loc = 'upper right')

fig.tight_layout()
plt.grid(True)
plt.show()