### 實驗 Ex1-42 Adam Optimizer

In [10]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [11]:
def run_ex1_42_full(initial_lr, iterations = 600, lambda_reg = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, master_seed = 42, verbose = True, print_every = 50):
    np.random.seed(master_seed)
    n_samples = 500
    area = np.random.uniform(10, 50, n_samples)
    expected_bedrooms = np.clip((area/15), 0, 4)
    bedrooms = np.random.normal(expected_bedrooms, 0.5)
    bedrooms = np.round(bedrooms).astype(int)
    bedrooms = np.clip(bedrooms, 0, 4)
    age = np.random.uniform(0, 30, n_samples)
    noise = np.random.normal(0, 2, n_samples)
    rent = 1.5 * area + 2 * bedrooms - 0.8 * age + 5 + noise

    X = np.column_stack((area, bedrooms, age))
    y = rent

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = master_seed)
    scaler = StandardScaler()
    X_train_standardized = scaler.fit_transform(X_train)
    X_test_standardized = scaler.transform(X_test)

    mean = scaler.mean_
    scale = scaler.scale_
    print(f'mean = ', np.round(mean, 4))
    print(f'scale = ', np.round(scale, 4))

    w = np.random.rand(X_train_standardized.shape[1])
    b = np.random.rand()

    m_w = np.zeros_like(w)
    m_b = 0
    v_w = np.zeros_like(w)
    v_b = 0

    lr_history = []
    loss_history_train = []
    loss_history_test = []
    w_history = []
    b_history = []

    for i in range(iterations):
        idx = np.random.randint(0, len(X_train_standardized))
        x_i = X_train_standardized[idx]
        y_i = y_train[idx]

        y_pred_i = np.dot(x_i , w) + b
        error_i = y_i - y_pred_i

        w_gradient = (-2) * x_i * error_i + 2 * lambda_reg * w
        b_gradient = (-2) * error_i

        m_w = beta1 * m_w + (1 - beta1) * w_gradient
        m_b = beta1 * m_b + (1 - beta1) * b_gradient
        v_w = beta2 * v_w + (1 - beta2) * (w_gradient ** 2)
        v_b = beta2 * v_b + (1 - beta2) * (b_gradient ** 2)

        m_w_hat = m_w / (1 - beta1 ** (i+1))
        m_b_hat = m_b / (1 - beta1 ** (i+1))
        v_w_hat = v_w / (1 - beta2 ** (i+1))
        v_b_hat = v_b / (1 - beta2 ** (i+1))

        w -= initial_lr * m_w_hat / (np.sqrt(v_w_hat) + epsilon)
        b -= initial_lr * m_b_hat / (np.sqrt(v_b_hat) + epsilon)

        w_history.append(w.copy())
        b_history.append(b)
        lr_history.append(initial_lr)

        y_pred_train = np.dot(X_train_standardized, w) + b
        error_train = y_train - y_pred_train
        loss_train = np.mean(error_train ** 2) + lambda_reg * np.sum(w ** 2)
        loss_history_train.append(loss_train)

        y_pred_test = np.dot(X_test_standardized, w) + b
        error_test = y_test - y_pred_test
        loss_test = np.mean(error_test ** 2)
        loss_history_test.append(loss_test)

        if verbose and (i % print_every == 0 or i == iterations-1):
            print(f'Iteration {i} : w = {np.round(w, 4)}, b = {b:.4f}, Train Loss = {loss_train:.4f}, Test Loss = {loss_test:.4f}')

    loss_history_test_arr = np.array(loss_history_test)
    min_index = np.argmin(loss_history_test_arr)
    min_test_loss = loss_history_test_arr[min_index]
    best_w = w_history[min_index]
    best_b = b_history[min_index]
    lr_at_best = lr_history[min_index]

    df_summary = pd.DataFrame({
        'Final Train Loss' : [np.round(loss_history_train[iterations-1], 4)],
        'Final Test Loss' : [np.round(loss_history_test[iterations-1], 4)],
        'Best Iteration' : [min_index],
        'Train Loss @ Best Test' : [np.round(loss_history_train[min_index], 4)],
        'Best Test Loss' : [np.round(min_test_loss, 4)],
        'w (params)' : [np.round(best_w, 4)],
        'b (bias)' : [np.round(best_b, 4)]
    })

    return min_test_loss, min_index, best_w, best_b, df_summary, loss_history_train, loss_history_test, lr_history


min_test_loss, min_index, best_w, best_b, df_summary, loss_history_train, loss_history_test, lr_history = run_ex1_42_full(initial_lr = 0.73, iterations = 600, master_seed = 42, verbose = True)

display(df_summary)

def lr_grid_search_ex1_42(lr_start = 0.2, lr_end = 8, lr_step = 0.01, iterations = 600, master_seed = 42):
    results = []
    best_lr = None
    best_loss = np.inf

    lr_list = np.arange(lr_start, lr_end + lr_step, lr_step)

    for lr in lr_list:
        print(f'Testing LR = {lr:.4f}')

        min_test_loss, min_index, best_w, best_b, df_summary, loss_history_train, loss_history_test, lr_history = run_ex1_42_full(initial_lr = lr, iterations = iterations, master_seed = master_seed, verbose = False)
        results.append({'lr' : lr, 'best_test_loss' : np.round(min_test_loss, 4)})

        if min_test_loss < best_loss:
            best_loss = min_test_loss
            best_lr = lr
    
    df = pd.DataFrame(results).sort_values('best_test_loss').reset_index(drop = True)
    return best_lr, df



mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Iteration 0 : w = [0.87   1.0068 1.7015], b = 1.0613, Train Loss = 2120.4871, Test Loss = 1769.7607
Iteration 50 : w = [11.3105  8.7845 -5.5953], b = 30.0086, Train Loss = 179.3534, Test Loss = 160.7741
Iteration 100 : w = [13.0002  7.5654 -7.1802], b = 41.2160, Train Loss = 14.9906, Test Loss = 16.1332
Iteration 150 : w = [13.4771  5.9705 -6.3813], b = 42.8149, Train Loss = 9.8503, Test Loss = 13.0272
Iteration 200 : w = [14.2082  5.0617 -5.8116], b = 42.4677, Train Loss = 8.8453, Test Loss = 11.9170
Iteration 250 : w = [16.0333  3.6931 -7.2388], b = 42.4191, Train Loss = 5.1114, Test Loss = 7.0648
Iteration 300 : w = [17.1564  3.1632 -6.7648], b = 42.6589, Train Loss = 4.1400, Test Loss = 5.8928
Iteration 350 : w = [17.4108  3.5625 -6.4898], b = 43.1365, Train Loss = 5.2103, Test Loss = 7.0357
Iteration 400 : w = [17.6227  2.7978 -6.8836], b = 43.6341, Train Loss = 4.9966, Test Loss = 6.8519
Iteration 450 : w = [17.

Unnamed: 0,Final Train Loss,Final Test Loss,Best Iteration,Train Loss @ Best Test,Best Test Loss,w (params),b (bias)
0,4.2155,6.2834,466,3.9742,5.4963,"[18.0809, 2.2462, -6.9819]",42.5888


In [12]:
best_lr, df_lr = lr_grid_search_ex1_42(lr_start = 0.2, lr_end = 8, lr_step = 0.01, iterations = 600, master_seed = 42)
print('Grid Search 找到最佳 LR = ', np.round(best_lr, 4))
display(df_lr.head(10))

Testing LR = 0.2000
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2100
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2200
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2300
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2400
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2500
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2600
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2700
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2800
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.2900
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.3000
mean =  [30.1526  1.99   14.6872]
scale =  [11.9856  1.0246  8.5246]
Testing LR = 0.3100
m

Unnamed: 0,lr,best_test_loss
0,0.73,5.4963
1,0.72,5.4965
2,0.74,5.4968
3,0.71,5.4971
4,0.75,5.4975
5,0.7,5.4981
6,0.76,5.4983
7,0.77,5.4997
8,0.69,5.4999
9,0.78,5.5005
