In [None]:
import numpy as np

## Gradient Descent

### Epoch=1:

- ${dw_1 = 0.2w_1 = 0.2 * (-5) = -1}$
- ${dw_2 = 4w_2 = 4 * (-2) = -8}$
- ${w_1 = w_1 - α * dw_1 = -5 - 0.4*(-1) = -4.6}$
- ${w_2 = w_2 - α*dw_2 = -2 - 0.4*(-8) = 1.2}$

### Epoch=2:
- ${dw_1 = 0.2w_1 = 0.2*(-4.6) = -0.92}$
- ${dw_2 = 4w_2 = 4*1.2 = 4.8}$
- ${w_1 = w_1 - α*dw_1 = -4.6 - 0.4 * (-0.92) = -4.232}$
- ${w_2 = w_2 - α*dw_2 = 1.2 - 0.4*4.8 = -0.72}$

In [None]:
def df_w(W):
  """
  Evaluate the gradient components dw1 and dw2 at the point W.

  Arguments:
  W -- np.array [w1, w2]
  Returns:
  np.array [dw1, dw2], where dw1 = 0.2 * w1 and dw2 = 4 * w2
  """
  first, second = W
  return np.array([0.2 * first, 4 * second])

In [None]:
def sgd(W, dW, lr):
  """
  Take one plain gradient-descent step: move W against dW, scaled by lr.

  Arguments:
  W -- np.array [w1, w2]: current weights
  dW -- np.array [dw1, dw2]: derivative values with respect to w1 and w2
  lr -- float: learning rate
  Returns:
  np.array [w1, w2]: the weights after the update (the input W is not modified)
  """
  step = lr * dW
  return W - step

In [None]:
def train_p1(optimizer, lr, epochs):
  """
  Search for the minimum by repeatedly applying the given optimizer,
  starting from the point [-5, -2] and using the gradient from df_w.

  Arguments:
  optimizer -- function called as optimizer(W, dW, lr); its return value
               becomes the new W
  lr -- float: learning rate
  epochs -- int: number of update iterations (epochs) to run
  Returns:
  results -- list: the starting point followed by [w1, w2] after each epoch
  """

  # Starting point of the search
  W = np.array([-5, -2], dtype=np.float32)
  # History of visited points; the starting point is entry 0
  results = [W]

  for i in range(epochs):
    # Gradient at the current point goes straight into the optimizer step
    W = optimizer(W, df_w(W), lr)
    results.append(W)
    print(f'Epoch {i + 1}: w1 = {W[0]}, w2 = {W[1]}')
  return results

In [None]:
# Plain gradient descent: learning rate 0.4, 30 epochs
train_sgd = train_p1(optimizer=sgd, lr=0.4, epochs=30)

Epoch 1: w1 = -4.6, w2 = 1.2000000000000002
Epoch 2: w1 = -4.231999999999999, w2 = -0.7200000000000002
Epoch 3: w1 = -3.893439999999999, w2 = 0.43200000000000016
Epoch 4: w1 = -3.5819647999999993, w2 = -0.2592000000000001
Epoch 5: w1 = -3.2954076159999994, w2 = 0.1555200000000001
Epoch 6: w1 = -3.0317750067199993, w2 = -0.09331200000000006
Epoch 7: w1 = -2.7892330061823993, w2 = 0.05598720000000004
Epoch 8: w1 = -2.5660943656878072, w2 = -0.03359232000000004
Epoch 9: w1 = -2.360806816432783, w2 = 0.020155392000000022
Epoch 10: w1 = -2.1719422711181604, w2 = -0.012093235200000017
Epoch 11: w1 = -1.9981868894287076, w2 = 0.007255941120000012
Epoch 12: w1 = -1.838331938274411, w2 = -0.0043535646720000085
Epoch 13: w1 = -1.691265383212458, w2 = 0.0026121388032000056
Epoch 14: w1 = -1.5559641525554613, w2 = -0.0015672832819200039
Epoch 15: w1 = -1.4314870203510244, w2 = 0.0009403699691520025
Epoch 16: w1 = -1.3169680587229424, w2 = -0.0005642219814912016
Epoch 17: w1 = -1.211610614025107, w

## Gradient Descent + Momentum

### epoch=1
- ${dw_1 = 0.2w_1 = 0.2*(-5) = -1}$
- ${dw_2 = 4w_2 = 4*(-2) = -8}$
- ${v_1 = β * v_1 + (1 - β)*dw_1 = 0.5*0 + (1-0.5)*(-1) = -0.5}$
- ${v_2 = β * v_2 + (1 - β)*dw_2 = 0.5*0 + (1-0.5)*(-8) = -4}$
- ${w_1 = w_1 - α*v_1 = -5 - 0.6 * (-0.5) = -4.7}$
- ${w_2 = w_2 - α*v_2 = -2 - 0.6 * (-4) = 0.4}$

### epoch=2
- ${dw_1 = 0.2w_1 = 0.2*(-4.7) = -0.94}$
- ${dw_2 = 4w_2 = 4*0.4 = 1.6}$
- ${v_1 = β * v_1 + (1 - β)*dw_1 = 0.5*(-0.5) + (1-0.5)*(-0.94) = -0.72}$
- ${v_2 = β * v_2 + (1 - β)*dw_2 = 0.5*(-4) + (1-0.5)*1.6 = -1.2}$
- ${w_1 = w_1 - α*v_1 = -4.7 - 0.6 * (-0.72) = -4.268}$
- ${w_2 = w_2 - α*v_2 = 0.4 - 0.6 * (-1.2) = 1.12}$

In [32]:
def df_w(W):
  """
  Evaluate the gradient at W = [w1, w2].

  Returns np.array [dw1, dw2] with dw1 = 0.2 * w1 and dw2 = 4 * w2.
  """
  a, b = W
  grad_w1, grad_w2 = 0.2 * a, 4 * b
  return np.array([grad_w1, grad_w2])

In [38]:
def sgd_momentum(W, dW, lr, beta, V):
  """
  Take one gradient-descent-with-momentum step.

  Arguments:
  W -- np.array [w1, w2]: current weights
  dW -- np.array [dw1, dw2]: gradient at W
  lr -- float: learning rate
  beta -- float: weight on the previous V in the moving average
  V -- np.array [v1, v2]: moving average of gradients from earlier steps
  Returns:
  (W, V) -- the updated weights and the updated moving average
  """
  # Blend the previous average with the fresh gradient
  velocity = beta * V + (1 - beta) * dW
  # Step along the averaged direction rather than the raw gradient
  return W - lr * velocity, velocity

In [39]:
def train_p2(optimizer, lr, beta, epochs):
  """
  Run a momentum-style optimizer from the point [-5, -2] using df_w's gradient.

  Arguments:
  optimizer -- function called as optimizer(W, dW, lr, beta, V), returning (W, V)
  lr -- float: learning rate
  beta -- float: forwarded unchanged to the optimizer
  epochs -- int: number of update iterations to run
  Returns:
  results -- list: the starting point followed by W after each epoch
  """
  # Starting point and a zero-initialised moving average of the same shape/dtype
  W = np.array([-5, -2], dtype=np.float32)
  V = np.zeros_like(W)
  results = [W]
  for i in range(epochs):
    W, V = optimizer(W, df_w(W), lr, beta, V)
    results.append(W)
    print(f'Epoch {i + 1}: w1 = {W[0]}, w2 = {W[1]}')
  return results

In [40]:
# Gradient descent with momentum: learning rate 0.6, beta 0.5, 30 epochs
train_sgd_momentum = train_p2(optimizer=sgd_momentum, lr=0.6, beta=0.5, epochs=30)

Epoch 1: w1 = -4.7, w2 = 0.3999999999999999
Epoch 2: w1 = -4.268, w2 = 1.12
Epoch 3: w1 = -3.7959199999999997, w2 = 0.13600000000000012
Epoch 4: w1 = -3.3321248, w2 = -0.5192
Epoch 5: w1 = -2.900299712, w2 = -0.22376000000000013
Epoch 6: w1 = -2.5103691852799996, w2 = 0.19247199999999992
Epoch 7: w1 = -2.1647817708031996, w2 = 0.16962160000000004
Epoch 8: w1 = -1.8621011573166075, w2 = -0.04534951999999995
Epoch 9: w1 = -1.599034781134315, w2 = -0.09841565599999999
Epoch 10: w1 = -1.3715595061751098, w2 = -0.0068499368000000255
Epoch 11: w1 = -1.1755282983250006, w2 = 0.04715284695999999
Epoch 12: w1 = -1.006980996500446, w2 = 0.01757082248800001
Epoch 13: w1 = -0.8622884857981419, w2 = -0.018305176733599993
Epoch 14: w1 = -0.7382049212991013, w2 = -0.01427696426408
Epoch 15: w1 = -0.6318708437716349, w2 = 0.004869499087575998
Epoch 16: w1 = -0.5407915543816036, w2 = 0.0085993318583128
Epoch 17: w1 = -0.4628044164236918, w2 = 0.00014505001370584102
Epoch 18: w1 = -0.39604258245931434, 

## RMSProp

### epoch=1
- ${dw_1 = 0.2w_1 = 0.2*(-5) = -1}$
- ${dw_2 = 4w_2 = 4*(-2) = -8}$
- ${s_1 = γ*s_1 + (1 -γ) * dw_1^2 = 0.9*0 + (1-0.9) * (-1)^2 = 0.1}$
- ${s_2 = γ*s_2 + (1 -γ) * dw_2^2 = 0.9*0 + (1-0.9) * (-8)^2 = 6.4}$
- ${w_1 = w_1 - α * \frac{dw_1}{\sqrt{s_1 + ϵ}} = -5 - 0.3 * \frac{-1}{\sqrt{0.1 + 10^{-6}}} = -4.051}$
- ${w_2 = w_2 - α * \frac{dw_2}{\sqrt{s_2 + ϵ}} = -2 - 0.3 * \frac{-8}{\sqrt{6.4 + 10^{-6}}} = -1.051}$

### epoch=2
- ${dw_1 = 0.2w_1 = 0.2*(-4.051) = -0.8102}$
- ${dw_2 = 4w_2 = 4*(-1.051) = -4.204}$
- ${s_1 = γ*s_1 + (1 -γ) * dw_1^2 = 0.9*0.1 + (1-0.9) * (-0.8102)^2 = 0.156}$
- ${s_2 = γ*s_2 + (1 -γ) * dw_2^2 = 0.9*6.4 + (1-0.9) * (-4.204)^2 = 7.527}$
- ${w_1 = w_1 - α * \frac{dw_1}{\sqrt{s_1 + ϵ}} = -4.051 - 0.3 * \frac{-0.8102}{\sqrt{0.156 + 10^{-6}}} = -3.436}$
- ${w_2 = w_2 - α * \frac{dw_2}{\sqrt{s_2 + ϵ}} = -1.051 - 0.3 * \frac{-4.204}{\sqrt{7.527 + 10^{-6}}} = -0.591}$

In [41]:
def df_w(W):
  """
  Evaluate the gradient at W = [w1, w2].

  Returns np.array [dw1, dw2] with dw1 = 0.2 * w1 and dw2 = 4 * w2.
  """
  u, v = W
  return np.array([0.2 * u, 4 * v])

In [42]:
def RMSProp(W, dW, lr, S, gamma):
  """
  Take one RMSProp step.

  Arguments:
  W -- np.array [w1, w2]: current weights
  dW -- np.array [dw1, dw2]: gradient at W
  lr -- float: base learning rate
  S -- np.array [s1, s2]: moving average of squared gradients from earlier steps
  gamma -- float: weight on the previous S in the moving average
  Returns:
  (W, S) -- the updated weights and the updated squared-gradient average
  """
  epsilon = 1e-6
  squared_avg = gamma * S + (1 - gamma) * dW ** 2
  # epsilon is added inside the square root, keeping the divisor non-zero
  scaled_lr = lr / np.sqrt(squared_avg + epsilon)
  return W - scaled_lr * dW, squared_avg

In [43]:
def train_p3(optimizer, lr, epoch, gamma=0.9):
  """
  Run an RMSProp-style optimizer from the point [-5, -2] using df_w's gradient.

  Arguments:
  optimizer -- function called as optimizer(W, dW, lr, S, gamma), returning (W, S)
  lr -- float: learning rate
  epoch -- int: number of update iterations to run
  gamma -- float: decay rate forwarded to the optimizer; defaults to 0.9,
           the value that was previously hard-coded in the loop
  Returns:
  results -- list: the starting point followed by W after each epoch
  """
  # Starting point and zero-initialised squared-gradient average
  W = np.array([-5, -2], dtype=np.float32)
  S = np.array([0, 0], dtype=np.float32)
  results = [W]
  for i in range(epoch):
    dW = df_w(W)
    W, S = optimizer(W, dW, lr, S, gamma)
    results.append(W)
    print(f'Epoch {i + 1}: w1 = {W[0]}, w2 = {W[1]}')
  return results

In [44]:
# RMSProp: learning rate 0.3, 30 epochs
train_RMSProp = train_p3(optimizer=RMSProp, lr=0.3, epoch=30)

Epoch 1: w1 = -4.051321445330401, w2 = -1.0513167760653601
Epoch 2: w1 = -3.435197540710313, w2 = -0.59152342591607
Epoch 3: w1 = -2.9589369293489796, w2 = -0.32943940499816177
Epoch 4: w1 = -2.5654628900149308, w2 = -0.1775648185723558
Epoch 5: w1 = -2.22920552377513, w2 = -0.09163256127358084
Epoch 6: w1 = -1.9362675156207105, w2 = -0.044944986580951356
Epoch 7: w1 = -1.6781768574274967, w2 = -0.020814229601575286
Epoch 8: w1 = -1.4493498477990567, w2 = -0.009035585595074875
Epoch 9: w1 = -1.245881993508816, w2 = -0.003645905472988451
Epoch 10: w1 = -1.0649030085077547, w2 = -0.0013535098945501255
Epoch 11: w1 = -0.9042022597717997, w2 = -0.00045644443087383875
Epoch 12: w1 = -0.7619964948529878, w2 = -0.0001375629281105624
Epoch 13: w1 = -0.6367784991349715, w2 = -3.62601019486888e-05
Epoch 14: w1 = -0.5272152373016314, w2 = -8.113374556116922e-06
Epoch 15: w1 = -0.4320785049217716, w2 = -1.47473411837664e-06
Epoch 16: w1 = -0.3501985066951055, w2 = -2.0278399084030024e-07
Epoch 17:

## Adam

### epoch=1
- ${dw_1 = 0.2w_1 = 0.2 * (-5) = -1}$
- ${dw_2 = 4w_2 = 4 * (-2) = -8}$
- ${v_1 = β_1 * v_1 + (1 - β_1) * dw_1 = 0.9*0 + (1-0.9) * (-1) = -0.1}$
- ${v_2 = β_1 * v_2 + (1 - β_1) * dw_2 = 0.9*0 + (1-0.9) * (-8) = -0.8}$
- ${s_1 = β_2 * s_1 + (1 - β_2) * dw_1^2 = 0.999*0 + (1 - 0.999) * (-1)^2 = 0.001}$
- ${s_2 = β_2 * s_2 + (1 - β_2) * dw_2^2 = 0.999*0 + (1 - 0.999) * (-8)^2 = 0.064}$
- ${v_{corr1} = \frac{v_1}{1-β_1^t} = \frac{-0.1}{1-0.9^1} = -1}$
- ${v_{corr2} = \frac{v_2}{1-β_1^t} = \frac{-0.8}{1-0.9^1} = -8}$
- ${s_{corr1} = \frac{s_1}{1-β_2^t} = \frac{0.001}{1-0.999^1} = 1}$
- ${s_{corr2} = \frac{s_2}{1-β_2^t} = \frac{0.064}{1-0.999^1} = 64}$
- ${w_1 = w_1 - α * \frac{v_{corr1}}{\sqrt{s_{corr1}} + ϵ} = -5 - 0.2 * \frac{-1}{\sqrt{1} + 10^{-6}} = -4.8}$
- ${w_2 = w_2 - α * \frac{v_{corr2}}{\sqrt{s_{corr2}} + ϵ} = -2 - 0.2 * \frac{-8}{\sqrt{64} + 10^{-6}} = -1.8}$

### epoch=2
- ${dw_1 = 0.2w_1 = 0.2 * (-4.8) = -0.96}$
- ${dw_2 = 4w_2 = 4 * (-1.8) = -7.2}$
- ${v_1 = β_1 * v_1 + (1 - β_1) * dw_1 = 0.9*(-0.1) + (1-0.9) * (-0.96) = -0.186}$
- ${v_2 = β_1 * v_2 + (1 - β_1) * dw_2 = 0.9*(-0.8) + (1-0.9) * (-7.2) = -1.44}$
- ${s_1 = β_2 * s_1 + (1 - β_2) * dw_1^2 = 0.999*0.001 + (1 - 0.999) * (-0.96)^2 = 0.0019206}$
- ${s_2 = β_2 * s_2 + (1 - β_2) * dw_2^2 = 0.999*0.064 + (1 - 0.999) * (-7.2)^2 = 0.115776}$
- ${v_{corr1} = \frac{v_1}{1-β_1^t} = \frac{-0.186}{1-0.9^2} = -0.9789474}$
- ${v_{corr2} = \frac{v_2}{1-β_1^t} = \frac{-1.44}{1-0.9^2} = -7.5789474}$
- ${s_{corr1} = \frac{s_1}{1-β_2^t} = \frac{0.0019206}{1-0.999^2} = 0.9607804}$
- ${s_{corr2} = \frac{s_2}{1-β_2^t} = \frac{0.115776}{1-0.999^2} = 57.9169585}$
- ${w_1 = w_1 - α * \frac{v_{corr1}}{\sqrt{s_{corr1}} + ϵ} = -4.8 - 0.2 * \frac{-0.9789474}{\sqrt{0.9607804} + 10^{-6}} = -4.6002546}$
- ${w_2 = w_2 - α * \frac{v_{corr2}}{\sqrt{s_{corr2}} + ϵ} = -1.8 - 0.2 * \frac{-7.5789474}{\sqrt{57.9169585} + 10^{-6}} = -1.6008245}$

In [None]:
def df_w(W):
  """
  Evaluate the gradient at W = [w1, w2].

  Returns np.array [dw1, dw2] with dw1 = 0.2 * w1 and dw2 = 4 * w2.
  """
  p, q = W
  partials = [0.2 * p, 4 * q]
  return np.array(partials)

In [None]:
def adam(W, dW, lr, V, S, t, beta1=0.9, beta2=0.999, epsilon=1e-6):
  """
  Take one Adam step with bias-corrected first and second moments.

  Arguments:
  W -- np.array [w1, w2]: current weights
  dW -- np.array [dw1, dw2]: gradient at W
  lr -- float: learning rate
  V -- np.array [v1, v2]: moving average of gradients (first moment)
  S -- np.array [s1, s2]: moving average of squared gradients (second moment)
  t -- int: 1-based step counter used for bias correction
  beta1 -- float: decay rate of V
  beta2 -- float: decay rate of S
  epsilon -- float: added to sqrt(s_corr) so the divisor is never zero
  Returns:
  (W, V, S) -- updated weights and the updated, uncorrected moments
  Raises:
  ValueError -- if t < 1; the bias-correction denominators 1 - beta**t would
                be zero (t = 0) or negative, silently corrupting W
  """
  if t < 1:
    raise ValueError(f"t must be >= 1 for bias correction, got {t}")
  V = beta1 * V + (1 - beta1) * dW
  S = beta2 * S + (1 - beta2) * (dW ** 2)
  # Early moments are biased toward their zero initialisation; rescale them
  v_corr = V / (1 - beta1 ** t)
  s_corr = S / (1 - beta2 ** t)
  # epsilon sits outside the square root here
  W = W - lr * (v_corr / (np.sqrt(s_corr) + epsilon))
  return W, V, S

In [None]:
def train_p4(optimizer, lr, epochs):
  """
  Run an Adam-style optimizer from the point [-5, -2] using df_w's gradient.

  Arguments:
  optimizer -- function called as optimizer(W, dW, lr, V, S, t), returning (W, V, S)
  lr -- float: learning rate
  epochs -- int: number of update iterations to run
  Returns:
  results -- list: the starting point followed by W after each epoch
  """
  W = np.array([-5, -2], dtype=np.float32)
  # Both moment estimates start at zero, matching W's shape and dtype
  V = np.zeros_like(W)
  S = np.zeros_like(W)
  results = [W]
  for i in range(epochs):
    # The step counter passed to the optimizer is 1-based
    W, V, S = optimizer(W, df_w(W), lr, V, S, i + 1)
    results.append(W)
    print(f'Epoch {i + 1}: w1 = {W[0]}, w2 = {W[1]}')
  return results

In [None]:
# Adam: learning rate 0.2, 30 epochs
train_adam = train_p4(optimizer=adam, lr=0.2, epochs=30)

Epoch 1: w1 = -4.8000001999998, w2 = -1.8000000249999968
Epoch 2: w1 = -4.600254779434054, w2 = -1.6008245063697515
Epoch 3: w1 = -4.400948476628311, w2 = -1.4031726206945152
Epoch 4: w1 = -4.2022776366594705, w2 = -1.2078782223488431
Epoch 5: w1 = -4.004450327821214, w2 = -1.015927446346848
Epoch 6: w1 = -3.807686378997748, w2 = -0.8284730661322335
Epoch 7: w1 = -3.6122173226091405, w2 = -0.6468415893870743
Epoch 8: w1 = -3.4182862261081466, w2 = -0.4725276521059605
Epoch 9: w1 = -3.2261473934546006, w2 = -0.3071693439456018
Epoch 10: w1 = -3.036065916693978, w2 = -0.15249855183024877
Epoch 11: w1 = -2.848317056874701, w2 = -0.010263256257146358
Epoch 12: w1 = -2.663185433233414, w2 = 0.11787552325788148
Epoch 13: w1 = -2.4809640000598776, w2 = 0.23046161354014214
Epoch 14: w1 = -2.301952792136848, w2 = 0.32635870212860313
Epoch 15: w1 = -2.126457422346911, w2 = 0.404841946592144
Epoch 16: w1 = -1.9547873191379472, w2 = 0.4656496111781283
Epoch 17: w1 = -1.7872536971852042, w2 = 0.508