<a href="https://colab.research.google.com/github/KangaOnGit/AIO-354-Homework/blob/develop/Optimizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

In [None]:
def initializer(input, theta):
  """
    Evaluate the target function for a point and a coefficient vector.

    Every entry of `input` is squared and the result is combined with
    `theta` through a dot product, i.e. sum_i theta[i] * input[i]**2 for
    1-D arrays.

    Args:
      input: Point at which the function is evaluated (np.array)
      theta: Coefficients weighting each squared entry (np.array)

    Returns:
      Value of the target function at `input` (a scalar for 1-D arguments)
  """
  # NOTE: the parameter name shadows the builtin `input`; it is kept because
  # it is part of the signature.
  squared = input ** 2
  return squared.T.dot(theta)

# Evaluate the target function at the point (-5, -2) with coefficients (0.1, 2).
# NOTE: `input` shadows the Python builtin of the same name; the following
# cells reuse this array as the starting point.
input = np.array([-5, -2])
theta = np.array([0.1, 2])
f = initializer(input, theta)
print(f)

10.5


In [None]:
def df_w(W):
  """
    Gradient of the target function at the point W.

    The partial derivatives are hard-coded as 0.2 * w_1 and 4 * w_2.

    Args:
      W: Contains w_1 and w_2 at indices 0 and 1 (np.array)

    Returns:
      Gradient with respect to w_1 and w_2, in that order (np.array)
  """
  w_1, w_2 = W[0], W[1]
  return np.array([0.2 * w_1, 4 * w_2])

# Gradient at the starting point; later cells pass it to the optimizers.
dW = df_w(input)
print(dW)

[-1. -8.]


# **1. Gradient Descent**

---



In [None]:
def gd(W, dW, lr):
  """
    Take one plain gradient-descent step.

    Args:
      W: Contains params w_1 and w_2 (np.array)
      dW: Contains the gradient of w_1 and w_2 (np.array)
      lr: Learning rate (float)

    Returns:
      The new parameters W - lr * dW; the argument W is not modified
  """
  step = lr * dW
  return W - step

# One gradient-descent step from the starting point with lr = 0.4; the
# starting point is printed next to the updated one for comparison.
W = gd(input, dW, lr = 0.4)

print(input)
print(W)

[-5 -2]
[-4.6  1.2]


In [None]:
def train_gd(W, optim, lr, epochs):
  """
    Run `epochs` update steps and record every iterate.

    Each step evaluates the gradient with df_w at the current point and
    hands it to `optim` to obtain the next point.

    Args:
      W: Starting point, contains w_1 and w_2 (np.array)
      optim: Update rule called as optim(W, dW, lr) that returns the new W
      lr: Learning rate (float)
      epochs: Number of update steps (int)

    Returns:
      history: The starting point followed by the point after each step (list)
  """
  history = [W]
  current = W

  for _ in range(epochs):
    current = optim(current, df_w(current), lr)
    history.append(current)

  return history

# Run gradient descent from the same starting point for 30 steps and for 2 steps.
results = train_gd(input, gd, lr = 0.4, epochs = 30)
results2 = train_gd(input, gd, lr = 0.4, epochs = 2)
print(results)
print(results2)

[array([-5, -2]), array([-4.6,  1.2]), array([-4.232, -0.72 ]), array([-3.89344,  0.432  ]), array([-3.5819648, -0.2592   ]), array([-3.29540762,  0.15552   ]), array([-3.03177501, -0.093312  ]), array([-2.78923301,  0.0559872 ]), array([-2.56609437, -0.03359232]), array([-2.36080682,  0.02015539]), array([-2.17194227, -0.01209324]), array([-1.99818689,  0.00725594]), array([-1.83833194, -0.00435356]), array([-1.69126538,  0.00261214]), array([-1.55596415, -0.00156728]), array([-1.43148702e+00,  9.40369969e-04]), array([-1.31696806e+00, -5.64221981e-04]), array([-1.21161061e+00,  3.38533189e-04]), array([-1.11468176e+00, -2.03119913e-04]), array([-1.02550722e+00,  1.21871948e-04]), array([-9.43466646e-01, -7.31231688e-05]), array([-8.67989314e-01,  4.38739013e-05]), array([-7.98550169e-01, -2.63243408e-05]), array([-7.34666155e-01,  1.57946045e-05]), array([-6.75892863e-01, -9.47676268e-06]), array([-6.21821434e-01,  5.68605761e-06]), array([-5.72075719e-01, -3.41163456e-06]), array([-

# **2. Stochastic Gradient Descent with Momentum**

---



In [None]:
def compute_momentum(beta, V, dW):
  """
    Blend the previous velocity with the current gradient.

    Computes the weighted average beta * V + (1 - beta) * dW.

    Args:
      beta: Weight given to the previous velocity (float)
      V: Velocity from the previous step (np.array)
      dW: Contains the gradient of w_1 and w_2 (np.array)

    Returns:
      The updated velocity (np.array)
  """
  carried = beta * V
  fresh = (1 - beta) * dW
  return carried + fresh

# Start from zero velocity and apply one momentum update with beta = 0.5.
V = np.array([0, 0])
momentum = compute_momentum(beta = 0.5, V = V, dW = dW)
print(momentum)

[-0.5 -4. ]


In [None]:
def sgd(W, dW, lr, beta, V):
  """
    Take one gradient-descent step with momentum.

    The velocity is first refreshed by compute_momentum, then the weights
    move against that velocity scaled by the learning rate.

    Args:
      W: Contains w_1 and w_2 (np.array)
      dW: Contains the gradient of w_1 and w_2 (np.array)
      lr: Learning rate (float)
      beta: Weight given to the previous velocity (float)
      V: Velocity from the previous step (np.array)

    Returns:
      W: Contains updated w_1 and w_2 (np.array)
      V: The updated velocity, to be passed to the next step (np.array)
  """
  velocity = compute_momentum(beta=beta, dW=dW, V=V)
  return W - lr * velocity, velocity

# One momentum step from the starting point (lr = 0.6, beta = 0.5, zero velocity).
W, V_1 = sgd(input, dW, lr = 0.6, beta = 0.5, V = V)
print(W, V_1)

[-4.7  0.4] [-0.5 -4. ]


In [None]:
def train_sgd(W, dW, lr, beta, V, optim, epochs):
  """
    Run `epochs` momentum steps and record every iterate.

    Args:
      W: Starting point, contains w_1 and w_2 (np.array)
      dW: Not read; the gradient is recomputed with df_w before every step
      lr: Learning rate (float)
      beta: Weight given to the previous velocity (float)
      V: Initial velocity (np.array)
      optim: Update rule called with keywords W, dW, lr, beta and V that
        returns the pair (new W, new V)
      epochs: Number of update steps (int)

    Returns:
      history: The starting point followed by the point after each step (list)
  """
  history = [W]
  point, velocity = W, V

  for _ in range(epochs):
    grad = df_w(point)
    point, velocity = optim(W=point, dW=grad, lr=lr, beta=beta, V=velocity)
    history.append(point)

  return history

# Run the momentum optimizer from the same starting point for 30 steps and for 2 steps.
results = train_sgd(input, dW, lr = 0.6, beta = 0.5, V = V, optim = sgd, epochs = 30)
results2 = train_sgd(input, dW, lr = 0.6, beta = 0.5, V = V, optim = sgd, epochs = 2)
print(results)
print(results2)

[array([-5, -2]), array([-4.7,  0.4]), array([-4.268,  1.12 ]), array([-3.79592,  0.136  ]), array([-3.3321248, -0.5192   ]), array([-2.90029971, -0.22376   ]), array([-2.51036919,  0.192472  ]), array([-2.16478177,  0.1696216 ]), array([-1.86210116, -0.04534952]), array([-1.59903478, -0.09841566]), array([-1.37155951, -0.00684994]), array([-1.1755283 ,  0.04715285]), array([-1.006981  ,  0.01757082]), array([-0.86228849, -0.01830518]), array([-0.73820492, -0.01427696]), array([-0.63187084,  0.0048695 ]), array([-0.54079155,  0.00859933]), array([-4.62804416e-01,  1.45050014e-04]), array([-0.39604258, -0.00425615]), array([-0.33889911, -0.00134937]), array([-0.28999343,  0.00172326]), array([-0.24814098,  0.00119166]), array([-0.2123263 , -0.00050413]), array([-0.18167938, -0.00074707]), array([-1.55455157e-01,  2.79448010e-05]), array([-0.13301574,  0.00038192]), array([-1.13815082e-01,  1.00603444e-04]), array([-0.09738585, -0.00016078]), array([-8.33280829e-02, -9.85353344e-05]), ar

# **3. RMSProp**

---



In [None]:
def compute_S(s, gamma, dW):
  """
    Update the running average of squared gradients.

    Computes gamma * s + (1 - gamma) * dW**2.

    Args:
      s: Running average from the previous step (np.array)
      gamma: Weight given to the previous running average (float)
      dW: Contains the gradient of w_1 and w_2 (np.array)

    Returns:
      The updated running average (np.array)
  """
  decayed = gamma * s
  incoming = (1 - gamma) * dW ** 2
  return decayed + incoming

# Start from a zero running average and apply one update with gamma = 0.9.
s = np.array([0, 0])
S = compute_S(s = s, gamma = 0.9, dW = dW)
print(S)

[0.1 6.4]


In [None]:
def RMSProp(W, dW, lr, decay, s, gamma):
  """
    Take one RMSProp step.

    The running average of squared gradients is refreshed by compute_S and
    the gradient is divided by the square root of that average before the
    learning rate is applied.

    Args:
      W: Contains w_1 and w_2 (np.array)
      dW: Contains the gradient of w_1 and w_2 (np.array)
      lr: Learning rate (float)
      decay: Constant added to the running average inside the square root (float)
      s: Running average of squared gradients from the previous step (np.array)
      gamma: Weight given to the previous running average (float)

    Returns:
      W: Contains updated w_1 and w_2 (np.array)
      S_t: The updated running average, to be passed to the next step (np.array)
  """
  running = compute_S(s=s, gamma=gamma, dW=dW)
  scale = np.sqrt(running + decay)
  return W - lr * dW / scale, running

# One RMSProp step from the starting point (lr = 0.3, gamma = 0.9, zero running average).
W, S_t = RMSProp(W = input, dW = dW, lr = 0.3, gamma = 0.9, decay = 10**-6, s = s)
print(W)

[-4.05132145 -1.05131678]


In [None]:
def train_RMSPRop(W, dW, lr, decay, s, gamma, epochs, optim):
  """
    Run `epochs` RMSProp steps and record every iterate.

    Args:
      W: Starting point, contains w_1 and w_2 (np.array)
      dW: Not read; the gradient is recomputed with df_w before every step
      lr: Learning rate (float)
      decay: Constant forwarded to the update rule (float)
      s: Initial running average of squared gradients (np.array)
      gamma: Weight given to the previous running average (float)
      epochs: Number of update steps (int)
      optim: Update rule called with keywords W, dW, lr, decay, s and gamma
        that returns the pair (new W, new running average)

    Returns:
      history: The starting point followed by the point after each step (list)
  """
  history = [W]
  point, running = W, s

  for _ in range(epochs):
    grad = df_w(point)
    point, running = optim(W=point, dW=grad, lr=lr, decay=decay, s=running, gamma=gamma)
    history.append(point)

  return history

# Run RMSProp from the same starting point for 30 steps and for 2 steps.
results = train_RMSPRop(input, dW, lr = 0.3, gamma = 0.9, s = s, optim = RMSProp, epochs = 30, decay = 10**-6)
results2 = train_RMSPRop(input, dW, lr = 0.3, gamma = 0.9, s = s, optim = RMSProp, epochs = 2, decay = 10**-6)
print(results)
print(results2)

[array([-5, -2]), array([-4.05132145, -1.05131678]), array([-3.43519754, -0.59152343]), array([-2.95893693, -0.3294394 ]), array([-2.56546289, -0.17756482]), array([-2.22920552, -0.09163256]), array([-1.93626752, -0.04494499]), array([-1.67817686, -0.02081423]), array([-1.44934985, -0.00903559]), array([-1.24588199, -0.00364591]), array([-1.06490301, -0.00135351]), array([-9.04202260e-01, -4.56444431e-04]), array([-7.61996495e-01, -1.37562928e-04]), array([-6.36778499e-01, -3.62601019e-05]), array([-5.27215237e-01, -8.11337456e-06]), array([-4.32078505e-01, -1.47473412e-06]), array([-3.50198507e-01, -2.02783991e-07]), array([-2.80434649e-01, -1.84231187e-08]), array([-2.21659834e-01, -7.67742748e-10]), array([-1.72755512e-01,  7.80451998e-12]), array([-1.32615134e-01, -5.05794800e-13]), array([-1.00153779e-01,  6.19123501e-14]), array([-7.43217708e-02, -1.13373781e-14]), array([-5.41201278e-02,  2.80166702e-15]), array([-3.86159157e-02, -8.81341191e-16]), array([-2.69558066e-02,  3.399

# **4. Adam**

(t = epoch: the time step used for bias correction is the epoch index, starting from 1)

---



In [None]:
def compute_moment(beta_1, v, dW):
  """
    Update the first-moment estimate used by Adam.

    Computes beta_1 * v + (1 - beta_1) * dW.

    Args:
      beta_1: Weight given to the previous first moment (float)
      v: First moment from the previous step (np.array)
      dW: Contains the gradient of w_1 and w_2 (np.array)

    Returns:
      The updated first moment (np.array)
  """
  carried = beta_1 * v
  fresh = (1 - beta_1) * dW
  return carried + fresh

# Start from a zero first moment and apply one update with beta_1 = 0.9;
# `v` is printed afterwards next to the result and its type.
v = np.array([0, 0])
moment = compute_moment(beta_1 = 0.9, v = v, dW = dW)
print(moment)
print(type(moment))
print(v)

[-0.1 -0.8]
<class 'numpy.ndarray'>
[0 0]


In [None]:
def compute_squared_gradient(beta_2, s, dW):
  """
    Update the second-moment estimate used by Adam.

    Computes beta_2 * s + (1 - beta_2) * dW**2.

    Args:
      beta_2: Weight given to the previous second moment (float)
      s: Second moment from the previous step (np.array)
      dW: Contains the gradient of w_1 and w_2 (np.array)

    Returns:
      The updated second moment (np.array)
  """
  carried = beta_2 * s
  fresh = (1 - beta_2) * dW ** 2
  return carried + fresh

# Start from a zero second moment and apply one update with beta_2 = 0.999;
# `s` is printed afterwards next to the result and its type.
s = np.array([0, 0])
Squared_grad = compute_squared_gradient(beta_2 = 0.999, s = s, dW = dW)
print(Squared_grad)
print(type(Squared_grad))
print(s)

[0.001 0.064]
<class 'numpy.ndarray'>
[0 0]


In [None]:
def moment_correction(V_t, beta_1, epoch):
  """
    Bias-correct the first-moment estimate.

    Args:
      V_t: Current first-moment estimate (np.array)
      beta_1: Weight given to the previous first moment (float)
      epoch: Exponent applied to beta_1 (int)

    Returns:
      V_t divided by (1 - beta_1**epoch)
  """
  bias = 1 - beta_1 ** epoch
  return V_t / bias

def squared_gradient_correction(S_t, beta_2,epoch):
  """
    Bias-correct the second-moment estimate.

    Args:
      S_t: Current second-moment estimate (np.array)
      beta_2: Weight given to the previous second moment (float)
      epoch: Exponent applied to beta_2 (int)

    Returns:
      S_t divided by (1 - beta_2**epoch)
  """
  bias = 1 - beta_2 ** epoch
  return S_t / bias

In [None]:
def Adam(W, dW, lr, beta_1, beta_2, v, s, decay, epoch):
  """
    Take one Adam step.

    Both moment estimates are refreshed from the gradient and bias-corrected
    with `epoch`; the weights then move by lr times the corrected first
    moment divided by (sqrt(corrected second moment) + decay).

    Args:
      W: Contains w_1 and w_2 (np.array)
      dW: Contains the gradient of w_1 and w_2 (np.array)
      lr: Learning rate (float)
      beta_1: Weight given to the previous first moment (float)
      beta_2: Weight given to the previous second moment (float)
      v: First moment from the previous step (np.array)
      s: Second moment from the previous step (np.array)
      decay: Constant added to the denominator after the square root (float)
      epoch: Time step used for bias correction (int)

    Returns:
      W: Contains updated w_1 and w_2 (np.array)
      V_t: Updated, uncorrected first moment (np.array)
      S_t: Updated, uncorrected second moment (np.array)
  """
  V_t = compute_moment(beta_1, v, dW)
  S_t = compute_squared_gradient(beta_2, s, dW)

  v_hat = moment_correction(V_t, beta_1, epoch)
  s_hat = squared_gradient_correction(S_t, beta_2, epoch)

  step = lr * (v_hat / (np.sqrt(s_hat) + decay))
  return W - step, V_t, S_t


In [None]:
def train_Adam(W, dW, lr, beta_1, beta_2, v, s, decay, epochs, optim):
  """
    Minimise the target function with Adam and record every iterate.

    Before each step the gradient is re-evaluated with df_w at the current
    point; `optim` then produces the new point together with the updated
    first- and second-moment estimates.

    Args:
      W: Starting point, contains w_1 and w_2 (np.array)
      dW: Not read; kept for backward compatibility. The gradient is
        recomputed from the current W before every step.
      lr: Learning rate (float)
      beta_1: Weight given to the previous first moment (float)
      beta_2: Weight given to the previous second moment (float)
      v: Initial first-moment estimate (np.array)
      s: Initial second-moment estimate (np.array)
      decay: Constant forwarded to the update rule (float)
      epochs: Number of update steps (int)
      optim: Update rule called as
        optim(W, dW, lr, beta_1, beta_2, v, s, decay, epoch=t) and returning
        (W, v, s). Adam is used when the argument is not callable.

    Returns:
      results: The starting point followed by the point after each step (list)
      V_t_tracker: Initial v followed by the first moment after each step (list)
      S_t_tracker: Initial s followed by the second moment after each step (list)
  """
  step_fn = optim if callable(optim) else Adam

  results = [W]
  V_t_tracker = [v]
  S_t_tracker = [s]

  # The time step starts at 1 because the bias correction divides by
  # 1 - beta**t, which is zero for t = 0; the upper bound is inclusive so
  # that exactly `epochs` steps are taken.
  for t in range(1, epochs + 1):
    # The gradient depends on the current point, so it has to be refreshed
    # on every iteration rather than reused from the caller.
    dW = df_w(W)
    W, v, s = step_fn(W, dW, lr, beta_1, beta_2, v, s, decay, epoch=t)

    results.append(W)
    V_t_tracker.append(v)
    S_t_tracker.append(s)

  return results, V_t_tracker, S_t_tracker

In [None]:
# Run Adam from the starting point with lr = 0.2, beta_1 = 0.9, beta_2 = 0.999
# and zero initial moments; only the list of iterates is printed.
results, v_list, s_list = train_Adam(W = input, dW = dW, lr = 0.2, beta_1 = 0.9, beta_2 = 0.999, decay = 10**-6, s = s, v = v, epochs = 30, optim = Adam)
print(results)

[array([-5, -2]), array([-4.8000002 , -1.80000002]), array([-4.6000004 , -1.60000005]), array([-4.4000006 , -1.40000007]), array([-4.2000008, -1.2000001]), array([-4.000001  , -1.00000012]), array([-3.8000012 , -0.80000015]), array([-3.6000014 , -0.60000017]), array([-3.4000016, -0.4000002]), array([-3.2000018 , -0.20000022]), array([-3.00000200e+00, -2.49999972e-07]), array([-2.8000022 ,  0.19999973]), array([-2.6000024,  0.3999997]), array([-2.4000026 ,  0.59999968]), array([-2.2000028 ,  0.79999965]), array([-2.000003  ,  0.99999963]), array([-1.8000032,  1.1999996]), array([-1.6000034 ,  1.39999958]), array([-1.4000036 ,  1.59999955]), array([-1.2000038 ,  1.79999953]), array([-1.000004 ,  1.9999995]), array([-0.8000042 ,  2.19999948]), array([-0.6000044 ,  2.39999945]), array([-0.4000046 ,  2.59999943]), array([-0.2000048,  2.7999994]), array([-4.99999500e-06,  2.99999938e+00]), array([0.1999948 , 3.19999935]), array([0.3999946 , 3.39999933]), array([0.5999944, 3.5999993]), array(