# General steps (simplified):
1. prepare the data
2. build the model
3. define the loss function and the optimizer
4. iterate over epochs; in each epoch:
    1. forward pass: call the model on the inputs to predict the y values
    2. backward pass: call loss.backward() to populate the gradients
    3. call optimizer.step() to update the weights
    4. call optimizer.zero_grad() to clear the gradients in preparation for the next epoch

In [29]:
import torch
# Problem dimensions: samples per batch, input features, hidden units, output features.
batch = 128
dim_in = 2000
dim_h = 200
dim_out = 20

In [30]:
# Prepare data: random inputs of shape (batch, dim_in) and random regression
# targets of shape (batch, dim_out), both drawn with torch.randn.
input_X = torch.randn((batch, dim_in))
output_Y = torch.randn((batch, dim_out))

In [31]:
# Build the model: two fully connected layers (dim_in -> dim_h -> dim_out)
# with a ReLU activation between them.
Adam_model = torch.nn.Sequential(
    torch.nn.Linear(in_features=dim_in, out_features=dim_h),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=dim_h, out_features=dim_out),
)

In [32]:
# Loss function: squared error, summed (not averaged) over all elements.
loss_fn = torch.nn.MSELoss(reduction="sum")

In [33]:
# Optimizer: Adam over all parameters of Adam_model, using a fixed learning rate.
rate_learning = 1e-4
optimizer = torch.optim.Adam(params=Adam_model.parameters(), lr=rate_learning)

In [34]:
# Training loop. Every epoch performs the same four steps:
#    1. forward pass: run the model on the inputs to predict y
#    2. backward pass: loss.backward() populates the gradients
#    3. optimizer.step() updates the weights
#    4. optimizer.zero_grad() clears the gradients before the next epoch

for epoch in range(500):

  # 1. Forward pass, then measure the error against the targets.
  pred_y = Adam_model(input_X)
  loss = loss_fn(pred_y, output_Y)

  # 2. Backward pass: backpropagation computes the gradient of the loss with
  #    respect to each parameter and adds it to that parameter's .grad, so
  #    repeated .backward() calls accumulate gradients.
  loss.backward()

  # 3. Single optimization step: update the parameters from the gradients
  #    currently stored in their .grad attributes, following Adam's update rule.
  optimizer.step()

  # 4. Reset the gradients of the parameters the optimizer manages; without
  #    this, the next loss.backward() would add onto the old gradients.
  optimizer.zero_grad()

  # Report only on epochs 9, 109, 209, ...; the printed loss was computed
  # before this epoch's weight update.
  if epoch % 100 != 9:
    continue
  print(epoch, loss.item())

9 1963.87060546875
109 21.832597732543945
209 0.00417353305965662
309 1.0652828450474772e-06
409 4.0545145019166284e-10


# Miscellaneous

In [6]:
# This cell demonstrates gradient accumulation: calling .backward() multiple
# times without clearing the gradients adds each new gradient onto x.grad.

import torch

# Input tensor whose gradient is tracked.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
# Fixed targets; they are constants, so they do not need a gradient.
output_Y = torch.tensor([2.0, 5.0, 10.0])
loss_func = torch.nn.MSELoss(reduction='sum')
for i in range(5):
  # Forward computation: loss = sum((x**2 - output_Y)**2),
  # hence d(loss)/dx = 4 * x * (x**2 - output_Y) = [-4, -8, -12] here.
  y = x ** 2
  loss = loss_func(y, output_Y)

  # Compute gradients with respect to x. No optimizer step and no zero_grad()
  # is performed, so x never changes: every pass yields the same loss and the
  # same gradient, which is added onto the value already stored in x.grad.
  loss.backward()

  # x.grad grows by [-4, -8, -12] each iteration while the loss stays at 3.0.
  print(x.grad, loss.item())

tensor([ -4.,  -8., -12.]) 3.0
tensor([ -8., -16., -24.]) 3.0
tensor([-12., -24., -36.]) 3.0
tensor([-16., -32., -48.]) 3.0
tensor([-20., -40., -60.]) 3.0
