In [None]:
import torch
import torch.optim as optim

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

# Fixed seeds so that random draws are repeatable between runs.
torch.manual_seed(777)
if device == 'cuda':
  # also seed the generator of every CUDA device
  torch.cuda.manual_seed_all(777)

In [None]:
# XOR truth table: the four input pairs (X) and their expected outputs (Y),
# stored as float32 tensors on the selected device.
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32, device=device)
Y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32, device=device)

In [None]:
# nn Layers
# Parameters are drawn from a standard normal distribution.
# (torch.Tensor(2, 2) would only allocate memory without initializing it, so the
# starting values would be arbitrary -- possibly NaN/inf -- and independent of
# the seed.) Sampling happens on the CPU before .to(device), so the CPU seed set
# earlier in the notebook determines the values on either device.
w1 = torch.randn(2, 2).to(device)   # input (2) -> hidden (2) weights
w2 = torch.randn(2, 1).to(device)   # hidden (2) -> output (1) weights
b1 = torch.randn(2).to(device)      # hidden-layer bias
b2 = torch.randn(1).to(device)      # output-layer bias

# Sigmoid Function
def sigmoid(x):
  """Return the element-wise logistic function 1 / (1 + exp(-x)) of tensor x."""
  denominator = torch.exp(-x) + 1.0
  return 1.0 / denominator

# Derivative of Sigmoid Function
def sigmoid_prime(x):
  """Return the element-wise derivative of sigmoid at x.

  Uses the identity sigmoid'(x) = s * (1 - s) with s = sigmoid(x).
  """
  s = sigmoid(x)  # evaluate once and reuse for both factors
  return s * (1 - s)

In [None]:
# Learning rate: factor that scales every gradient in the weight-update step
# of the training loop.
learning_rate = 0.5

In [None]:
# Train the 2-2-1 sigmoid network on (X, Y) with hand-written backpropagation.
# Every gradient is derived with the chain rule; autograd is not used.
# NOTE: w1/b1/w2/b2 are rebound on every step, so re-running this cell resumes
# from the already-updated parameters instead of starting over.
eps = 1e-7  # keeps log() and the division below away from exactly 0

for step in range(10001):
  # forward()
  l1 = torch.add(torch.matmul(X, w1), b1)
  a1 = sigmoid(l1)
  l2 = torch.add(torch.matmul(a1, w2), b2)
  Y_pred = sigmoid(l2)

  # Binary Cross Entropy
  # The prediction is clamped for the logarithms only: a saturated sigmoid
  # returns exactly 0 or 1, and 0 * log(0) would turn the reported cost into NaN.
  Y_safe = torch.clamp(Y_pred, eps, 1.0 - eps)
  cost = -torch.mean(Y * torch.log(Y_safe) + (1 - Y) * torch.log(1 - Y_safe))

  # backpropagation (chain rule)
  # Loss derivative w.r.t. the prediction; eps prevents division by zero
  d_Y_pred = (Y_pred - Y) / (Y_pred * (1.0 - Y_pred) + eps)

  # Layer 2
  d_l2 = d_Y_pred * sigmoid_prime(l2)
  d_b2 = d_l2
  # torch.transpose(t, 0, 1) swaps dimensions 0 and 1 of t,
  # e.g. a 10 x 5 tensor becomes a 5 x 10 tensor.
  d_w2 = torch.matmul(torch.transpose(a1, 0, 1), d_b2)

  # Layer 1
  d_a1 = torch.matmul(d_b2, torch.transpose(w2, 0, 1))
  d_l1 = d_a1 * sigmoid_prime(l1)
  d_b1 = d_l1
  d_w1 = torch.matmul(torch.transpose(X, 0, 1), d_b1)

  # weight update (gradient descent; use + instead of - for gradient ascent)
  # NOTE(review): the matmuls above sum the weight gradients over the samples,
  # while the bias gradients are averaged with torch.mean, so the two are scaled
  # differently -- left unchanged here; confirm this is intended.
  w1 = w1 - learning_rate * d_w1
  b1 = b1 - learning_rate * torch.mean(d_b1, 0)
  w2 = w2 - learning_rate * d_w2
  b2 = b2 - learning_rate * torch.mean(d_b2, 0)

  if step % 100 == 0:
    print(step, cost.item())