In [1]:
# ======================================================
# MANUAL COMPUTATIONAL GRAPH IN PYTORCH (with gradients)
# ======================================================

import torch

# Render tensors with six decimals and never in scientific notation.
torch.set_printoptions(precision=6, sci_mode=False)


def _leaf(value):
    """Return a scalar leaf tensor whose gradient autograd will track."""
    return torch.tensor(value, requires_grad=True)


# ------------------------------------------------------
# Graph leaves: the scalar input plus every weight / bias
# ------------------------------------------------------
x = _leaf(2.0)

# "Layer 1": three ReLU units, each an affine function of x.
w00, b00 = _leaf(0.5), _leaf(0.1)
w01, b01 = _leaf(-1.2), _leaf(0.0)
w02, b02 = _leaf(0.7), _leaf(-0.3)

# "Layer 2": two sigmoid units. They also take x as their input, i.e. they
# form a branch parallel to layer 1 rather than being stacked on top of it.
# NOTE(review): confirm the parallel wiring is intended given the "Layer 2" name.
w10, b10 = _leaf(1.0), _leaf(0.2)
w11, b11 = _leaf(-0.5), _leaf(0.3)

# Output unit: a single affine map applied to the tanh of the summed branches.
w20, b20 = _leaf(0.9), _leaf(-0.1)

# ------------------------------------------------------
# Forward pass, printing every intermediate value
# ------------------------------------------------------
print("=== Forward pass (step-by-step) ===\n")
print(f"Input x = {x.item()}\n")

# Layer 1: all pre-activations first, then ReLU on each.
z00, z01, z02 = (w * x + b for w, b in ((w00, b00), (w01, b01), (w02, b02)))
a00, a01, a02 = z00.relu(), z01.relu(), z02.relu()
for idx, (pre, act) in enumerate(zip((z00, z01, z02), (a00, a01, a02))):
    print(f"Layer 1 - Neuron {idx}: z0{idx} =", pre.item(), "-> ReLU =", act.item())

# Layer 2: same affine form on x, squashed by a sigmoid.
z10, z11 = (w * x + b for w, b in ((w10, b10), (w11, b11)))
a10, a11 = z10.sigmoid(), z11.sigmoid()
print()  # blank separator line between the two layer reports
for idx, (pre, act) in enumerate(zip((z10, z11), (a10, a11))):
    print(f"Layer 2 - Neuron {idx}: z1{idx} =", pre.item(), "-> Sigmoid =", act.item())

# Add up both branches (left-to-right, so float rounding is unchanged),
# then squash the total with tanh.
sum_layer1 = a00 + a01 + a02
sum_layer2 = a10 + a11
combined_sum = sum_layer1 + sum_layer2
c = combined_sum.tanh()

print(f"\nSum of Layer 1 activations = {sum_layer1.item()}")
print(f"Sum of Layer 2 activations = {sum_layer2.item()}")
print(f"Combined sum = {combined_sum.item()} -> Tanh = {c.item()}")

# Linear output unit (no activation).
output = w20 * c + b20
print(f"\nOutput layer (linear): output = {output.item()}")

# ------------------------------------------------------
# Backward pass: let autograd fill .grad on every leaf
# ------------------------------------------------------

# Clear any gradient already stored on a leaf so backward() does not
# accumulate on top of it; leaves whose .grad is still None are skipped.
for t in (x, w00, b00, w01, b01, w02, b02, w10, b10, w11, b11, w20, b20):
    if t.grad is None:
        continue
    t.grad.zero_()

output.backward()  # reverse-mode autodiff from the scalar output

print("\n=== Gradients (after backward) ===")
for label, leaf in (("x", x), ("w00", w00), ("b00", b00), ("w20", w20)):
    print(f"d(output)/d({label}) =", leaf.grad.item())

# Gradient of the output w.r.t. each weight / bias, keyed by parameter name
# (the input x is deliberately not part of this table).
params = {
    name: leaf.grad.item()
    for name, leaf in (
        ('w00', w00), ('b00', b00),
        ('w01', w01), ('b01', b01),
        ('w02', w02), ('b02', b02),
        ('w10', w10), ('b10', b10),
        ('w11', w11), ('b11', b11),
        ('w20', w20), ('b20', b20),
    )
}
print("\nAll parameter gradients:")
for k, v in params.items():
    print(f" {k:>4}: {v:.6f}")


=== Forward pass (step-by-step) ===

Input x = 2.0

Layer 1 - Neuron 0: z00 = 1.100000023841858 -> ReLU = 1.100000023841858
Layer 1 - Neuron 1: z01 = -2.4000000953674316 -> ReLU = 0.0
Layer 1 - Neuron 2: z02 = 1.0999999046325684 -> ReLU = 1.0999999046325684

Layer 2 - Neuron 0: z10 = 2.200000047683716 -> Sigmoid = 0.9002495408058167
Layer 2 - Neuron 1: z11 = -0.699999988079071 -> Sigmoid = 0.3318122327327728

Sum of Layer 1 activations = 2.1999998092651367
Sum of Layer 2 activations = 1.232061743736267
Combined sum = 3.4320616722106934 -> Tanh = 0.9979130029678345

Output layer (linear): output = 0.7981216311454773

=== Gradients (after backward) ===
d(output)/d(x) = 0.004424192477017641
d(output)/d(w00) = 0.007505349349230528
d(output)/d(b00) = 0.003752674674615264
d(output)/d(w20) = 0.9979130029678345

All parameter gradients:
  w00: 0.007505
  b00: 0.003753
  w01: 0.000000
  b01: 0.000000
  w02: 0.007505
  b02: 0.003753
  w10: 0.000674
  b10: 0.000337
  w11: 0.001664
  b11: 0.000832