In [61]:
# Goal: Implement a NN that adds two bits (a0 and b0) and outputs the sum (a0+b0 = c1c0)

import torch
import torch.nn as nn

# Define the neural network model
class AdderNN(nn.Module):
  """Tiny multilayer perceptron that learns to add two bits.

  Architecture: Linear(2 -> 3) -> ReLU -> Linear(3 -> 2).
  The input is a batch of rows [a, b]; the output is a batch of two raw
  (un-thresholded) real values per row, one for each bit of the sum.
  """

  def __init__(self):
    super().__init__()
    # Layers are registered in this order: fc1, fc2, relu.
    self.fc1 = nn.Linear(2, 3)   # 2 input bits -> 3 hidden neurons
    self.fc2 = nn.Linear(3, 2)   # 3 hidden neurons -> 2 output values (sum bits)
    self.relu = torch.nn.ReLU()  # non-linearity applied to the hidden layer

  def forward(self, x):
    """Run the network: this method IS the definition of the model's function."""
    pre_activation = self.fc1(x)          # affine map of the input
    hidden = self.relu(pre_activation)    # hidden activations
    return self.fc2(hidden)               # final output (no activation)


In [62]:
# prompt: create a binary addition dataset where x = [a, b], y = [d, c] where (dc)2 = (a)2 + (b)2

def generate_binary_addition_dataset(num_samples):
  """Generates a random dataset for 1-bit + 1-bit binary addition.

  Each sample draws two independent bits a and b uniformly from {0, 1}.
  The label is the 2-bit binary representation of a + b, stored as [d, c]
  where d is the high (carry) bit and c is the low bit, i.e. (dc)_2 = a + b.

  Args:
    num_samples: Number of (a, b) pairs to draw. Must be >= 0.

  Returns:
    A tuple (x, y) of float32 tensors, both of shape (num_samples, 2):
    x[i] = [a, b] and y[i] = [d, c]. The shape is (0, 2) when
    num_samples is 0, so downstream code can always index the bit axis.

  Raises:
    ValueError: If num_samples is negative.
  """
  if num_samples < 0:
    raise ValueError(f"num_samples must be non-negative, got {num_samples}")

  # Draw every bit in one call instead of two .item() round-trips per sample.
  bits = torch.randint(0, 2, (num_samples, 2))
  a, b = bits[:, 0], bits[:, 1]

  # Half-adder logic: the carry bit is AND, the low bit is XOR.
  d = a & b
  c = a ^ b

  x_data = bits.to(torch.float32)
  y_data = torch.stack([d, c], dim=1).to(torch.float32)
  return x_data, y_data

# Generate the dataset
# There are only 4 distinct (a, b) inputs, so with 1000 uniform draws each
# case is expected to appear roughly 250 times.
NUM_TRAIN_SAMPLES = 1000
x_train, y_train = generate_binary_addition_dataset(NUM_TRAIN_SAMPLES)

In [63]:
import torch.optim as optim

# Create the model
model = AdderNN()

# Define the optimizer (SGD)
optimizer = optim.SGD(model.parameters(), lr=0.1) # GD, SGD, minibatch SGD, depending on how we measure the loss
# SGD: w_next = w_now - lr * gradient
# gradient = computed using backpropagation (aka dynamic programming for efficient gradient compute)
# Here the loss is measured on the whole of x_train at every step, so this is full-batch GD.

# Define the loss function (mean squared error loss)
criterion = nn.MSELoss()

# Training loop
num_iterations = 1000
# The loop variable is named `epoch`, not `iter`: in a notebook the variable
# outlives the cell, and `iter` would shadow the builtin iter() for all later cells.
for epoch in range(num_iterations):
  # Forward pass
  outputs = model(x_train)
  # F(x), F is the neural net, x is the training input
  loss = criterion(outputs, y_train)
  # L(F(x), y), L is the loss, y is the training label

  # Backward pass and optimization
  optimizer.zero_grad() # [very important]
  # for each parameter, they have two memory slots
  # w, dL/dw
  # dL/dw is not reset unless you say so
  # optimizer.zero_grad() => it cleans up the gradient slots of
  # all the variables specified in the optimizer
  # After this, it becomes dL/dw = 0 for all w's in the given list
  loss.backward() # one line magic call to run the backpropagation step
  # After this, dL/dw = populated by backprop
  optimizer.step()
  # Executes one step of SGD: w_next = w_now - lr * gradient
  # After this, w is updated as shown above
  # dL/dw stays there

  if (epoch + 1) % 100 == 0:
    print(f'Epoch [{epoch+1}/{num_iterations}], Loss: {loss.item():.4f}')

# Test the model
# NOTE: this evaluates on x_train itself, so the number below is training accuracy.
with torch.no_grad():
  outputs = model(x_train)
  # Threshold each raw output at 0.5 to obtain a predicted bit.
  predicted = (outputs > 0.5).float()
  # A sample counts as correct only if BOTH output bits match the label.
  accuracy = (predicted == y_train).all(dim=1).float().mean()
  print(f'Accuracy: {accuracy.item():.4f}')

Epoch [100/1000], Loss: 0.1303
Epoch [200/1000], Loss: 0.1008
Epoch [300/1000], Loss: 0.0863
Epoch [400/1000], Loss: 0.0703
Epoch [500/1000], Loss: 0.0565
Epoch [600/1000], Loss: 0.0494
Epoch [700/1000], Loss: 0.0457
Epoch [800/1000], Loss: 0.0434
Epoch [900/1000], Loss: 0.0420
Epoch [1000/1000], Loss: 0.0410
Accuracy: 1.0000


In [64]:
# Evaluate all four possible inputs in a single batch (batch size = 4, one row per (a, b) pair).
# Float literals give the default floating dtype, matching the legacy torch.Tensor(...) constructor.
x = torch.tensor([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
model(x) # raw outputs [d, c] per row; threshold at 0.5 to read the bits (all correct in the recorded run)

tensor([[ 0.2950,  0.3536],
        [-0.1639,  0.7644],
        [ 0.0182,  1.0013],
        [ 0.8698, -0.1142]], grad_fn=<AddmmBackward0>)

In [65]:
# prompt: convert model.parameters()  to list

# model.parameters() is a generator; unpack it into a concrete list so the
# learned weight and bias tensors can be indexed and printed.
parameter_list = [*model.parameters()]
print(parameter_list)

[Parameter containing:
tensor([[ 1.3319, -1.1752],
        [ 1.1975, -0.4550],
        [ 0.1733, -0.6872]], requires_grad=True), Parameter containing:
tensor([-0.0006,  0.4568, -0.1736], requires_grad=True), Parameter containing:
tensor([[-1.1151,  1.0085,  0.1972],
        [ 1.2988, -0.9030, -0.2400]], requires_grad=True), Parameter containing:
tensor([-0.1657,  0.7661], requires_grad=True)]


In [66]:
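# Backpropagation by hand for a chain of three scalar "layers".
# (These are notes, not code: kept as comments so the cell runs without a SyntaxError.)
#
# Consider fi_{wi}(x) = wi * x
# Then, dfi/dwi = x (the layer's input), and dfi/dx = wi
#
# f = f1_{w1}(f2_{w2}(f3_{w3}(x)))
# l = (f(x) - y)^2
# l = (f1(f2(f3(x))) - y)^2
# dl/df1 = 2(f1 - y)
#
# Gradient to compute: [dl/dw1, dl/dw2, dl/dw3]
#
# dl/dw1 = dl/df1 * df1/dw1                       = 2(f1 - y) * f2
# dl/dw2 = dl/df1 * df1/df2 * df2/dw2             = 2(f1 - y) * w1 * f3
# dl/dw3 = dl/df1 * df1/df2 * df2/df3 * df3/dw3   = 2(f1 - y) * w1 * w2 * x
#
# (On the right-hand sides, f1, f2, f3 denote the outputs of the corresponding layers.)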

SyntaxError: invalid syntax (936715431.py, line 4)