In [102]:
import random
import math

In [103]:
# Define the sigmoid function
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + e^(-x)) of a real number.

    The value is computed in a form that never passes a positive argument
    to math.exp, so inputs of large magnitude saturate to 0.0 or 1.0
    instead of raising OverflowError.

    Args:
        x: A real number (int or float).

    Returns:
        A float in the closed interval [0.0, 1.0].
    """
    if x >= 0:
        # exp(-x) <= 1 here, so the textbook formula cannot overflow.
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) may overflow; exp(x) <= 1 is always safe and
    # exp(x) / (1 + exp(x)) is algebraically the same quantity.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [104]:
# Problem dimension: the weight matrices A, B, C are built as K x K and the
# input vector x is built with K entries in the cells below.
K = 3

In [105]:
# Build the three K x K weight matrices (each a list of K rows of K floats).
# Every entry is an independent random.random() draw; the matrices are
# filled one after another in the order A, B, C.
A, B, C = (
    [[random.random() for _col in range(K)] for _row in range(K)]
    for _matrix in range(3)
)
print("A:", A)
print("B:", B)
print("C:", C)

A: [[0.8398498040460971, 0.3041809126703864, 0.423400014612931], [0.75829482384759, 0.6198137716104942, 0.266505019491165], [0.48248312349313216, 0.5801489941487755, 0.9794073063196495]]
B: [[0.32947973954781484, 0.06653397243383918, 0.5466334362443342], [0.260879676811679, 0.9464011022017689, 0.25929303196883], [0.6602652197959822, 0.8015934833696401, 0.3451858452102099]]
C: [[0.8710006119005334, 0.6971948493063583, 0.3888061955552716], [0.014833275024055959, 0.2638647492326325, 0.7988204444745216], [0.848068322108391, 0.8879014149258482, 0.17142299209922907]]


In [106]:
# Input vector x: K independent random.random() draws
x = [random.random() for _component in range(K)]
print("x:", x)

x: [0.49018457619705547, 0.5881632037617703, 0.6883674131063857]


In [107]:
# Forward pass through the computation graph:
#   y = A x,  u = sigmoid(y),  v = B x,  z = u + v,  w = C z,  L = sum_i w_i^2
y = [sum(a_ij * x_j for a_ij, x_j in zip(a_row, x)) for a_row in A]
u = list(map(sigmoid, y))
v = [sum(b_ij * x_j for b_ij, x_j in zip(b_row, x)) for b_row in B]
z = [u[i] + v[i] for i in range(K)]
w = [sum(c_ij * z_j for c_ij, z_j in zip(c_row, z)) for c_row in C]
# Scalar loss: sum of the squared components of w
L = sum(map(lambda w_i: w_i ** 2, w))

In [110]:
# Backward pass: chain rule from the loss L back to the weights A, B and C.
# L = sum_i w_i^2   ->  dL/dw_i = 2 * w_i
dL_dw = [2 * w_i for w_i in w]
# w = C z           ->  dL/dz = C^T (dL/dw), i.e. column i of C dotted with dL/dw
dL_dz = [sum(c_row[i] * g_w for c_row, g_w in zip(C, dL_dw)) for i in range(K)]
# z = u + v         ->  the gradient flows unchanged into both summands
dL_du = dL_dv = dL_dz
# u = sigmoid(y)    ->  dL/dy_i = dL/du_i * u_i * (1 - u_i)
dL_dy = [g_u * u_i * (1 - u_i) for g_u, u_i in zip(dL_du, u)]
# y = A x, v = B x, w = C z: each weight gradient is the outer product of the
# upstream gradient with the vector the matrix was multiplied by.
dL_dA = [[g_y * x_j for x_j in x] for g_y in dL_dy]
dL_dB = [[g_v * x_j for x_j in x] for g_v in dL_dv]
dL_dC = [[g_w * z_j for z_j in z] for g_w in dL_dw]

In [111]:
# Show each hand-computed gradient matrix under its heading, one row per line
for heading, gradient in (
    ("Gradients ∂L/∂A:", dL_dA),
    ("Gradients ∂L/∂B:", dL_dB),
    ("Gradients ∂L/∂C:", dL_dC),
):
    print(heading)
    for row in gradient:
        print(row)

Gradients ∂L/∂A:
[1.0044650439867402, 1.2052386122007435, 1.4105727463913171]
[1.0030341434791714, 1.2035217017395343, 1.4085633292683846]
[0.5288232865647174, 0.6345250616875721, 0.7426278496707178]
Gradients ∂L/∂B:
[4.85133414423115, 5.821024102649029, 6.8127405413029365]
[4.922085409559319, 5.905917208034879, 6.912096718247142]
[3.058188541355181, 3.6694626015077043, 4.294621734809499]
Gradients ∂L/∂C:
[7.506121133163198, 9.223535575378872, 10.58180747689005]
[4.832538234252468, 5.938231948599319, 6.8127050326484495]
[7.192631683253187, 8.838319157677333, 10.139863502614492]


In [112]:
import torch
import numpy as np

In [113]:
# Copy the weight matrices into tensors that record gradients, so autograd
# can later populate their .grad attributes.
A_torch, B_torch, C_torch = (
    torch.tensor(matrix, requires_grad=True) for matrix in (A, B, C)
)

In [114]:
# Input vector as a tensor. It is a constant of the problem, so it is left
# with torch.tensor's default of not tracking gradients.
x_torch = torch.tensor(x)

In [115]:
# Forward pass expressed with tensor ops so autograd records the graph:
#   y = A x,  u = sigmoid(y),  v = B x,  z = u + v,  w = C z
y_torch = torch.matmul(A_torch, x_torch)
u_torch = torch.sigmoid(y_torch)
v_torch = torch.matmul(B_torch, x_torch)
z_torch = u_torch + v_torch
w_torch = torch.matmul(C_torch, z_torch)
# Loss L = sum_i w_i^2, written as a direct sum of squares to mirror the
# hand-written forward pass. This avoids the deprecated torch.norm and the
# needless sqrt-then-square round trip it implied.
L_torch = torch.sum(w_torch ** 2)

In [116]:
# Let autograd back-propagate from the scalar loss to every tensor that was
# created with requires_grad=True.
L_torch.backward()

# After backward(), each weight tensor's gradient is available on .grad
dL_dA_torch, dL_dB_torch, dL_dC_torch = (
    weight.grad for weight in (A_torch, B_torch, C_torch)
)

In [117]:
# Show the autograd gradients under the same headings as the manual ones
for heading, gradient in (
    ("Gradients ∂L/∂A:", dL_dA_torch),
    ("Gradients ∂L/∂B:", dL_dB_torch),
    ("Gradients ∂L/∂C:", dL_dC_torch),
):
    print(heading)
    print(gradient)

Gradients ∂L/∂A:
tensor([[1.0045, 1.2052, 1.4106],
        [1.0030, 1.2035, 1.4086],
        [0.5288, 0.6345, 0.7426]])
Gradients ∂L/∂B:
tensor([[4.8513, 5.8210, 6.8127],
        [4.9221, 5.9059, 6.9121],
        [3.0582, 3.6695, 4.2946]])
Gradients ∂L/∂C:
tensor([[ 7.5061,  9.2235, 10.5818],
        [ 4.8325,  5.9382,  6.8127],
        [ 7.1926,  8.8383, 10.1399]])


In [118]:
# Check the hand-derived gradients (dL_dA, dL_dB, dL_dC) against the autograd
# ones. np.allclose compares element-wise within its default tolerances.
is_equal_A, is_equal_B, is_equal_C = (
    np.allclose(manual, automatic.numpy())
    for manual, automatic in (
        (dL_dA, dL_dA_torch),
        (dL_dB, dL_dB_torch),
        (dL_dC, dL_dC_torch),
    )
)
for label, matches in (("A", is_equal_A), ("B", is_equal_B), ("C", is_equal_C)):
    print("Gradients (" + label + "):", matches)


Gradients (A): True
Gradients (B): True
Gradients (C): True


In [119]:
# One step of plain gradient descent: weight <- weight - learning_rate * gradient.
# NOTE: A, B and C are overwritten here, so running this cell again applies a
# further step using the same (now stale) gradients.
learning_rate = 0.01
A = [
    [a_ij - learning_rate * g_ij for a_ij, g_ij in zip(a_row, g_row)]
    for a_row, g_row in zip(A, dL_dA)
]
B = [
    [b_ij - learning_rate * g_ij for b_ij, g_ij in zip(b_row, g_row)]
    for b_row, g_row in zip(B, dL_dB)
]
C = [
    [c_ij - learning_rate * g_ij for c_ij, g_ij in zip(c_row, g_row)]
    for c_row, g_row in zip(C, dL_dC)
]

# Show the updated matrices, one row per line
for heading, matrix in (("Updated A:", A), ("Updated B:", B), ("Updated C:", C)):
    print(heading)
    for row in matrix:
        print(row)

Updated A:
[0.8298051536062298, 0.29212852654837895, 0.4092942871490178]
[0.7482644824127982, 0.6077785545930989, 0.25241938619848114]
[0.477194890627485, 0.5738037435318998, 0.9719810278229423]
Updated B:
[0.28096639810550333, 0.008323731407348892, 0.4785060308313049]
[0.21165882271608583, 0.88734193012142, 0.19017206478635862]
[0.6296833343824304, 0.7648988573545631, 0.3022396278621149]
Updated C:
[0.7959394005689013, 0.6049594935525696, 0.28298812078637114]
[-0.03349210731846872, 0.20448242974663927, 0.7306933941480371]
[0.7761420052758592, 0.7995182233490749, 0.07002435707308415]
