In [20]:
import autograd.numpy as np
from autograd import elementwise_grad as egrad

In [21]:
def softmax(z):
    """Compute softmax values for each set of scores in the rows of the matrix z.
    Used with batched input data."""
    e_z = np.exp(z - np.max(z, axis=0))
    return e_z / np.sum(e_z, axis=1)[:, np.newaxis]

def softmax_der(z):
    s = z.reshape(-1, 1)
    return np.diagflat(s) - np.dot(s, s.T)

In [22]:
x = np.random.rand(10, 2)
target = np.random.rand(10, 3)

W = np.random.randn(3, 2)
b = np.random.randn(3)

z = np.dot(x, W.T) + b
a = softmax(z)

print(z.shape)

predict = a

cost_autograd = egrad(softmax)
print(cost_autograd(z))

(10, 3)
[[ 1.89777488e-17  7.43429741e-17  4.27586827e-17]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-2.46661134e-17 -7.49693868e-17 -4.75161199e-17]
 [ 0.00000000e+00 -7.34730771e-17 -6.41281258e-17]
 [-6.52933304e-17  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 7.09816949e-17  7.40994898e-17  6.88855630e-17]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]]


In [18]:
import autograd.numpy as np
from autograd import elementwise_grad as egrad

# Define the sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Compute the elementwise derivative of the sigmoid function
sigmoid_derivative = egrad(sigmoid)

# Example usage
z = np.array([1.0, 0.5, -1.0, -2.0])

print("Sigmoid output:")
print(sigmoid(z))
print("\nSigmoid derivative:")
print(sigmoid_derivative(z))


Sigmoid output:
[0.73105858 0.62245933 0.26894142 0.11920292]

Sigmoid derivative:
[0.19661193 0.23500371 0.19661193 0.10499359]


In [34]:
A = np.random.randn(10,4)

A*1

array([[ 1.21493917, -0.28186989,  1.01441017, -0.47070382],
       [-0.73284647,  0.67150707, -0.79297654,  0.80297338],
       [ 0.33264356,  0.23206387,  1.15950692, -0.37018466],
       [-0.18122367,  0.33136183,  1.09711807, -0.51060525],
       [ 0.0333128 , -0.42454234,  0.64482676,  0.04162912],
       [ 0.59291351, -0.64410062,  1.30468255,  2.10387902],
       [ 0.45163839, -1.69835   ,  0.50759397, -0.70513188],
       [ 0.17985967,  0.33729287, -0.04375559,  0.37530175],
       [ 0.64510415, -0.83809014,  0.81016584, -0.70867628],
       [ 0.99644821,  0.15172525, -1.0423762 , -0.99755834]])

In [37]:
import autograd.numpy as np
from autograd import grad, jacobian
from autograd import elementwise_grad as egrad

# Define the softmax function
def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Subtract max(x) for numerical stability
    return exp_x / np.sum(exp_x)

# Example input
x = np.random.rand(10,4)

# Compute softmax
softmax_output = softmax(x)
print("Softmax output:", softmax_output)

# Compute the gradient of the softmax function at x
# Note: usually we want the Jacobian for elementwise derivatives in the case of softmax
softmax_jacobian = egrad(softmax)(x)
print("Softmax Jacobian:\n", softmax_jacobian)


Softmax output: [[0.02366074 0.01681857 0.03601313 0.01935889]
 [0.02618726 0.01881051 0.02927465 0.02583389]
 [0.01652932 0.02654158 0.01496591 0.01720058]
 [0.02370359 0.0275912  0.01659028 0.03136642]
 [0.03339364 0.03433433 0.02738851 0.03332417]
 [0.02250676 0.02568864 0.01546628 0.01911602]
 [0.02784559 0.03048616 0.01745993 0.01946786]
 [0.02049598 0.03593386 0.01733864 0.02980971]
 [0.0321476  0.03335605 0.03334036 0.02448517]
 [0.03619528 0.02157439 0.0208496  0.01754895]]
Softmax Jacobian:
 [[-4.53593325e-18 -3.22423992e-18 -6.90397450e-18 -3.71123685e-18]
 [-5.02028490e-18 -3.60610820e-18 -5.61215860e-18 -4.95254161e-18]
 [-3.16878786e-18 -5.08821054e-18 -2.86907222e-18 -3.29747417e-18]
 [-4.54414631e-18 -5.28942906e-18 -3.18047506e-18 -6.01316640e-18]
 [-6.40179883e-18 -6.58213623e-18 -5.25057239e-18 -6.38848179e-18]
 [-4.31470652e-18 -4.92469493e-18 -2.96499648e-18 -3.66467818e-18]
 [-5.33819785e-18 -5.84441330e-18 -3.34719304e-18 -3.73212729e-18]
 [-3.92922490e-18 -6.8887

In [None]:
def mse_l2_der(predict, target, weights, lambda_reg):
    # Gradient of the MSE term
    mse_gradient = 2 / predict.size * (predict - target)
    
    # Gradient of the L2 regularization term
    l2_gradient = 2 * lambda_reg * weights
    
    # Sum of the gradients
    total_gradient = mse_gradient + l2_gradient

    return total_gradient

In [38]:
z = np.random.randn(10, 4)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_der_grad(z):
    return egrad(sigmoid)(z)

def sigmoid_der(z):
    return sigmoid(z)*(1-sigmoid(z))

print(sigmoid_der(z))
print(sigmoid_der_grad(z))

[[0.22609278 0.24855947 0.13398122 0.24814066]
 [0.22149882 0.20714666 0.23951874 0.21609952]
 [0.15288849 0.19921089 0.2153663  0.24983543]
 [0.22703886 0.23332504 0.24410694 0.08253783]
 [0.19842147 0.23966594 0.24709208 0.24857436]
 [0.20111794 0.24851464 0.21883296 0.2082233 ]
 [0.24063356 0.21745923 0.21240246 0.17108946]
 [0.24196704 0.09895118 0.23891164 0.16254692]
 [0.24999344 0.2499343  0.11564915 0.23774778]
 [0.10947212 0.19541126 0.22512821 0.19828641]]
[[0.22609278 0.24855947 0.13398122 0.24814066]
 [0.22149882 0.20714666 0.23951874 0.21609952]
 [0.15288849 0.19921089 0.2153663  0.24983543]
 [0.22703886 0.23332504 0.24410694 0.08253783]
 [0.19842147 0.23966594 0.24709208 0.24857436]
 [0.20111794 0.24851464 0.21883296 0.2082233 ]
 [0.24063356 0.21745923 0.21240246 0.17108946]
 [0.24196704 0.09895118 0.23891164 0.16254692]
 [0.24999344 0.2499343  0.11564915 0.23774778]
 [0.10947212 0.19541126 0.22512821 0.19828641]]


In [39]:
def ReLU(z):
    return np.where(z > 0, z, 0)

def ReLU_der_grad(z):
    return egrad(ReLU)(z)

def ReLU_der(z):
    return np.where(z > 0, 1, 0)

print(ReLU_der(z))
print(ReLU_der_grad(z))

[[0 1 1 1]
 [0 0 0 1]
 [1 1 1 0]
 [0 0 0 0]
 [1 0 0 0]
 [1 0 0 1]
 [1 1 0 1]
 [0 0 0 1]
 [0 1 0 0]
 [0 1 0 0]]
[[0. 1. 1. 1.]
 [0. 0. 0. 1.]
 [1. 1. 1. 0.]
 [0. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 1.]
 [1. 1. 0. 1.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]]


In [64]:
z = np.random.randn(10, 4)

def softmax(z):
    exp_z = np.exp(z)  # Subtract max(x) for numerical stability
    return exp_z / np.sum(exp_z)

def softmax_der_grad(z):
    return egrad(softmax)(z)

print(z)
print(softmax_der_grad(z))

[[-1.28512355  0.83136969  0.21450072 -1.02920839]
 [-1.20814071  0.80586152 -0.22971024 -0.17018107]
 [ 1.84936444 -0.30434673 -0.16187406 -0.6272463 ]
 [-0.64385156  2.20910342  1.10462189  1.09180923]
 [-1.49620558  1.75652712  0.69148958 -0.36963882]
 [-0.3725202  -1.07360897  1.31449227 -1.68701463]
 [-0.74932253 -0.04800822  0.93336573  0.46084589]
 [-0.54594978 -0.49846628 -0.16373159 -0.03328697]
 [ 0.5375391   0.26045013 -0.3841128   0.05669642]
 [-0.00555607  0.80616488 -0.59918752  3.13231964]]
[[-4.79852974e-19 -3.98372661e-18 -2.14974395e-18 -6.19798802e-19]
 [-5.18252506e-19 -3.88339411e-18 -1.37869552e-18 -1.46326018e-18]
 [-1.10255186e-17 -1.27954083e-18 -1.47546614e-18 -9.26447150e-19]
 [-9.11190272e-19 -1.57990757e-17 -5.23553958e-18 -5.16888631e-18]
 [-3.88540621e-19 -1.00480154e-17 -3.46370077e-18 -1.19866592e-18]
 [-1.19521708e-18 -5.92881373e-19 -6.45813316e-18 -3.21047366e-19]
 [-8.19980674e-19 -1.65340996e-18 -4.41149550e-18 -2.75025576e-18]
 [-1.00491025e-18 -1

In [65]:
np.random.randn(2, 3, 4)

array([[[-0.61911191, -0.84569969, -0.79487727,  0.4855416 ],
        [-0.14616863, -0.77348166,  0.34095362,  0.16378469],
        [-0.84814776,  0.10529887, -0.62060467, -1.2032186 ]],

       [[-2.71992471, -0.15978696,  0.54406951, -0.72470892],
        [-2.06313436,  0.40859718,  0.50470003, -0.34433452],
        [-0.01781458,  0.48749794, -0.03037353,  1.24603728]]])