In [1]:
import torch
import random
import numpy as np
import os

SEED = 42


def seed_everything(SEED=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    The parameter shadows the module-level ``SEED`` constant; its default is
    the literal 42, not a lookup of that constant.

    Only the three RNGs below are touched. ``PYTHONHASHSEED`` and the cuDNN
    determinism/benchmark switches are intentionally left alone.

    Args:
        SEED: integer seed handed to every generator.
    """
    # Same order as always: stdlib ``random`` -> NumPy -> PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(SEED)

# Gradient

In [68]:
# Scalar linear form f = w^T x: check that autograd reproduces the textbook
# gradients df/dw = x and df/dx = w.
seed_everything()
VECTOR_LENGTH = (5, 1)

x = torch.rand(VECTOR_LENGTH, requires_grad=True)
w = torch.rand(VECTOR_LENGTH, requires_grad=True)
f = torch.matmul(w.T, x)  # shape (1, 1): a single element, so backward() needs no grad argument

f.backward()  # fills both w.grad and x.grad

print('x:\n', x)
print('w:\n', w)
print()
print('f = wTx: ', f.item())
# torch.equal checks shape AND values in one call. The builtin all() over a
# 2-D tensor iterates rows and only works when every row has a single element.
# Exact equality is legitimate here: the gradient is literally a copy of the
# other operand, no arithmetic is involved.
print('df/dw = [df/dw1, df/dw2, ... , df/dwn]^T = x with shape {dim:}: {prof:}'.format(dim=x.shape, prof=torch.equal(x, w.grad)))
print('df/dx = [df/dx1, df/dx2, ... , df/dxn]^T = w with shape {dim:}: {prof:}'.format(dim=w.shape, prof=torch.equal(w, x.grad)))


x:
 tensor([[0.8823],
        [0.9150],
        [0.3829],
        [0.9593],
        [0.3904]], requires_grad=True)
w:
 tensor([[0.6009],
        [0.2566],
        [0.7936],
        [0.9408],
        [0.1332]], requires_grad=True)

f = wTx:  2.0232625007629395
df/dw = [df/dw1, df/dw2, ... , df/dwn]^T = x with shape torch.Size([5, 1]): True
df/dx = [df/dx1, df/dx2, ... , df/dxn]^T = w with shape torch.Size([5, 1]): True


In [69]:
from torch.autograd.functional import jacobian
# Re-seed so that W and x below are identical on every run of this cell.
seed_everything()

# Linear map R^COLUMNS -> R^ROWS.
ROWS = 20
COLUMNS = 5
WEIGHTS_DIM = (ROWS, COLUMNS)
VECTOR_LENGTH = (COLUMNS, 1)

# Draw order matters for reproducibility: W first, then x.
# Only W is tracked by autograd; x is a constant input.
W = torch.rand(WEIGHTS_DIM).requires_grad_(True)
x = torch.rand(VECTOR_LENGTH)

def predict(W, x):
    """Linear forward pass: return the matrix product ``W x``.

    Args:
        W: weight tensor.
        x: input tensor whose leading dimension matches the last one of ``W``.

    Returns:
        ``torch.matmul(W, x)``.
    """
    product = torch.matmul(W, x)
    return product

# Forward pass: W is (ROWS, COLUMNS) and x is (COLUMNS, 1), so pred is a (ROWS, 1) column.
pred = predict(W, x)




def compute_jac(weights, x, func=None):
    """Gradient of the summed outputs of ``func(weights, x)`` w.r.t. ``weights``.

    One vector-Jacobian product is taken per output component (using the rows
    of an identity matrix as ``grad_outputs``), which yields the gradient of
    every individual output w.r.t. ``weights``. Those per-output gradients are
    then summed, so despite the name the result is NOT the full Jacobian but a
    tensor shaped like ``weights``. For ``func = W x`` it equals
    ``ones(rows, 1) @ x.T``.

    Args:
        weights: tensor with ``requires_grad=True`` that ``func`` depends on.
        x: column vector of shape ``(n, 1)``; it is squeezed to ``(n,)`` so
            that ``func`` returns a 1-D output.
        func: forward function called as ``func(weights, x_1d)``. Defaults to
            the notebook's ``predict``.

    Returns:
        Tensor with the same shape as ``weights``.
    """
    if func is None:
        func = predict  # notebook-level forward pass

    # Build the graph once; every per-output gradient reuses it.
    output = func(weights, x.squeeze(1))

    # Size, dtype and device come from the output itself rather than from the
    # notebook-level ROWS constant, so any weight shape / dtype works.
    unit_vectors = torch.eye(output.shape[0], dtype=output.dtype, device=output.device)

    # retain_graph=True keeps the graph alive across the repeated grad calls.
    jacobian_rows = [
        torch.autograd.grad(output, weights, vec, retain_graph=True)[0]
        for vec in unit_vectors
    ]
    return torch.stack(jacobian_rows).sum(0)

print('forward\npredicitons = Wx: R^{dim1} -> R^{dim2}\n'.format(dim1=x.size(0), dim2=pred.size(0)))


# backward

# Closed form of what compute_jac returns: every row of the result is x^T,
# i.e. ones(ROWS, 1) @ x^T with shape (ROWS, COLUMNS).
# NOTE(review): the labels below read "dWx / dx", but the quantity compared is
# a derivative with respect to W (summed over the outputs) -- dWx/dx itself
# would be W. The strings are left untouched so the recorded output stays valid.
jac = torch.matmul(torch.ones((ROWS, 1)), x.T)
print('dWx / dx = \n[[dw_1Tx / dw_1, dw_1Tx / dw_2, ... , dw_1Tx / dx_n]\n [dw_2Tx / dw_1, dw_2Tx / dw_2, ... , dw_2Tx / dw_n]\n... \n [dw_mTx / dw_1, dw_mTx / dw_2, ... , dw_mTx / dw_n]]\n')
print('Dimentions: ones: {dim_1}, x: {dim_x}, dWx / dx: {dim3}'.format(dim_1=(ROWS, 1), dim_x=VECTOR_LENGTH, dim3=jac.shape))
# torch.equal also compares shapes, so a mismatch cannot slip through via
# broadcasting the way an elementwise == followed by all() could.
print('dWx / dx = ones.T xT : ', torch.equal(jac, compute_jac(W, x)))


forward
predicitons = Wx: R^5 -> R^20

dWx / dx = 
[[dw_1Tx / dw_1, dw_1Tx / dw_2, ... , dw_1Tx / dx_n]
 [dw_2Tx / dw_1, dw_2Tx / dw_2, ... , dw_2Tx / dw_n]
... 
 [dw_mTx / dw_1, dw_mTx / dw_2, ... , dw_mTx / dw_n]]

Dimentions: ones: (20, 1), x: (5, 1), dWx / dx: torch.Size([20, 5])
dWx / dx = ones.T xT :  True


# Gradient for Loss functions 

In [70]:
# MSE gradient: compare autograd against d MSE / d pred = -2/n * (y - pred).
seed_everything()

VECTOR_LENGTH = (5, 1)

pred = torch.rand(VECTOR_LENGTH, requires_grad=True)
y = torch.rand(VECTOR_LENGTH)

mse = torch.mean((y - pred) ** 2)
mse.backward()  # fills pred.grad


# torch.mean divides by the total element count, so take n from the tensor.
n = pred.numel()
# The hand-written formula is a plain value; keep it out of the autograd graph.
with torch.no_grad():
    grad_mse = -2 / n * (y - pred)

print('d MSE / d pred_i = - 1/n * 2 (y_i - pred_i)')
# The manual formula and autograd multiply in a different order, so compare
# with a floating-point tolerance (plus an explicit shape check, because
# allclose broadcasts) rather than exact equality.
print('d MSE / d pred = [d MSE / d pred_1, d MSE / d pred_2, ... , d MSE / d pred_n]^T: ',  grad_mse.shape == pred.grad.shape and torch.allclose(grad_mse, pred.grad))

d MSE / d pred_i = - 1/n * 2 (y_i - pred_i)
d MSE / d pred = [d MSE / d pred_1, d MSE / d pred_2, ... , d MSE / d pred_n]^T:  True


# Simple backpropagation
```
forward:
pred = Wx
MSE(y, pred)

backward:
dMSE/dW = 1/m * {Sum_i dERROR_i/dW for i in range(m)}
dERROR_i/dW = (d ERROR_i / d pred_i)*(d pred_i / dW) 
[m * n] = [1 * 1] * [m * n]
```

In [197]:
# Single linear layer + MSE: rebuild dMSE/dW by hand via the chain rule and
# compare it with autograd.
seed_everything()
ROWS, COLUMNS = (20, 5)
VECTOR_LENGTH = (COLUMNS, 1)

# Draw order (x, W, y) is kept fixed so the seeded values stay reproducible.
x = torch.rand(VECTOR_LENGTH)
W = torch.rand((ROWS, COLUMNS), requires_grad=True)
n = ROWS
y = torch.rand((ROWS, 1))

print('forward\npredicitons = Wx: R^{dim1} -> R^{dim2}'.format(dim1=x.size(0), dim2=y.size(0)))
print('Expected dimention: ', (ROWS, COLUMNS))
print()

# forward
pred = torch.matmul(W, x)
mse = torch.mean((y - pred) ** 2)

# Clear any stale gradient before backward. This must act on W.grad: zeroing
# W itself would wipe the weights (and is rejected for a leaf that requires grad).
W.grad = None
mse.backward()
print('dMSE / dW: ', W.grad.shape)

# backward (manual) -- plain tensor arithmetic, no autograd tracking needed.
with torch.no_grad():
    dMSE_dpred = -2/n * (y - pred)

    # jacobian_predictions[i] = d pred_i / dW: all zeros except row i, which is x^T.
    jacobian_predictions = torch.zeros((ROWS, ROWS, COLUMNS))
    for i in range(ROWS):
        jacobian_predictions[i, i] = x.squeeze(1)

    # Chain rule: dMSE/dW = sum_i (dMSE/dpred_i) * (dpred_i/dW).
    jac = torch.zeros((ROWS, COLUMNS))
    for i in range(ROWS):
        jac += dMSE_dpred[i] * jacobian_predictions[i]

# Tolerance-based comparison with an explicit shape check (allclose broadcasts);
# exact float equality would depend on the order of operations.
print('dMSE / dW = SUM_i ( dMSE / dpred_i * dpred_i / dW ) for i in range(ROWS): ', jac.shape == W.grad.shape and torch.allclose(jac, W.grad))
print('Dimentions: ', dMSE_dpred[0].shape, jacobian_predictions.shape)

forward
predicitons = Wx: R^5 -> R^20
Expected dimention:  (20, 5)

dMSE / dW:  torch.Size([20, 5])
dMSE / dW = SUM_i ( dMSE / dpred_i * dpred_i / dW ) for i in range(ROWS):  True
Dimentions:  torch.Size([1]) torch.Size([20, 20, 5])


## Introducing backpropagation

In [198]:
# Vectorised form of the chain rule from the previous cell:
# dMSE/dW = (dMSE/dpred) @ x^T, an outer product of shape (ROWS, COLUMNS).
# Relies on n, y, pred, x and W as defined by the previous cell.
dMSE_dpred = -2/n * (y - pred.detach())  # detach: a plain value, not part of the graph
jac2 = torch.matmul(dMSE_dpred, x.T)
# Shape check plus tolerance instead of exact float equality.
jac2.shape == W.grad.shape and torch.allclose(jac2, W.grad)

True

# Simple neural network

```
forward:
(n = input size, m = hidden size, k = output size)
x_1 = W_1 x_0
[m * 1] = [m * n] [n * 1]

pred = W_2 x_1
[k * 1] = [k * m] [m * 1]

MSE(y, pred)
[1] = [1 * k] [k * 1]

backward:
dMSE/dW1 = [[dMSE/dpred]T [dpred/dx_1]]T x_0.T 
[m * n] = [[1 * k] [k * m]]T [1 * n]
```



In [195]:
# Two stacked linear layers + MSE: derive dMSE/dW1 by hand and compare with autograd.
seed_everything()
ROWS1, COLUMNS1 = (9, 5)
ROWS2, COLUMNS2 = (7, 9)

# The input length must match W1's column count. Use COLUMNS1 from this cell
# rather than a COLUMNS value left over from an earlier cell.
VECTOR_LENGTH = (COLUMNS1, 1)

# Draw order (x, W1, W2, then y below) is kept fixed for reproducibility.
x = torch.rand(VECTOR_LENGTH)
W1 = torch.rand((ROWS1, COLUMNS1), requires_grad=True)
W2 = torch.rand((ROWS2, COLUMNS2), requires_grad=True)

n = ROWS2  # number of outputs averaged by the MSE
print('Expected dimention: ', (ROWS1, COLUMNS1))
print()
y = torch.rand((ROWS2, 1))

# forward
pred = torch.matmul(W2, torch.matmul(W1,x))
mse = torch.mean((y - pred) ** 2)

# Clear stale gradients on the tensors this cell actually differentiates.
W1.grad = None
W2.grad = None
mse.backward()

# Dervatives = Gradient.T
# Manual chain rule in row-vector (derivative) layout; plain values only.
with torch.no_grad():
    dMSE_dpred = -2/n * (y - pred).T          # (1, ROWS2)
    dMSE_dx1 = torch.matmul(dMSE_dpred, W2)   # (1, ROWS1), since dpred/dx_1 = W2
    dMSE_dW1 = torch.matmul(x, dMSE_dx1)      # (COLUMNS1, ROWS1), transpose of the gradient




print(dMSE_dpred.shape, '->', dMSE_dx1.shape, '->', dMSE_dW1.shape)
print('to gradient: ', dMSE_dW1.T.shape)
# Shape check plus tolerance instead of exact float equality, which would
# depend on the order of operations.
print(W1.grad.shape, '\n', dMSE_dW1.T.shape == W1.grad.shape and torch.allclose(dMSE_dW1.T, W1.grad))

Expected dimention:  (9, 5)

torch.Size([1, 7]) -> torch.Size([1, 9]) -> torch.Size([5, 9])
to gradient:  torch.Size([9, 5])
torch.Size([9, 5]) 
 True
