In [1]:
import torch

In [3]:
# Problem size: N samples with d features each.
N = 1000
d = 48

# Seed the global RNG so that Restart Kernel -> Run All draws the same tensors
# every time; without it every number shown below changes between runs.
SEED = 0
torch.manual_seed(SEED)

X = torch.randn((N, d))  # inputs
w = torch.randn((d, 1))  # weights of the single linear layer
b = torch.randn(1)       # bias, broadcast over all rows
y = torch.randn((N, 1))  # regression targets

In [4]:
# Manual forward pass: affine map -> ReLU -> column-wise mean squared error.
z = b + X @ w
a = z.clamp(min=0)
mse_loss = ((a - y) ** 2).mean(dim=0)

# Sanity check: shapes, the smallest activation and the loss itself.
z.shape, a.shape, a.min(), mse_loss

(torch.Size([1000, 1]), torch.Size([1000, 1]), tensor(0.), tensor([22.4021]))

### TODOs
* Add `forward` method

In [54]:
class Linear():
    """Affine layer computing `x @ w + b` with externally supplied parameters."""

    def __init__(self, w, b):
        """Keep references to the weight `w` and bias `b` (no copies are made)."""
        self.w, self.b = w, b

    def __call__(self, x):
        """Apply the affine map to `x` and return the result."""
        projected = x @ self.w
        return projected + self.b


class Relu():
    """Stateless ReLU: every negative entry becomes zero."""

    def __init__(self):
        """Nothing to configure."""

    def __call__(self, a):
        """Return a new tensor equal to `a` with values below zero raised to zero."""
        return a.clamp(min=0)


class MSE():
    """Mean squared error, averaged over the first dimension only."""

    def __init__(self):
        """Nothing to configure."""

    def __call__(self, target, pred):
        """Return the mean over dim 0 of the squared difference `target - pred`."""
        squared_error = (target - pred) ** 2
        return squared_error.mean(dim=0)


class Model():
    """Linear -> ReLU network whose call returns the MSE loss directly."""

    def __init__(self, w, b):
        """Build the layer stack from the weight `w` and bias `b`."""
        affine = Linear(w, b)
        activation = Relu()
        self.layers = [affine, activation]
        self.loss = MSE()

    def __call__(self, x, y):
        """Feed `x` through every layer in order, then score the result against `y`."""
        out = x
        for stage in self.layers:
            out = stage(out)
        # The loss callable takes (target, prediction), in that order.
        return self.loss(y, out)
    

### Ensure the forward pass is correct

In [55]:
# No wrapping via Model: chain the layer objects by hand.
# NOTE: this rebinds the module-level names `z` and `a`.
linear_layer1 = Linear(w, b)
z = linear_layer1(X)  # affine output
a = Relu()(z)  # negatives clamped to zero
MSE()(y, a)  # MSE takes (target, pred); as the last expression it is the cell's displayed value

tensor([22.4021])

In [56]:
# Wrapping via Model: the same computation, driven by Model.__call__.
model = Model(w, b)
model(X, y)  # displayed loss should equal the hand-chained result

tensor([22.4021])

### Compute gradients via PyTorch so we have a baseline to compare against

In [67]:
class Linear():
    """Affine layer: multiplies its input by `w` and adds `b`."""

    def __init__(self, w, b):
        """Store the weight and bias tensors by reference."""
        self.w, self.b = w, b

    def __call__(self, x):
        """Return `x @ w + b`."""
        weighted = x @ self.w
        return weighted + self.b


class Relu():
    """ReLU that remembers its input so autograd's gradient w.r.t. it can be inspected.

    After a call the input tensor is available as `self.a`; once `backward()` has
    run on a downstream result, `self.a.grad` holds the gradient for that input
    (provided the input takes part in autograd).
    """

    def __init__(self):
        # Most recent input; None until the layer has been called.
        self.a = None

    def __call__(self, a):
        """Record `a`, ask autograd to keep its gradient, and return `a` clamped at zero.

        Args:
            a: input tensor.

        Returns:
            A new tensor with every negative entry of `a` replaced by zero.
        """
        # retain_grad() raises a RuntimeError on tensors that do not require grad,
        # so only request it when autograd is actually tracking this input.
        if a.requires_grad:
            a.retain_grad()
        self.a = a
        return a.clamp_min(0)


class MSE():
    """Mean squared error (averaged over dim 0) that keeps its operands for inspection.

    After a call, `self.target` and `self.pred` reference the tensors that were
    passed in; once `backward()` has run on the returned loss, `self.pred.grad`
    holds the gradient for the prediction (provided it takes part in autograd).
    """

    def __init__(self):
        # Operands of the most recent call; None until the loss has been evaluated.
        self.target = None
        self.pred = None

    def __call__(self, target, pred):
        """Return the mean over dim 0 of `(target - pred) ** 2`.

        Args:
            target: reference values.
            pred: predicted values.

        Returns:
            The squared error averaged over the first dimension.
        """
        # retain_grad() raises a RuntimeError on tensors that do not require grad,
        # so only request it when autograd is actually tracking the prediction.
        if pred.requires_grad:
            pred.retain_grad()
        self.target = target
        self.pred = pred
        return (target - pred).pow(2).mean(dim=0)


class Model():
    """Linear -> ReLU stack that evaluates straight through to the MSE loss."""

    def __init__(self, w, b):
        """Assemble the layers from the weight `w` and bias `b`."""
        affine, activation = Linear(w, b), Relu()
        self.layers = [affine, activation]
        self.loss = MSE()

    def __call__(self, x, y):
        """Run `x` through each layer in turn and return the loss against `y`."""
        hidden = x
        for stage in self.layers:
            hidden = stage(hidden)
        # The loss callable takes (target, prediction), in that order.
        return self.loss(y, hidden)
    

In [68]:
def mkgrad(x):
    """Return an independent copy of `x` that autograd tracks as a leaf tensor.

    The copy is detached first so that it is always a graph leaf, even when `x`
    itself already requires grad; otherwise the clone would be a non-leaf and
    its `.grad` would stay empty after `backward()`.

    Args:
        x: floating-point tensor to copy.

    Returns:
        A new tensor with the same values, its own storage and requires_grad=True.
    """
    return x.detach().clone().requires_grad_(True)

# Tensors whose gradients will be checked, and their grad-tracking copies (same order).
chks = (w, b, X)
ptgrads = w_prime, b_prime, X_prime = tuple(mkgrad(t) for t in chks)

In [69]:
# Autograd baseline: forward pass on the grad-tracking copies, then backpropagate.
model_prime = Model(w_prime, b_prime)
loss_prime = model_prime(X_prime, y)
# Populates .grad on the leaf copies; the intermediate tensors also get .grad,
# assuming the retain_grad() variants of Relu/MSE are the ones currently defined.
loss_prime.backward()

# Uncomment to inspect the reference gradients:
# w_prime.grad, b_prime.grad, model_prime.loss.pred.grad[0:10], model_prime.layers[-1].a.grad[0:10]

### Update classes to compute gradient

In [50]:
class Linear():
    """Affine layer `x @ w + b`; the backward pass is still a stub."""

    def __init__(self, w, b):
        """Store the weight and bias tensors by reference."""
        self.w, self.b = w, b

    def __call__(self, x):
        """Return `x @ w + b`."""
        weighted = x @ self.w
        return weighted + self.b

    def backward(self):
        """Placeholder: no gradient is computed yet."""
        return None


class Relu():
    """ReLU activation; the backward pass is still a stub."""

    def __init__(self):
        """Nothing to configure."""

    def __call__(self, a):
        """Return a new tensor equal to `a` with values below zero raised to zero."""
        return a.clamp(min=0)

    def backward(self):
        """Placeholder: no gradient is computed yet."""
        return None


class MSE():
    """Mean squared error over dim 0 with a hand-written backward step."""

    def __init__(self):
        """Nothing to set up; `pred`, `target` and `out` are stored on each call."""

    def __call__(self, target, pred):
        """Cache both operands and the loss, then return the loss."""
        self.pred, self.target = pred, target
        squared_error = (target - pred) ** 2
        self.out = squared_error.mean(dim=0)
        return self.out

    def backward(self):
        """Attach d(loss)/d(pred) to the cached prediction as the attribute `g`."""
        n_rows = self.target.shape[0]
        residual = self.pred - self.target
        # The mean over n_rows of (target - pred)^2 differentiates to
        # 2 * (pred - target) / n_rows.
        self.pred.g = (2 / n_rows) * residual



class Model():
    """Linear -> ReLU stack scored with MSE; the model-level backward is still a stub."""

    def __init__(self, w, b):
        """Assemble the layers from the weight `w` and bias `b`."""
        affine, activation = Linear(w, b), Relu()
        self.layers = [affine, activation]
        self.loss = MSE()

    def __call__(self, x, y):
        """Run `x` through each layer in turn and return the loss against `y`."""
        hidden = x
        for stage in self.layers:
            hidden = stage(hidden)
        # The loss callable takes (target, prediction), in that order.
        return self.loss(y, hidden)

    def backward(self):
        """Placeholder: gradients are not propagated through the layers yet."""
        return None

In [51]:
# Wrapping via Model: the forward pass also caches pred/target on model.loss,
# assuming the MSE variant with the manual backward() is the one currently defined.
model = Model(w, b)
loss = model(X, y)
loss  # displayed so it can be compared with the earlier forward-pass result

tensor([22.4021])

In [52]:
# Run only the loss's hand-written backward step; it stores dloss/dpred on model.loss.pred.g.
model.loss.backward()

In [53]:
# First ten rows of the manually computed gradient of the loss w.r.t. the prediction.
model.loss.pred.g[0:10]

tensor([[-0.0011],
        [ 0.0007],
        [ 0.0097],
        [ 0.0003],
        [ 0.0120],
        [ 0.0025],
        [-0.0007],
        [ 0.0117],
        [ 0.0010],
        [ 0.0007]])

tensor([[-0.0011],
        [ 0.0007],
        [ 0.0097],
        [ 0.0003],
        [ 0.0120],
        [ 0.0025],
        [-0.0007],
        [ 0.0117],
        [ 0.0010],
        [ 0.0007]])

tensor([[0.0000],
        [0.0000],
        [0.0097],
        [0.0000],
        [0.0120],
        [0.0000],
        [0.0000],
        [0.0117],
        [0.0000],
        [0.0000]])