In [66]:
import torch
from torch import nn
from tests_backpropagation import main_test

torch.manual_seed(123)
torch.set_default_dtype(torch.double)

## Class ``MyNet``

Read carefully how ``MyNet`` is implemented in the cell below. In particular:  
- ``n_hid`` is a list of integer, representing the number of hidden units in each hidden layer.   
-  ``MyNet([2, 3, 2]) = MiniNet()`` where ``MiniNet`` is the neural network defined in the fourth tutorial, in which notations are also clarified.     
- ``model.L`` is the number of hidden layers, ``L``   
- ``model.f[l]`` is the activation function of layer ``l``, $f^{[l]}$ (here ``torch.tanh``)   
- ``model.df[l]`` is the derivative of the activation function, $f'^{[l]}$   
- ``model.a[l]``  is the tensor $A^{[l]}$, (shape: ``(1, n(l))``)   
- ``model.z[l]``  is the tensor $Z^{[l]}$, (shape: ``(1, n(l))``)  
- Weights $W^{[l]}$ (shape: ``(n(l+1), n(l))``) and biases $\mathbf{b}^{[l]}$ (shape: ``(n(l+1))``) can be accessed as follows:
```
weights = model.fc[str(l)].weight.data
bias = model.fc[str(l)].bias.data
```

In [67]:
class MyNet(nn.Module):
    def __init__(self, n_l = [2, 3, 2]):
        super().__init__() 
        
        
        # number of layers in our network (following Andrew's notations)
        self.L = len(n_l)-1
        self.n_l = n_l
        
        # Where we will store our neuron values
        # - z: before activation function 
        # - a: after activation function (a=f(z))
        self.z = {i : None for i in range(1, self.L+1)}
        self.a = {i : None for i in range(self.L+1)}

        # Where we will store the gradients for our custom backpropagation algo
        self.dL_dw = {i : None for i in range(1, self.L+1)}
        self.dL_db = {i : None for i in range(1, self.L+1)}

        # Our activation functions
        self.f = {i : lambda x : torch.tanh(x) for i in range(1, self.L+1)}

        # Derivatives of our activation functions
        self.df = {
            i : lambda x : (1 / (torch.cosh(x)**2)) 
            for i in range(1, self.L+1)
        }
        
        # fully connected layers
        # We have to use nn.ModuleDict and to use strings as keys here to 
        # respect pytorch requirements (otherwise, the model does not learn)
        self.fc = nn.ModuleDict({str(i): None for i in range(1, self.L+1)})
        for i in range(1, self.L+1):
            self.fc[str(i)] = nn.Linear(in_features=n_l[i-1], out_features=n_l[i])
        
    def forward(self, x):
        # Input layer
        self.a[0] = torch.flatten(x, 1)
        
        # Hidden layers until output layer
        for i in range(1, self.L+1):

            # fully connected layer
            self.z[i] = self.fc[str(i)](self.a[i-1])
            # activation
            self.a[i] = self.f[i](self.z[i])

        # return output
        return self.a[self.L]

## Tasks

Write a function ``backpropagation(model, y_true, y_pred)`` that computes:

- $\frac{\partial L}{\partial w^{[l]}_{i,j}}$ and store them in ``model.dL_dw[l][i,j]`` for $l \in [1 .. L]$ 
- $\frac{\partial L}{\partial b^{[l]}_{j}}$ and store them in ``model.dL_db[l][j]`` for $l \in [1 .. L]$ 

assuming ``model`` is an instance of the ``MyNet`` class.

A vectorized implementation would be appreciated.

In [98]:
def backpropagation(model, y_true, y_pred):
    l = model.L
    ed = -2*(y_true-y_pred)
    delta = ed*(model.df[l](model.z[l]))
    pdw = delta.T*(model.a[l-1])
    model.dL_dw[l] = pdw
    model.dL_db[l] = torch.flatten(delta)
    for layer in range(l-1, 0, -1):
        weights = model.fc[str(layer+1)].weight.data
        delta = (delta@weights)*(model.df[layer](model.z[layer]))
        pdw = delta.T*(model.a[layer-1])
        model.dL_dw[layer] = pdw
        model.dL_db[layer] = delta[0]
        print(delta)
       # print("Hello")
        #print(torch.flatten(delta))
        #print(delta[0])

    return None


## Run the cells below, and check the output

- In the 1st cell, we use a toy dataset and the same architecture as the MiniNet class of the fourth tutorial. 
- In the 2nd cell, we use a few samples of the MNIST dataset with a consistent model architecture (``24x24`` black and white cropped images as input and ``10`` output classes). 

You can set ``verbose`` to ``True`` if you want more details about your computations versus what is expected.

In [99]:
model = MyNet([2, 3, 2])
main_test(backpropagation, model, verbose=False, data='toy')


 __________________________________________________________________ 
                          Check gradients                             
 __________________________________________________________________ 
tensor([[-0.1875,  0.0510,  0.2081]], grad_fn=<MulBackward0>)
tensor([[-0.1250,  0.8498, -0.8401]], grad_fn=<MulBackward0>)
tensor([[ 0.0298,  0.0606, -0.0674]], grad_fn=<MulBackward0>)
tensor([[-0.0844, -0.1868,  0.0285]], grad_fn=<MulBackward0>)
tensor([[-0.0110, -0.0308,  0.0119]], grad_fn=<MulBackward0>)
tensor([[-0.0037, -0.0012,  0.0067]], grad_fn=<MulBackward0>)
tensor([[-0.0013,  0.0196,  0.0033]], grad_fn=<MulBackward0>)
tensor([[-0.0007,  0.0084,  0.0028]], grad_fn=<MulBackward0>)
tensor([[-6.0178e-05,  3.3598e-02,  3.9726e-04]], grad_fn=<MulBackward0>)
tensor([[-5.5591e-05,  5.8525e-02,  1.6770e-04]], grad_fn=<MulBackward0>)
tensor([[ 0.5447, -0.0538, -0.6074]], grad_fn=<MulBackward0>)
tensor([[-0.0623,  0.0092,  0.0229]], grad_fn=<MulBackward0>)
tensor([[-0.0117,  0.0

In [91]:
model = MyNet([24*24, 16, 10])
main_test(backpropagation, model, verbose=False, data='mnist')


 __________________________________________________________________ 
                          Check gradients                             
 __________________________________________________________________ 
tensor([[ 4.2234,  5.7877,  6.8000, -5.1739, -4.8914, -3.9720, -1.8875,  3.2703,
         -1.1382, -2.9207, -3.0426,  0.5908, -1.7191,  8.2706, -0.1163,  1.1097]],
       grad_fn=<MulBackward0>)
Hello
tensor([ 4.2234,  5.7877,  6.8000, -5.1739, -4.8914, -3.9720, -1.8875,  3.2703,
        -1.1382, -2.9207, -3.0426,  0.5908, -1.7191,  8.2706, -0.1163,  1.1097],
       grad_fn=<ReshapeAliasBackward0>)


  return F.mse_loss(input, target, reduction=self.reduction)


tensor([[ 5.4977e-04,  5.4455e-06, -5.0843e-06, -1.2099e-03,  3.4277e-03,
         -6.9010e-02, -4.8888e-02, -6.1763e-04, -4.2158e-01, -1.8283e-02,
          1.4199e-02, -6.5599e-01,  9.3027e-02, -4.3346e-06, -1.6800e-01,
         -4.7703e-01]], grad_fn=<MulBackward0>)
Hello
tensor([ 5.4977e-04,  5.4455e-06, -5.0843e-06, -1.2099e-03,  3.4277e-03,
        -6.9010e-02, -4.8888e-02, -6.1763e-04, -4.2158e-01, -1.8283e-02,
         1.4199e-02, -6.5599e-01,  9.3027e-02, -4.3346e-06, -1.6800e-01,
        -4.7703e-01], grad_fn=<ReshapeAliasBackward0>)
tensor([[ 6.2709e-21,  2.0907e-28, -1.0523e-31, -6.2003e-25,  2.1973e-24,
         -2.4942e-20, -7.9916e-13, -3.8823e-17, -9.1087e-11, -9.1539e-17,
          2.5487e-16, -3.0564e-06,  6.7951e-11, -1.6967e-37, -5.0003e-06,
         -3.2246e-06]], grad_fn=<MulBackward0>)
Hello
tensor([ 6.2709e-21,  2.0907e-28, -1.0523e-31, -6.2003e-25,  2.1973e-24,
        -2.4942e-20, -7.9916e-13, -3.8823e-17, -9.1087e-11, -9.1539e-17,
         2.5487e-16, -3.0564