In [11]:
import torch
from torch import nn
from tests_backpropagation import main_test

torch.manual_seed(123)
torch.set_default_dtype(torch.double)

## Class ``MyNet``

Read carefully how ``MyNet`` is implemented in the cell below. In particular:  
- ``n_hid`` is a list of integer, representing the number of hidden units in each hidden layer.   
-  ``MyNet([2, 3, 2]) = MiniNet()`` where ``MiniNet`` is the neural network defined in the fourth tutorial, in which notations are also clarified.     
- ``model.L`` is the number of hidden layers, ``L``   
- ``model.f[l]`` is the activation function of layer ``l``, $f^{[l]}$ (here ``torch.tanh``)   
- ``model.df[l]`` is the derivative of the activation function, $f'^{[l]}$   
- ``model.a[l]``  is the tensor $A^{[l]}$, (shape: ``(1, n(l))``)   
- ``model.z[l]``  is the tensor $Z^{[l]}$, (shape: ``(1, n(l))``)  
- Weights $W^{[l]}$ (shape: ``(n(l+1), n(l))``) and biases $\mathbf{b}^{[l]}$ (shape: ``(n(l+1))``) can be accessed as follows:
```
weights = model.fc[str(l)].weight.data
bias = model.fc[str(l)].bias.data
```

In [12]:
class MyNet(nn.Module):
    def __init__(self, n_l = [2, 3, 2]):
        super().__init__() 
        
        
        # number of layers in our network (following Andrew's notations)
        self.L = len(n_l)-1
        self.n_l = n_l
        
        # Where we will store our neuron values
        # - z: before activation function 
        # - a: after activation function (a=f(z))
        self.z = {i : None for i in range(1, self.L+1)}
        self.a = {i : None for i in range(self.L+1)}

        # Where we will store the gradients for our custom backpropagation algo
        self.dL_dw = {i : None for i in range(1, self.L+1)}
        self.dL_db = {i : None for i in range(1, self.L+1)}

        # Our activation functions
        self.f = {i : lambda x : torch.tanh(x) for i in range(1, self.L+1)}

        # Derivatives of our activation functions
        self.df = {
            i : lambda x : (1 / (torch.cosh(x)**2)) 
            for i in range(1, self.L+1)
        }
        
        # fully connected layers
        # We have to use nn.ModuleDict and to use strings as keys here to 
        # respect pytorch requirements (otherwise, the model does not learn)
        self.fc = nn.ModuleDict({str(i): None for i in range(1, self.L+1)})
        for i in range(1, self.L+1):
            self.fc[str(i)] = nn.Linear(in_features=n_l[i-1], out_features=n_l[i])
        
    def forward(self, x):
        # Input layer
        self.a[0] = torch.flatten(x, 1)
        
        # Hidden layers until output layer
        for i in range(1, self.L+1):

            # fully connected layer
            self.z[i] = self.fc[str(i)](self.a[i-1])
            # activation
            self.a[i] = self.f[i](self.z[i])

        # return output
        return self.a[self.L]

## Tasks

Write a function ``backpropagation(model, y_true, y_pred)`` that computes:

- $\frac{\partial L}{\partial w^{[l]}_{i,j}}$ and store them in ``model.dL_dw[l][i,j]`` for $l \in [1 .. L]$ 
- $\frac{\partial L}{\partial b^{[l]}_{j}}$ and store them in ``model.dL_db[l][j]`` for $l \in [1 .. L]$ 

assuming ``model`` is an instance of the ``MyNet`` class.

A vectorized implementation would be appreciated.

In [13]:
def backpropagation(model, y_true, y_pred):
    print("y_true = ", y_true)
    print("y_pred = ", y_pred)

    # It might make sense to move this propert into the model class
    # That way we can change it along with the weights if we prefer.
    learning_rate = 0.01

    #
    # Compute slopes
    #

    # Each slope computation requires the slope from the next layer
    # Therefore, we compute the slope for the output layer first, then work backwards

    output_error = nn.MSELoss(reduction="none")(y_pred, y_true)
    output_slope = output_error * model.df[model.L](y_pred)

    model.dL_dw[model.L] = output_slope
    model.dL_db[model.L] = output_slope

    # Compute slopes for hidden layers
    for layer in reversed(range(1, model.L)):

        # Calculate error
        next_slope = model.dL_dw[layer + 1]
        next_out = model.a[layer + 1]
        next_out_transposed = torch.transpose(next_out, 0, 1)

        error = torch.matmul(next_slope, next_out_transposed)

        # Calculate slope
        out = model.a[layer]
        slope = error * model.df[layer](out)
        
        # Store slope
        model.dL_dw[layer] = slope
        model.dL_db[layer] = slope

    #
    # Update weights and biases
    #

    # The first layer is the input layer and does not have weights not biases.
    # Therefore the first updates layer has index 1 (0 being the first)

    return

    for layer in reversed(range(1, model.L - 1)):

        # Compute update for weights
        previous_out = model.a[layer - 1]
        previous_out_transposed = torch.transpose(previous_out, 0, 1)
        weight_slope = model.dL_dw[layer]
        weight_change = torch.matmul(previous_out_transposed, weight_slope) * learning_rate
        
        # Compute update for biases
        bias_slope = model.dL_db[layer]
        bias_change = bias_slope.sum() * learning_rate

        # Apply updates
        parameters = model.fc[str(layer)]
        parameters.weight.add_(weight_change)
        parameters.bias.add_(bias_change)

## Run the cells below, and check the output

- In the 1st cell, we use a toy dataset and the same architecture as the MiniNet class of the fourth tutorial. 
- In the 2nd cell, we use a few samples of the MNIST dataset with a consistent model architecture (``24x24`` black and white cropped images as input and ``10`` output classes). 

You can set ``verbose`` to ``True`` if you want more details about your computations versus what is expected.

In [14]:
model = MyNet([2, 3, 2])
main_test(backpropagation, model, verbose=True, data='toy')


 __________________________________________________________________ 
                          Check gradients                             
 __________________________________________________________________ 
y_true =  tensor([[0., 0.]])
y_pred =  tensor([[-0.3391,  0.4280]], grad_fn=<TanhBackward0>)


IndexError: index 1 is out of bounds for dimension 0 with size 1

In [None]:
model = MyNet([24*24, 16, 10])
main_test(backpropagation, model, verbose=True, data='mnist')

0.3%

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting ../data/MNIST\raw\train-images-idx3-ubyte.gz to ../data/MNIST\raw


100.0%


Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST\raw\train-labels-idx1-ubyte.gz
Extracting ../data/MNIST\raw\train-labels-idx1-ubyte.gz to ../data/MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz



31.8%

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST\raw\t10k-images-idx3-ubyte.gz


100.0%
100.0%

Extracting ../data/MNIST\raw\t10k-images-idx3-ubyte.gz to ../data/MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST\raw\t10k-labels-idx1-ubyte.gz
Extracting ../data/MNIST\raw\t10k-labels-idx1-ubyte.gz to ../data/MNIST\raw







 __________________________________________________________________ 
                          Check gradients                             
 __________________________________________________________________ 
y_true =  tensor([[7.]])
y_pred =  tensor([[ 0.1090, -0.3560,  0.5218, -0.2193,  0.2857,  0.2657, -0.0566,  0.4653,
         -0.4371,  0.2910]], grad_fn=<TanhBackward0>)


  return F.mse_loss(input, target, reduction=self.reduction)


IndexError: index 16 is out of bounds for dimension 1 with size 16