# Set up
Run the cell below before running the rest of the cells.

In [None]:
from collections import OrderedDict

import torch
import torch.nn as nn
# Seed PyTorch's global RNG so the randomly generated tensors and layer weights are reproducible.
torch.manual_seed(2109)

from matplotlib.ticker import FormatStrFormatter
import matplotlib.pyplot as plt

# Question 1 Back Propagation
We provide the neural network architecture in PyTorch code below, as well as the loss function. Check your answers to part (c) by replacing `raise NotImplementedError()` with your answer.

Note that the matrix dimensions are flipped in PyTorch, where the first dimension is the number of training samples. This is just a difference in conventions and will not affect much. Also note that we don't have the bias term in `X`; this is because the `nn.Linear` layer has its own bias included. This will not affect your answers, since we are only asking for weight gradients.

In [None]:
'''
Set up the training data, the two-layer network, and the loss used in Question 1.
'''
# Problem dimensions.
n = 16  # Number of training samples
m_0 = 8  # Number of features for each training sample

# Random inputs and random binary (0/1) targets stored as floats.
X = torch.randn(n, m_0)
Y = torch.randint(low=0, high=2, size=(n, 1), dtype=torch.float32)

# Register the layers one by one; the names are what later cells use to access them.
neural_net = nn.Sequential()
neural_net.add_module('lin1', nn.Linear(m_0, 1))
neural_net.add_module('lin2', nn.Linear(1, 1))  # Dummy layer so that we can illustrate part (c)
neural_net.add_module('sig', nn.Sigmoid())

# The dummy layer has a single weight; pin it to 1 (outside of autograd) so that it does not
# affect the first layer's gradients.
with torch.no_grad():
    neural_net.lin2.weight.fill_(1.0)

loss = nn.BCELoss()

In [None]:
'''
Compute, one sample at a time, the gradient of the loss w.r.t. the dummy layer's weight, then
divide it by the output of `lin1` to recover the derivative of the loss w.r.t. f.

This is an unusual thing to do; it is here only to illustrate part (c) of the tutorial.

If you are curious about per-sample gradients and an optimized functorch implementation, see
https://pytorch.org/functorch/stable/notebooks/per_sample_grads.html
'''
per_sample_gradients = torch.zeros(n, 1)
lin1_output = torch.zeros(n, 1)

for idx, (x_i, y_i) in enumerate(zip(X, Y)):
    # Loss of this single sample, differentiated w.r.t. the dummy layer's weight only.
    sample_loss = loss(neural_net(x_i), y_i)
    (grad_wrt_lin2_weight,) = torch.autograd.grad(sample_loss, neural_net.lin2.weight)
    per_sample_gradients[idx] = grad_wrt_lin2_weight
    # Input that the dummy layer saw for this sample.
    lin1_output[idx] = neural_net.lin1(x_i)

# Scale by 1/n first, then divide by the `lin1` output (same evaluation order as before).
scale = 1 / n
dLoss_df = scale * per_sample_gradients / lin1_output

In [None]:
'''
Run the full batch through the network and back-propagate the loss of the prediction.

PyTorch fills in the gradients automatically as soon as .backward() is called on the loss.
'''
neural_net.zero_grad()  # reset the gradients stored on the network's parameters

Y_hat = neural_net(X)
batch_loss = loss(Y_hat, Y)
batch_loss.backward()

In [None]:
def part_c(Y, Y_hat):
    '''
    Answer to part (c): the derivative of the loss w.r.t. f, one entry per training sample.

    Y: ground-truth labels, with one row per training sample.
    Y_hat: the network's predictions, same shape as Y.

    Returns (Y_hat - Y) scaled by 1 / (number of rows in Y).
    '''
    # Read the sample count from Y itself instead of the notebook-level `n`, so the function
    # does not depend on a global and stays correct for any batch size.
    num_samples = Y.shape[0]
    return 1 / num_samples * (Y_hat - Y)

# Sanity check: the closed-form answer must agree with the autograd-derived per-sample values.
assert part_c(Y, Y_hat).allclose(dLoss_df)

# Question 3 Potential Issues with Training Deep Neural Networks
Run/Study the code below, and answer 3(a) and 3(b) of the tutorial.

In [None]:
'''
Build the 50-layer model, run a random batch through it, and back-propagate an L1 loss.
'''

# 50 Linear+ReLU pairs named lin1/act1 ... lin50/act50. Every Linear maps 10 -> 10 features,
# except the last one, which maps 10 -> 5.
layer_dict = OrderedDict()
for idx in range(1, 51):
    out_features = 5 if idx == 50 else 10
    layer_dict[f'lin{idx}'] = nn.Linear(10, out_features)
    layer_dict[f'act{idx}'] = nn.ReLU()  # previously sigmoid

deep_neural_net = nn.Sequential(layer_dict)

# Random inputs and random regression targets for a batch of 50 samples.
deep_X = torch.randn(50, 10)
deep_Y = torch.randn(50, 5)

deep_loss = nn.L1Loss()

deep_Y_hat = deep_neural_net(deep_X)
deep_loss(deep_Y_hat, deep_Y).backward()

In [None]:
'''
Visualize the max gradient magnitude for the first 10 layers. Note the y-axis scale for
each plot you generate.

Feel free to play around with the method of visualization here.
'''

# The Linear layers are registered as 'lin1' ... 'lin50' (1-based), so the first 10 layers are
# the ones whose numeric suffix is <= 10 (a strict `< 10` would only pick up 9 of them).
# `.item()` turns each 0-dim tensor into a plain float before it is handed to matplotlib.
max_grad_magnitude_per_layer = [
    layer.weight.grad.abs().max().item()
    for name, layer in deep_neural_net.named_modules()
    if isinstance(layer, nn.Linear) and int(name[3:]) <= 10
]

plt.plot(max_grad_magnitude_per_layer)
# Show the y-axis ticks in scientific notation so that very small magnitudes stay readable.
plt.gca().yaxis.set_major_formatter(FormatStrFormatter('%.1e'))
plt.show()