In [1]:
# Edited by: Kok Teng Ng (1936360), Minjeong Lee (1978925)
# IE 678 Deep Learning, University of Mannheim
# Author: Rainer Gemulla

In [2]:
import math
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

# import helper functions
import sys, os

sys.path.append(os.getcwd())
from a01helper import *  # check out the helper functions there, if you like

# Task 1: Implement an MLP

## 1a Logistic Regression

In [3]:
# nn.Module is the superclass of all PyTorch models.
class LogisticRegression(nn.Module):
    """A logistic regression model.

    Parameters
    ----------
    D number of inputs
    C number of classes
    """

    # the definition of all parameters the model uses happens here, i.e., during
    # initialization
    def __init__(self, D, C):
        super(LogisticRegression, self).__init__()

        # Create and initialize model parameters. For (multinomial) logistic regression,
        # we have a DxC-dimensional weight matrix W and a C-dimensional bias b.
        self.W = torch.randn(D, C) / math.sqrt(D)
        self.b = torch.randn(C) / math.sqrt(C)

        # Model parameters must be registered to PyTorch as follows. Here we provide
        # a useful name that helps to access/analyze the model later on.
        self.register_parameter("0_weight", nn.Parameter(self.W))
        self.register_parameter("0_bias", nn.Parameter(self.b))

    # the forward function computes the model output for the provided (for this
    # assignent: single) input
    def forward(self, x):
        eta = self.W.t() @ x + self.b
        logprob = F.log_softmax(eta, dim=-1)
        return logprob

In [4]:
# let's test it
logreg = LogisticRegression(3, 2)
x = torch.rand(3)  # input
logreg(x)  # output (log probabilities)
logreg(x).exp()  # output (probabilities)

tensor([0.1147, 0.8853])

In [5]:
# you can access individual parameters as follows
logreg.get_parameter("0_bias")

Parameter containing:
tensor([-0.7390,  1.0503], requires_grad=True)

In [6]:
# or all of them at once
list(logreg.named_parameters())

[('0_weight',
  Parameter containing:
  tensor([[-0.6024,  0.4440],
          [-0.7520, -0.7248],
          [-0.1647, -0.2000]], requires_grad=True)),
 ('0_bias',
  Parameter containing:
  tensor([-0.7390,  1.0503], requires_grad=True))]

In [7]:
# or directly the tensors stored in the parameters
for par, value in logreg.state_dict().items():
    print(f"{par:<15}= {value}")

0_weight       = tensor([[-0.6024,  0.4440],
        [-0.7520, -0.7248],
        [-0.1647, -0.2000]])
0_bias         = tensor([-0.7390,  1.0503])


## 1b MLP

In [8]:
class MLP(nn.Module):
    """A fully-connected MLP.

    Parameters
    ----------

    sizes Contains the layer sizes. The first entry is the number of inputs, the last
    entry the number of outputs. All entries in between correspond to the number of
    units in the respective hidden layer. E.g., [2,5,7,1] means: 2 inputs -> 5D hidden
    layer -> 7D hidden layer -> 1 output.

    phi Activation function used in every hidden layer (the output layer is linear).

    """
    def __init__(self, sizes: list[int], phi=F.sigmoid):
        super().__init__()

        # let's remember the specification in this model
        self.sizes = sizes
        self.phi = phi

        # Initialize and register the parameters. Follow the naming scheme used for
        # logistic regression above, i.e., the layer-i weights should be named "i_weight" and
        # "i_bias".
        
        for i in range(1, len(sizes)):
            self.register_parameter(f'{i - 1}_weight', torch.nn.Parameter(torch.randn(sizes[i - 1], sizes[i]) / math.sqrt(sizes[i - 1])))
            self.register_parameter(f'{i - 1}_bias', torch.nn.Parameter(torch.randn(sizes[i]) / math.sqrt(sizes[i])))

    def num_layers(self):
        """Number of layers (excluding input layer)"""
        return len(self.sizes) - 1

    def forward(self, x):
        for i in range(0, self.num_layers() - 1):
            weight = getattr(self, f"{i}_weight")
            bias = getattr(self, f"{i}_bias")
            x = weight.t() @ x + bias
            x = self.phi(x)
        weight = getattr(self, f"{self.num_layers() - 1}_weight")
        bias = getattr(self, f"{self.num_layers() - 1}_bias")
        x = weight.t() @ x + bias
        return x

In [9]:
# here you should see the correct parameter sizes
mlp = MLP([2, 3, 4, 2], torch.relu)
list(mlp.named_parameters())

[('0_weight',
  Parameter containing:
  tensor([[-1.3091, -0.3019,  0.3660],
          [-0.6868,  0.4673,  0.7127]], requires_grad=True)),
 ('0_bias',
  Parameter containing:
  tensor([-0.2805, -0.2305,  0.4078], requires_grad=True)),
 ('1_weight',
  Parameter containing:
  tensor([[ 0.1205,  0.5311, -0.8926, -0.2237],
          [-0.5769,  0.9532, -0.9783,  0.0031],
          [ 0.3102, -0.4063,  0.1198,  0.9896]], requires_grad=True)),
 ('1_bias',
  Parameter containing:
  tensor([ 0.2445, -0.2473,  0.4531, -0.4029], requires_grad=True)),
 ('2_weight',
  Parameter containing:
  tensor([[-0.0019,  0.2213],
          [-0.2815,  1.1297],
          [-0.1857,  0.5654],
          [-0.4670, -0.4101]], requires_grad=True)),
 ('2_bias',
  Parameter containing:
  tensor([-0.8514, -0.0948], requires_grad=True))]

In [10]:
# Test your code; we fix the parameters and check the result
with torch.no_grad():
    torch.manual_seed(0)
    for l in range(mlp.num_layers()):
        W, b = mlp.get_parameter(f"{l}_weight"), mlp.get_parameter(f"{l}_bias")
        W[:] = torch.randn(W.shape)
        b[:] = torch.randn(b.shape)

mlp(torch.tensor([-1.0, 2.0]))  # must give: [ 0.8315, -3.6792]

tensor([ 0.8315, -3.6792], grad_fn=<AddBackward0>)

In [11]:
# You can also evaluate your model on multiple inputs at once. Here "torch.func.vmap"
# produces a function that applies the provided function (mlp#forward) to each row of
# its argument (torch.tensor...).
#
# [[ 0.8315, -3.6792],
# [ 4.8448, -6.8813]]
torch.func.vmap(mlp)(torch.tensor([[-1.0, 2.0], [1.0, -2.0]]))

tensor([[ 0.8315, -3.6792],
        [ 4.8448, -6.8813]], grad_fn=<AddBackward0>)

## 1c Batching

In [12]:
class MLP(nn.Module):
    def __init__(self, sizes: list[int], phi=F.sigmoid):
        super().__init__()

        self.sizes = sizes
        self.phi = phi
        
        for i in range(1, len(sizes)):
            self.register_parameter(f'{i - 1}_weight', torch.nn.Parameter(torch.randn(sizes[i - 1], sizes[i]) / math.sqrt(sizes[i - 1])))
            self.register_parameter(f'{i - 1}_bias', torch.nn.Parameter(torch.randn(sizes[i]) / math.sqrt(sizes[i])))

    def num_layers(self):
        return len(self.sizes) - 1

    def forward(self, x):
        if x.dim == 1:
            x = x.unsqueeze(0)
        for i in range(0, self.num_layers() - 1):
            weight = getattr(self, f"{i}_weight")
            bias = getattr(self, f"{i}_bias")
            x = x @ weight + bias.unsqueeze(0)
            x = self.phi(x)
        weight = getattr(self, f"{self.num_layers() - 1}_weight")
        bias = getattr(self, f"{self.num_layers() - 1}_bias")
        x = x @ weight + bias.unsqueeze(0)
        return x

In [13]:
# here you should see the correct parameter sizes
mlp = MLP([2, 3, 4, 2], torch.relu)

In [14]:
# Test your code; we fix the parameters and check the result
with torch.no_grad():
    torch.manual_seed(0)
    for l in range(mlp.num_layers()):
        W, b = mlp.get_parameter(f"{l}_weight"), mlp.get_parameter(f"{l}_bias")
        W[:] = torch.randn(W.shape)
        b[:] = torch.randn(b.shape)

In [15]:
# After you adapted the MLP class, you should get the same results as above.
mlp(torch.tensor([-1.0, 2.0]))  # must give: [ 0.8315, -3.6792]

tensor([[ 0.8315, -3.6792]], grad_fn=<AddBackward0>)

In [16]:
# Now without vmap. Only proceed to task 2 once this works correctly.
#
# [[ 0.8315, -3.6792],
# [ 4.8448, -6.8813]]
mlp(torch.tensor([[-1.0, 2.0], [1.0, -2.0]]))

tensor([[ 0.8315, -3.6792],
        [ 4.8448, -6.8813]], grad_fn=<AddBackward0>)

# 2 Multi-Layer Feed-Forward Neural Networks

## 2a Conjecture how an FNN fit will look like

In [17]:
# here is the one-dimensional dataset that we will use
nextplot()
plot1(X1, y1, label="train")
plot1(X1test, y1test, label="test")
plt.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x2a061bc40>

## 2b Train with 2 hidden units

In [18]:
# Training code. You do not need to modify this code.
train_bfgs = lambda model, **kwargs: train_scipy(X1, y1, model, **kwargs)

def train1(hidden_sizes, nreps=10, phi=F.sigmoid, train=train_bfgs, **kwargs):
    
    """Train an FNN.

    hidden_sizes is a (possibly empty) list containing the sizes of the hidden layer(s).
    nreps refers to the number of repetitions.

    """
    
    best_model = None
    best_cost = math.inf
    for rep in range(nreps):
        model = MLP([1] + hidden_sizes + [1], phi)  # that's your model!
        print(f"X1 shape: {X1.shape}")
        print(f"Repetition {rep: 2d}: ", end="")
        model = train(model, **kwargs)
        mse = F.mse_loss(y1, model(X1)).item()
        if mse < best_cost:
            best_model = model
            best_cost = mse
        print(f"best_cost={best_cost:.3f}")

    return best_model

In [19]:
# Let's fit the model with one hidden layer consisting of 2 units.
model = train1([2], nreps=1)
print("Training error:", F.mse_loss(y1, model(X1)).item())
print("Test error    :", F.mse_loss(y1test, model(X1test)).item())

X1 shape: torch.Size([100, 1])
Repetition  0: Optimization terminated successfully.
         Current function value: 0.293673
         Iterations: 120
         Function evaluations: 240
         Gradient evaluations: 233
best_cost=0.294
Training error: 0.29367321729660034
Test error    : 0.30551230907440186


In [20]:
# plot the data and the fit
nextplot()
plot1(X1, y1, label="train")
plot1(X1test, y1test, label="test")
plot1fit(torch.linspace(0, 13, 500).unsqueeze(1), model)

<IPython.core.display.Javascript object>

In [21]:
# The weight matrices and bias vectors can be read out as follows. If you want, use
# these parameters to compute the output of the network (on X1) directly and compare to
# vmap(model)(X1).
for par, value in model.state_dict().items():
    print(f"{par:<15}= {value}")

0_weight       = tensor([[75.7091,  9.2566]])
0_bias         = tensor([-231.3285,  -54.1847])
1_weight       = tensor([[-1.3810],
        [ 0.7480]])
1_bias         = tensor([0.6471])


In [22]:
# now repeat this multiple times
for i in range (0, 5):
    model = train1([2], nreps=1)
    print("Training error:", F.mse_loss(y1, model(X1)).item())
    print("Test error    :", F.mse_loss(y1test, model(X1test)).item())
    nextplot()
    plot1(X1, y1, label="train")
    plot1(X1test, y1test, label="test")
    plot1fit(torch.linspace(0, 13, 500).unsqueeze(1), model)

X1 shape: torch.Size([100, 1])
Repetition  0:          Current function value: 0.079572
         Iterations: 387
         Function evaluations: 521
         Gradient evaluations: 511
best_cost=0.080
Training error: 0.07957355678081512
Test error    : 0.08671201020479202


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


<IPython.core.display.Javascript object>

X1 shape: torch.Size([100, 1])
Repetition  0: Optimization terminated successfully.
         Current function value: 0.303902
         Iterations: 143
         Function evaluations: 155
         Gradient evaluations: 155
best_cost=0.304
Training error: 0.30390220880508423
Test error    : 0.3037970960140228


<IPython.core.display.Javascript object>

X1 shape: torch.Size([100, 1])
Repetition  0:          Current function value: 0.079573
         Iterations: 400
         Function evaluations: 562
         Gradient evaluations: 551
best_cost=0.080
Training error: 0.07957581430673599
Test error    : 0.0867120772600174


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


<IPython.core.display.Javascript object>

X1 shape: torch.Size([100, 1])
Repetition  0:          Current function value: 0.286909
         Iterations: 302
         Function evaluations: 458
         Gradient evaluations: 445
best_cost=0.287
Training error: 0.28690817952156067
Test error    : 0.29484879970550537


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


<IPython.core.display.Javascript object>

X1 shape: torch.Size([100, 1])
Repetition  0:          Current function value: 0.079573
         Iterations: 336
         Function evaluations: 470
         Gradient evaluations: 458
best_cost=0.080
Training error: 0.07956987619400024
Test error    : 0.0867103710770607


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


<IPython.core.display.Javascript object>

In [23]:
# From now on, always train multiple times (nreps=10 by default) and report best model.
model = train1([2], nreps=10)

print("Training error:", F.mse_loss(y1, model(X1)).item())
print("Test error    :", F.mse_loss(y1test, model(X1test)).item())

X1 shape: torch.Size([100, 1])
Repetition  0: Optimization terminated successfully.
         Current function value: 0.357250
         Iterations: 66
         Function evaluations: 76
         Gradient evaluations: 76
best_cost=0.357
X1 shape: torch.Size([100, 1])
Repetition  1: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.438546
         Iterations: 278
         Function evaluations: 398
         Gradient evaluations: 386
best_cost=0.357
X1 shape: torch.Size([100, 1])
Repetition  2: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.286909
         Iterations: 448
         Function evaluations: 577
         Gradient evaluations: 565
best_cost=0.287
X1 shape: torch.Size([100, 1])
Repetition  3: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.079573
         Iterations: 310
         Function evaluations: 459
         Gradient evaluations: 449
best_cost=0.080
X1 shape: torch.Size([100, 1])
Repetition  4: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.286909
         Iterations: 307
         Function evaluations: 413
         Gradient evaluations: 401
best_cost=0.080
X1 shape: torch.Size([100, 1])
Repetition  5: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.286909
         Iterations: 534
         Function evaluations: 690
         Gradient evaluations: 681
best_cost=0.080
X1 shape: torch.Size([100, 1])
Repetition  6: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.079573
         Iterations: 330
         Function evaluations: 470
         Gradient evaluations: 458
best_cost=0.080
X1 shape: torch.Size([100, 1])
Repetition  7: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.079573
         Iterations: 381
         Function evaluations: 537
         Gradient evaluations: 524
best_cost=0.080
X1 shape: torch.Size([100, 1])
Repetition  8: Optimization terminated successfully.
         Current function value: 0.357250
         Iterations: 78
         Function evaluations: 80
         Gradient evaluations: 80
best_cost=0.080
X1 shape: torch.Size([100, 1])
Repetition  9:          Current function value: 0.286909
         Iterations: 324
         Function evaluations: 546
         Gradient evaluations: 533
best_cost=0.080
Training error: 0.07957330346107483
Test error    : 0.0867152065038681


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


In [24]:
# plot the data and the fit
nextplot()
plot1(X1, y1, label="train")
plot1(X1test, y1test, label="test")
plot1fit(torch.linspace(0, 13, 500).unsqueeze(1), model)

<IPython.core.display.Javascript object>

## 2c Width

In [25]:
# Experiment with different hidden layer sizes. To avoid recomputing
# models, you may want to save your models using torch.save(model, filename) and
# load them again using torch.load(filename).

for i in [1, 2, 3, 10, 50, 100]:
    model = train1([i], nreps = 1)
    print("Training error:", F.mse_loss(y1, model(X1)).item())
    print("Test error    :", F.mse_loss(y1test, model(X1test)).item())
    torch.save(model, f"model_{i}.pth")

X1 shape: torch.Size([100, 1])
Repetition  0: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


Optimization terminated successfully.
         Current function value: 0.372919
         Iterations: 42
         Function evaluations: 47
         Gradient evaluations: 47
best_cost=0.373
Training error: 0.3729189336299896
Test error    : 0.3743167221546173
X1 shape: torch.Size([100, 1])
Repetition  0:          Current function value: 0.079573
         Iterations: 390
         Function evaluations: 593
         Gradient evaluations: 583
best_cost=0.080
Training error: 0.07957376539707184
Test error    : 0.08670931309461594
X1 shape: torch.Size([100, 1])
Repetition  0: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.049892
         Iterations: 289
         Function evaluations: 469
         Gradient evaluations: 456
best_cost=0.050
Training error: 0.04989229515194893
Test error    : 0.0598050132393837
X1 shape: torch.Size([100, 1])
Repetition  0: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.005721
         Iterations: 1026
         Function evaluations: 1250
         Gradient evaluations: 1234
best_cost=0.006
Training error: 0.0057213036343455315
Test error    : 0.0150537034496665
X1 shape: torch.Size([100, 1])
Repetition  0: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.003007
         Iterations: 5655
         Function evaluations: 6282
         Gradient evaluations: 6268
best_cost=0.003
Training error: 0.0030068017076700926
Test error    : 2.0873610973358154
X1 shape: torch.Size([100, 1])
Repetition  0:          Current function value: 0.001826
         Iterations: 8126
         Function evaluations: 8804
         Gradient evaluations: 8792
best_cost=0.002
Training error: 0.001825749408453703
Test error    : 4.303783893585205


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


## 2d Distributed representations

In [26]:
# train a model to analyze
model = train1([10])

X1 shape: torch.Size([100, 1])
Repetition  0: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006421
         Iterations: 2055
         Function evaluations: 2361
         Gradient evaluations: 2350
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  1: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006433
         Iterations: 3216
         Function evaluations: 3601
         Gradient evaluations: 3591
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  2: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006060
         Iterations: 3155
         Function evaluations: 3528
         Gradient evaluations: 3516
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  3: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006795
         Iterations: 1382
         Function evaluations: 1558
         Gradient evaluations: 1546
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  4: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006665
         Iterations: 665
         Function evaluations: 809
         Gradient evaluations: 801
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  5: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006329
         Iterations: 1429
         Function evaluations: 1643
         Gradient evaluations: 1631
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  6: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006657
         Iterations: 1853
         Function evaluations: 2153
         Gradient evaluations: 2141
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  7: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006597
         Iterations: 1848
         Function evaluations: 2086
         Gradient evaluations: 2075
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  8: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.006223
         Iterations: 2293
         Function evaluations: 2681
         Gradient evaluations: 2669
best_cost=0.006
X1 shape: torch.Size([100, 1])
Repetition  9:          Current function value: 0.037453
         Iterations: 641
         Function evaluations: 788
         Gradient evaluations: 778
best_cost=0.006


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


In [27]:
# plot the fit as well as the outputs of each neuron in the hidden
# layer (scale for the latter is shown on right y-axis)
nextplot()
plot1(X1, y1, label="train")
plot1(X1test, y1test, label="test")
plot1fit(torch.linspace(0, 13, 500).unsqueeze(1), model, hidden=True, scale=False)

<IPython.core.display.Javascript object>

In [28]:
# plot the fit as well as the outputs of each neuron in the hidden layer, scaled
# by its weight for the output neuron (scale for the latter is shown on right
# y-axis)

nextplot()
plot1(X1, y1, label="train")
plot1(X1test, y1test, label="test")
plot1fit(torch.linspace(0, 13, 500).unsqueeze(1), model, hidden=True, scale=True)

<IPython.core.display.Javascript object>

## 2e Experiment with different optimizers (optional)

In [29]:
# PyTorch provides many gradient-based optimizers; see
# https://pytorch.org/docs/stable/optim.html. You can use a PyTorch optimizer
# as follows.
train_adam = lambda model, **kwargs: fnn_train(
    X1, y1, model, optimizer=torch.optim.Adam(model.parameters(), lr=0.01), **kwargs
)
model = train1([50], nreps=1, train=train_adam, max_epochs=5000, tol=1e-8, verbose=True)

X1 shape: torch.Size([100, 1])
Repetition  0: Epoch     0: cost=   2.496 
Epoch     1: cost=   1.822 
Epoch     2: cost=   1.293 
Epoch     3: cost=   0.907 
Epoch     4: cost=   0.661 
Epoch     5: cost=   0.539 
Epoch     6: cost=   0.516 
Epoch     7: cost=   0.560 
Epoch     8: cost=   0.634 
Epoch     9: cost=   0.706 
Epoch    10: cost=   0.752 
Epoch    11: cost=   0.764 
Epoch    12: cost=   0.742 
Epoch    13: cost=   0.695 
Epoch    14: cost=   0.635 
Epoch    15: cost=   0.572 
Epoch    16: cost=   0.516 
Epoch    17: cost=   0.474 
Epoch    18: cost=   0.449 
Epoch    19: cost=   0.440 
Epoch    20: cost=   0.445 
Epoch    21: cost=   0.459 
Epoch    22: cost=   0.476 
Epoch    23: cost=   0.492 
Epoch    24: cost=   0.502 
Epoch    25: cost=   0.505 
Epoch    26: cost=   0.500 
Epoch    27: cost=   0.489 
Epoch    28: cost=   0.475 
Epoch    29: cost=   0.461 
Epoch    30: cost=   0.448 
Epoch    31: cost=   0.439 
Epoch    32: cost=   0.434 
Epoch    33: cost=   0.434 
Ep

In [30]:
# Experiment with different number of layers and activation functions. Here is
# an example with three hidden layers (of sizes 4, 5, and 6) and ReLU activations.
#
# You can also plot the outputs of the hidden neurons in the first layer (using
# the same code above).
model = train1([4, 5, 6], nreps=50, phi=F.relu)
nextplot()
plot1(X1, y1, label="train")
plot1(X1test, y1test, label="test")
plot1fit(torch.linspace(0, 13, 500).unsqueeze(1), model)
print("Training error:", F.mse_loss(y1, model(X1)).item())
print("Test error    :", F.mse_loss(y1test, model(X1test)).item())

X1 shape: torch.Size([100, 1])
Repetition  0: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.356039
         Iterations: 50
         Function evaluations: 132
         Gradient evaluations: 127
best_cost=0.356
X1 shape: torch.Size([100, 1])
Repetition  1: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.435342
         Iterations: 27
         Function evaluations: 109
         Gradient evaluations: 104
best_cost=0.356
X1 shape: torch.Size([100, 1])
Repetition  2: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.028281
         Iterations: 89
         Function evaluations: 213
         Gradient evaluations: 208
best_cost=0.028
X1 shape: torch.Size([100, 1])
Repetition  3: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.280378
         Iterations: 60
         Function evaluations: 151
         Gradient evaluations: 144
best_cost=0.028
X1 shape: torch.Size([100, 1])
Repetition  4: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.356727
         Iterations: 75
         Function evaluations: 179
         Gradient evaluations: 175
best_cost=0.028
X1 shape: torch.Size([100, 1])
Repetition  5:          Current function value: 0.372702
         Iterations: 32
         Function evaluations: 141
         Gradient evaluations: 135
best_cost=0.028
X1 shape: torch.Size([100, 1])
Repetition  6: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.008260
         Iterations: 108
         Function evaluations: 206
         Gradient evaluations: 199
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  7: Optimization terminated successfully.
         Current function value: 0.356613
         Iterations: 70
         Function evaluations: 85
         Gradient evaluations: 85
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  8: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.356214
         Iterations: 55
         Function evaluations: 146
         Gradient evaluations: 141
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  9: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.355606
         Iterations: 88
         Function evaluations: 201
         Gradient evaluations: 194
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  10:          Current function value: 0.356431
         Iterations: 51
         Function evaluations: 159
         Gradient evaluations: 154
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  11: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.083823
         Iterations: 83
         Function evaluations: 226
         Gradient evaluations: 213
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  12: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.355588
         Iterations: 77
         Function evaluations: 156
         Gradient evaluations: 152
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  13:          Current function value: 0.356019
         Iterations: 96
         Function evaluations: 214
         Gradient evaluations: 209
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  14: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.101532
         Iterations: 48
         Function evaluations: 136
         Gradient evaluations: 125
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  15: Optimization terminated successfully.
         Current function value: 0.506238
         Iterations: 2
         Function evaluations: 3
         Gradient evaluations: 3
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  16:          Current function value: 0.434990
         Iterations: 42
         Function evaluations: 126
         Gradient evaluations: 123
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  17: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.356286
         Iterations: 46
         Function evaluations: 142
         Gradient evaluations: 135
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  18:          Current function value: 0.357715
         Iterations: 35
         Function evaluations: 143
         Gradient evaluations: 137
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  19: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.011217
         Iterations: 116
         Function evaluations: 215
         Gradient evaluations: 209
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  20:          Current function value: 0.082261
         Iterations: 60
         Function evaluations: 151
         Gradient evaluations: 140
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  21: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.355553
         Iterations: 140
         Function evaluations: 278
         Gradient evaluations: 273
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  22:          Current function value: 0.355777
         Iterations: 84
         Function evaluations: 194
         Gradient evaluations: 187
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  23: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.083823
         Iterations: 98
         Function evaluations: 218
         Gradient evaluations: 208
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  24: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.085119
         Iterations: 105
         Function evaluations: 247
         Gradient evaluations: 241
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  25: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.083775
         Iterations: 119
         Function evaluations: 215
         Gradient evaluations: 208
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  26:          Current function value: 0.356491
         Iterations: 26
         Function evaluations: 118
         Gradient evaluations: 113
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  27: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.084815
         Iterations: 126
         Function evaluations: 216
         Gradient evaluations: 208
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  28: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.269289
         Iterations: 113
         Function evaluations: 231
         Gradient evaluations: 225
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  29:          Current function value: 0.356274
         Iterations: 47
         Function evaluations: 134
         Gradient evaluations: 128
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  30: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.084264
         Iterations: 91
         Function evaluations: 268
         Gradient evaluations: 255
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  31: Optimization terminated successfully.
         Current function value: 0.506238
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  32:          Current function value: 0.086458
         Iterations: 61
         Function evaluations: 148
         Gradient evaluations: 141
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  33: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.355616
         Iterations: 58
         Function evaluations: 161
         Gradient evaluations: 158
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  34: Optimization terminated successfully.
         Current function value: 0.506238
         Iterations: 10
         Function evaluations: 12
         Gradient evaluations: 12
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  35: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.268271
         Iterations: 126
         Function evaluations: 252
         Gradient evaluations: 248
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  36:          Current function value: 0.505581
         Iterations: 10
         Function evaluations: 90
         Gradient evaluations: 86
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  37:          Current function value: 0.357703
         Iterations: 47
         Function evaluations: 141
         Gradient evaluations: 135
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  38: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.356287
         Iterations: 43
         Function evaluations: 160
         Gradient evaluations: 155
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  39: Optimization terminated successfully.
         Current function value: 0.262251
         Iterations: 96
         Function evaluations: 121
         Gradient evaluations: 121
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  40: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.355400
         Iterations: 137
         Function evaluations: 275
         Gradient evaluations: 268
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  41: Optimization terminated successfully.
         Current function value: 0.506238
         Iterations: 7
         Function evaluations: 12
         Gradient evaluations: 12
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  42: Optimization terminated successfully.
         Current function value: 0.506238
         Iterations: 4
         Function evaluations: 6
         Gradient evaluations: 6
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  43:          Current function value: 0.356313
         Iterations: 46
         Function evaluations: 154
         Gradient evaluations: 148
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  44: Optimization terminated successfully.
         Current function value: 0.372736
         Iterations: 19
         Function evaluations: 

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)
  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 0.012137
         Iterations: 99
         Function evaluations: 198
         Gradient evaluations: 191
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  48:          Current function value: 0.356743
         Iterations: 88
         Function evaluations: 188
         Gradient evaluations: 177
best_cost=0.008
X1 shape: torch.Size([100, 1])
Repetition  49:          Current function value: 0.435608
         Iterations: 20
         Function evaluations: 99
         Gradient evaluations: 92
best_cost=0.008


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


<IPython.core.display.Javascript object>

Training error: 0.008260061964392662
Test error    : 0.01167991105467081


# 3 Backpropagation

In [31]:
# Let's fit the model with one hidden layer consisting of 50 units.
model = train1([50], nreps=1)
print("Training error:", F.mse_loss(y1, model(X1)).item())
print("Test error    :", F.mse_loss(y1test, model(X1test)).item())

# Extract parameters
pars = dict(model.named_parameters())
W1 = pars["0_weight"].data  # 1x50
b1 = pars["0_bias"].data  # 50
W2 = pars["1_weight"].data  # 50x1
b2 = pars["1_bias"].data  # 1

X1 shape: torch.Size([100, 1])
Repetition  0:          Current function value: 0.003187
         Iterations: 6962
         Function evaluations: 7514
         Gradient evaluations: 7502
best_cost=0.003
Training error: 0.003186655230820179
Test error    : 2.152829170227051


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


## 3a Forward pass

In [32]:
# Compute results of forward pass on an example x (i.e., z1, z2, z3, z4, yhat, l) using Pytorch
x = X1test[1, :]
y = y1test[1, :]
print(f"x={x}, y={y}, yhat={model(x).detach()}, l={torch.nn.MSELoss()(y,model(x))}")

x=tensor([0.1030]), y=tensor([0.2253]), yhat=tensor([[7.2772]]), l=49.730159759521484


  return F.mse_loss(input, target, reduction=self.reduction)


In [33]:
# Now do this by hand (including all intermediate values). You should get the same
# results as above.
z1 = W1.t() * x
z2 = z1 + b1.unsqueeze(1)
z3 = 1/ (1 + torch.exp(-1 * z2))
z4 = torch.zeros(1, 1)
for i in range(W2.shape[0]):
    z4 += W2[i] * z3[i]
y_hat = z4 + b2
l = (y - y_hat)**2
print(f"x={x}, y={y}, yhat={y_hat}, l={l}")

x=tensor([0.1030]), y=tensor([0.2253]), yhat=tensor([[7.2772]]), l=tensor([[49.7303]])


## 3b Backward pass

In [34]:
# Compute results of backward pass on example output (i.e., delta_x, delta_W1, delta_z1,
# delta_b1, delta_z2, delta_z3, delta_W2, delta_z4, delta_b2, delta_yhat, delta_l, delta_y)

delta_l = 1
delta_y = (2 * (y - y_hat)).squeeze(1)
delta_yhat = -2 * (y - y_hat)
delta_b2 = (delta_yhat * 1).squeeze(1)
delta_z4 = delta_yhat * 1
delta_W2 = delta_z4 * z3
delta_z3 = delta_z4 * W2
delta_z2 = delta_z3 * z3 * (1 - z3)
delta_b1 = delta_z2.squeeze(1)
delta_z1 = delta_z2

delta_x = torch.zeros(1, 1)
for i in range(W1.shape[0]):
    delta_x += delta_z1[i] * W1[0][i]
delta_x = delta_x.squeeze(1)

delta_W1 = (delta_z1 * x).t()

In [35]:
# Use PyTorch's backprop
x.requires_grad = True
y.requires_grad = True
if x.grad is not None:
    x.grad.zero_()
if y.grad is not None:
    y.grad.zero_()
model.zero_grad()
t_yhat = model(x)
t_yhat.retain_grad()
t_l = torch.nn.MSELoss()(t_yhat, y)
t_l.backward()
t_delta_l = 1
t_delta_y = y.grad
t_delta_yhat = t_yhat.grad
t_delta_b2 = model.get_parameter("1_bias").grad
t_delta_W2 = model.get_parameter("1_weight").grad
t_delta_b1 = model.get_parameter("0_bias").grad
t_delta_W1 = model.get_parameter("0_weight").grad
t_delta_x = x.grad

  return F.mse_loss(input, target, reduction=self.reduction)


In [36]:
# Check if equal (show squared error)
for v in ["y", "yhat", "b2", "W2", "b1", "W1", "x"]:
    print(f'{v}, squared error={torch.sum((eval("t_delta_"+v)-eval("delta_"+v))**2)}')

y, squared error=3.637978807091713e-10
yhat, squared error=3.637978807091713e-10
b2, squared error=3.637978807091713e-10
W2, squared error=6.694076848390296e-09
b1, squared error=2.70754213715918e-07
W1, squared error=2.850990554748023e-09
x, squared error=3580073.25


In [37]:
# Check if equal (show actual values)
for v in ["l", "y", "yhat", "b2", "W2", "b1", "W1", "x"]:
    print(f'{v}, pytorch={eval("t_delta_"+v)}, you={eval("delta_"+v)}')

l, pytorch=1, you=1
y, pytorch=tensor([-14.1039]), you=tensor([-14.1039])
yhat, pytorch=tensor([[14.1039]]), you=tensor([[14.1039]])
b2, pytorch=tensor([14.1039]), you=tensor([14.1039])
W2, pytorch=tensor([[1.4061e+01],
        [6.7270e-02],
        [1.4104e+01],
        [8.1541e-05],
        [1.4313e+00],
        [2.9957e-03],
        [1.4104e+01],
        [1.2231e+01],
        [5.8818e-06],
        [1.3694e-02],
        [1.4100e+01],
        [5.4270e+00],
        [8.7531e-20],
        [2.8354e-26],
        [1.0427e+00],
        [1.4104e+01],
        [1.4104e+01],
        [1.3482e+01],
        [1.4104e+01],
        [2.9687e-03],
        [2.5167e-05],
        [1.4093e+01],
        [1.2913e-05],
        [7.8722e-05],
        [2.0585e-03],
        [2.9726e+00],
        [7.9180e+00],
        [9.7693e-04],
        [2.4010e-02],
        [6.3007e-06],
        [7.1367e-03],
        [1.4104e+01],
        [1.4103e+01],
        [1.5088e-05],
        [3.3499e-13],
        [1.3002e-04],
        [1