We start with our usual imports and figure adjustments.

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split, DataLoader, TensorDataset
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import math
from timeit import default_timer as timer
from functools import partial

plt.rcParams['figure.figsize'] = (12.0, 8.0)
plt.rcParams['font.size'] = 16

In [2]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


Then we load CIFAR10, and we create the usual `Dataset`s and `DataLoader`s.

In [3]:
tsfms = transforms.Compose([transforms.ToTensor(), transforms.Lambda(lambda z: z.reshape(-1))])
train_ds = torchvision.datasets.CIFAR10(root="/data/", train=True, transform=tsfms, download=True)
test_ds = torchvision.datasets.CIFAR10(root="/data/", train=False, transform=tsfms)

classes = train_ds.classes
n_classes = len(classes)
n_features = len(train_ds[0][0])

Files already downloaded and verified


In [4]:
splitted_datasets = torch.utils.data.random_split(train_ds, [45000, 5000])
actual_train_subds = splitted_datasets[0]
valid_subds = splitted_datasets[1]

In [5]:
small_actual_train_subds = torch.utils.data.Subset(actual_train_subds, range(500))
small_valid_subds = torch.utils.data.Subset(valid_subds, range(100))
small_test_subds = torch.utils.data.Subset(test_ds, range(100))

In [6]:
batch_size = 256
small_actual_train_dl = torch.utils.data.DataLoader(small_actual_train_subds, batch_size=batch_size, shuffle=True)
small_valid_dl = torch.utils.data.DataLoader(small_valid_subds, batch_size=batch_size)
small_test_dl = torch.utils.data.DataLoader(small_test_subds, batch_size=batch_size)
actual_train_dl = torch.utils.data.DataLoader(actual_train_subds, batch_size=batch_size, shuffle=True)
valid_dl = torch.utils.data.DataLoader(valid_subds, batch_size=batch_size)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=batch_size)

We will then create our first Neural Network. One way to create Neural Networks in PyTorch is by subclassing `torch.nn.Module`. In this way, our model will inherit a lot of ready-to-use convinience functions (access to parameters for optimization, get/set parameters, ...).

We only need to create the layers we will use in the `__init__` function and define the `forward` function that specifies how to apply them.

Layers are in turn subclasses of `torch.nn.Module`. In our example, we will use only linear layers, i.e. Fully Connected (FC) layers. Our network will have at least two FC layers: `self.first`, mapping the flattened input image into the (first) hidden representation, and `self.last`, mapping the (last) hidden representation into the scores for the classes.

To play with varying depths and activation functions, we will have two additional parameters:


*   `n_additional_hidden_layers`, specifies how many hidden layers our network has, beside `self.first`
*   `use_relu`, if `False`, activations will be sigmoid functions, ReLUs otherwise

Note that to store a variable number of layers in our network, we do not use plain PyTorch lists, but `torch.nn.ModuleList`. This is important to make PyTorch aware of the layers in the list, e.g. to set/get their parameters when calling the methods of the base `Module` class.

In [8]:
class TwoPlusLayersNetwork(torch.nn.Module):
  def __init__(self, n_features, hidden_width, n_classes, n_additional_hidden_layers=0, use_relu=True):
    super(TwoPlusLayersNetwork, self).__init__()
    self.first = torch.nn.Linear(n_features, hidden_width)
    self.activation = torch.relu if use_relu else torch.sigmoid
    self.last = torch.nn.Linear(hidden_width, n_classes)

    self.additional_hidden_layers = torch.nn.ModuleList(
        [torch.nn.Linear(hidden_width, hidden_width) for i in range(n_additional_hidden_layers)])

    #initialization
    for m in self.modules():
      if isinstance(m, torch.nn.Linear):
        if use_relu:
          torch.nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu")
        else:
          torch.nn.init.xavier_uniform_(m.weight)

  def forward(self, x):
    x = self.first.forward(x)
    x = self.activation(x)
    for layer in self.additional_hidden_layers:
      x = layer.forward(x)
      x = self.activation(x)
    x = self.last.forward(x)
    return x

We then define the usual function to train a model.

Note that we use
*   `nn.parameters()` to get a list of trainable parameters
*   `nn.state_dict()` to get the model parameters when we achieve better validation accuracy and save them in the `best_params` variable.

These are two of the convinience functions our network inherits from `torch.nn.Module`.

In [10]:
def ncorrect(scores, y):
  y_hat = torch.argmax(scores, 1)
  return (y_hat==y).sum()

def accuracy(scores, y):
  correct = ncorrect(scores, y)
  return correct.true_divide(y.shape[0])

def train_loop(n_features, hidden_width, n_classes, n_additional_hidden_layers, use_relu,
               train_dl, epochs, partial_opt,
               valid_dl=None, verbose=False):
  best_valid_acc = 0
  best_params = []
  best_epoch = -1

  nn = TwoPlusLayersNetwork(n_features, hidden_width, n_classes, n_additional_hidden_layers, use_relu).to(device)

  # We "complete" the partial function by calling it and specifying the missing parameters
  opt = partial_opt(nn.parameters())

  for e in range(epochs):
    #train
    train_loss = 0
    train_samples = 0
    train_acc = 0
    for train_data in train_dl:
      inputs, labels = train_data[0].to(device), train_data[1].to(device)

      scores = nn.forward(inputs)
      loss = F.cross_entropy(scores, labels)
      train_loss += loss.item() * inputs.shape[0]
      train_samples += inputs.shape[0]
      train_acc += ncorrect(scores, labels).item()
      loss.backward()

      opt.step()
      opt.zero_grad()

    train_acc /= train_samples
    train_loss /= train_samples

    # validation
    with torch.no_grad():
      valid_loss = 0
      valid_samples = 0
      valid_acc = 0
      if valid_dl is not None:
        for valid_data in valid_dl:

          inputs, labels = valid_data[0].to(device), valid_data[1].to(device)

          valid_scores = nn.forward(inputs)
          valid_loss += F.cross_entropy(valid_scores, labels).item() * inputs.shape[0]
          valid_samples += inputs.shape[0]
          valid_acc += ncorrect(valid_scores, labels).item()
        valid_acc /= valid_samples
        valid_loss /= valid_samples

      if valid_dl is None or valid_acc > best_valid_acc:
        best_valid_acc = valid_acc if valid_dl is not None else 0
        best_params = nn.state_dict()
        best_epoch = e


    if verbose and e % 10 == 0:
      print(f"Epoch {e}: train loss {train_loss:.3f} - train acc {train_acc:.3f}" + ("" if valid_dl is None else f" - valid loss {valid_loss:.3f} - valid acc {valid_acc:.3f}"))

  if verbose and valid_dl is not None:
    print(f"Best epoch {best_epoch}, best acc {best_valid_acc}")

  return best_valid_acc, best_params, best_epoch

The two functions are similar, but serve different purposes.

`parameters()` returns a list (actually, a generator) of trainable tensors, which is what `Optimizer`s require: they do not need to know which tensor correspond to which layer, since they are all treated the same when performing SGD.

In [11]:
hidden_width = 50
nn = TwoPlusLayersNetwork(n_features, hidden_width, n_classes, n_additional_hidden_layers=1, use_relu=True)
for p in nn.parameters():
  print(type(p), p.shape)

<class 'torch.nn.parameter.Parameter'> torch.Size([50, 3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([50])
<class 'torch.nn.parameter.Parameter'> torch.Size([10, 50])
<class 'torch.nn.parameter.Parameter'> torch.Size([10])
<class 'torch.nn.parameter.Parameter'> torch.Size([50, 50])
<class 'torch.nn.parameter.Parameter'> torch.Size([50])


`state_dict()` instead is an (Ordered) Dictionary, which associates each variable storing a layer in our classes with its parameters. It is therefore useful to obtain a snapshot of the parameters of our model that can later be restored by calling `load_state_dict()`.

In [12]:
type(nn.state_dict())

collections.OrderedDict

In [13]:
nn.state_dict().keys()

odict_keys(['first.weight', 'first.bias', 'last.weight', 'last.bias', 'additional_hidden_layers.0.weight', 'additional_hidden_layers.0.bias'])

In [14]:
nn.state_dict()["first.bias"]

tensor([ 0.0180,  0.0052,  0.0139, -0.0105, -0.0108,  0.0158, -0.0097, -0.0009,
        -0.0046, -0.0017, -0.0159, -0.0073, -0.0169, -0.0115,  0.0169, -0.0065,
         0.0158, -0.0052,  0.0081, -0.0075, -0.0062,  0.0034,  0.0023,  0.0174,
         0.0093, -0.0079, -0.0119,  0.0161,  0.0129, -0.0045,  0.0158,  0.0039,
         0.0175,  0.0127,  0.0158,  0.0157,  0.0009,  0.0030, -0.0024, -0.0043,
         0.0021,  0.0159,  0.0047, -0.0154,  0.0025, -0.0077, -0.0096,  0.0031,
         0.0121,  0.0042])

Let's verify that the use of the sigmoid as activation function makes it more difficult to train "deep" networks, i.e. with 10 hidden layers.

In [15]:
start = timer()
lr=1e-3
hidden_width = 500
n_additional_hidden_layers = 100
use_relu = False
p_opt = partial(torch.optim.Adam, lr=lr)

train_loop(n_features, hidden_width, n_classes, n_additional_hidden_layers, use_relu,
           train_dl=small_actual_train_dl, epochs=100, partial_opt=p_opt,
           valid_dl=small_valid_dl, verbose=True)
end = timer()
print(f"Elapsed time (s): {end-start}")

Epoch 0: train loss 2.398 - train acc 0.110 - valid loss 2.416 - valid acc 0.100
Epoch 10: train loss 2.303 - train acc 0.116 - valid loss 2.305 - valid acc 0.090
Epoch 20: train loss 2.298 - train acc 0.116 - valid loss 2.311 - valid acc 0.090
Epoch 30: train loss 2.299 - train acc 0.110 - valid loss 2.308 - valid acc 0.090
Epoch 40: train loss 2.300 - train acc 0.106 - valid loss 2.309 - valid acc 0.090
Epoch 50: train loss 2.301 - train acc 0.100 - valid loss 2.305 - valid acc 0.100
Epoch 60: train loss 2.301 - train acc 0.116 - valid loss 2.304 - valid acc 0.100
Epoch 70: train loss 2.299 - train acc 0.116 - valid loss 2.307 - valid acc 0.090
Epoch 80: train loss 2.299 - train acc 0.098 - valid loss 2.307 - valid acc 0.100
Epoch 90: train loss 2.299 - train acc 0.116 - valid loss 2.306 - valid acc 0.090
Epoch 100: train loss 2.300 - train acc 0.116 - valid loss 2.309 - valid acc 0.090
Epoch 110: train loss 2.298 - train acc 0.116 - valid loss 2.304 - valid acc 0.090
Epoch 120: trai

Let's compare this with ReLU.

In [18]:
start = timer()
lr=1e-3
hidden_width = 50
n_additional_hidden_layers = 10
use_relu = True
p_opt = partial(torch.optim.Adam, lr=lr)

train_loop(n_features, hidden_width, n_classes, n_additional_hidden_layers, use_relu,
           train_dl=small_actual_train_dl, epochs=100, partial_opt=p_opt,
           valid_dl=small_valid_dl, verbose=True)
end = timer()
print(f"Elapsed time (s): {end-start}")

Epoch 0: train loss 2.362 - train acc 0.084 - valid loss 2.339 - valid acc 0.090
Epoch 10: train loss 2.191 - train acc 0.186 - valid loss 2.250 - valid acc 0.150
Epoch 20: train loss 1.993 - train acc 0.250 - valid loss 2.225 - valid acc 0.180
Epoch 30: train loss 1.781 - train acc 0.354 - valid loss 2.254 - valid acc 0.240
Epoch 40: train loss 1.534 - train acc 0.440 - valid loss 2.367 - valid acc 0.230
Epoch 50: train loss 1.402 - train acc 0.496 - valid loss 2.585 - valid acc 0.190
Epoch 60: train loss 1.284 - train acc 0.538 - valid loss 2.579 - valid acc 0.230
Epoch 70: train loss 1.110 - train acc 0.578 - valid loss 2.952 - valid acc 0.160
Epoch 80: train loss 0.975 - train acc 0.640 - valid loss 3.158 - valid acc 0.160
Epoch 90: train loss 0.883 - train acc 0.712 - valid loss 3.412 - valid acc 0.190
Best epoch 52, best acc 0.25
Elapsed time (s): 13.78893200001039


You can see that training of modestly deep networks (for today standards) with the sigmoid function is stuck, while the network using ReLUs increases its performance while training.

Let's then define a function to perform hyper-parameter tuning. Since this is a small network we can afford to validate also hyper-parameters defining the architecture, like the `hidden_width` of the layers, or the number of hidden layers. We will also run a loop over optimizers (wrapping learning rates), as usual.

In [19]:
def hyperparameter_tuning(n_features, n_classes, train_dl,
                          valid_dl, partial_opts, hidden_widths,
                          n_additional_hidden_layers_list, epochs=5):

  best_valid_acc = 0
  best_params = []
  best_hyper_params = []

  for hidden_width in hidden_widths:
    for n_additional_hidden_layers in n_additional_hidden_layers_list:
      for partial_opt in partial_opts:
        run_valid_acc, params, epoch = train_loop(n_features, hidden_width, n_classes, n_additional_hidden_layers, use_relu=True,
                  train_dl=train_dl, epochs=epochs, partial_opt=partial_opt, valid_dl=valid_dl, verbose=False)

        if run_valid_acc > best_valid_acc:
          best_valid_acc = run_valid_acc
          best_params = params
          best_hyper_params = [partial_opt, epoch, hidden_width, n_additional_hidden_layers]
          print(f"Improved result: acc {best_valid_acc:.3f}, best_hyper_params {best_hyper_params}")
  return best_hyper_params, best_params

Then, the usual function to define which combination of optimizers and learning rates we want to validate.

In [20]:
def build_optlist():
  lrs = [1e-3, 4e-3]
  betas = [0.9]
  opts = []
  #opts += [partial(torch.optim.SGD, lr=lr) for lr in lrs]
  opts += [partial(torch.optim.SGD, lr=lr, momentum=beta, nesterov=True) for lr in lrs for beta in betas]
  opts += [partial(torch.optim.Adam, lr=lr) for lr in lrs]
  #opts += [partial(torch.optim.RMSprop, lr=lr) for lr in lrs]
  return opts

build_optlist()

[functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.001, momentum=0.9, nesterov=True),
 functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.004, momentum=0.9, nesterov=True),
 functools.partial(<class 'torch.optim.adam.Adam'>, lr=0.001),
 functools.partial(<class 'torch.optim.adam.Adam'>, lr=0.004)]

Let' check everything works on the small `Dataset`s.

In [21]:
start=timer()
opts = build_optlist()
hidden_widths = [64, 128]
n_hidden_layers_list = [0,1]
best_hyper_params, best_params = hyperparameter_tuning(n_features, n_classes, small_actual_train_dl,
                  small_valid_dl, opts, hidden_widths, n_hidden_layers_list, epochs=100)
end=timer()
print(f"Elapsed time (s): {end-start:.3f}")
print(f"best optimizer {best_hyper_params[0]}, best epoch {best_hyper_params[1]},"
      f"best hidden_width {best_hyper_params[2]}, best n_additional_hidden {best_hyper_params[3]}")

Improved result: acc 0.260, best_hyper_params [functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.001, momentum=0.9, nesterov=True), 88, 64, 0]
Improved result: acc 0.290, best_hyper_params [functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.004, momentum=0.9, nesterov=True), 65, 64, 0]
Improved result: acc 0.310, best_hyper_params [functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.001, momentum=0.9, nesterov=True), 55, 128, 0]
Improved result: acc 0.320, best_hyper_params [functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.004, momentum=0.9, nesterov=True), 80, 128, 0]
Elapsed time (s): 193.476
best optimizer functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.004, momentum=0.9, nesterov=True), best epoch 80,best hidden_width 128, best n_additional_hidden 0


And then, let's validate on the real `Dataset`s.

In [None]:
start=timer()
opts = build_optlist()
hidden_widths = [128]
n_hidden_layers_list = [0,1]
best_hyper_params, best_params = hyperparameter_tuning(n_features, n_classes,
      actual_train_dl, valid_dl, opts, hidden_widths, n_hidden_layers_list, epochs=40)
end=timer()
print(f"Elapsed time (s): {end-start:.3f}")
print(f"best optimizer {best_hyper_params[0]}, best epoch {best_hyper_params[1]}, "
      f"best hidden_width {best_hyper_params[2]}, best n_additional_hidden {best_hyper_params[3]}")

Improved result: acc 0.472, best_hyper_params [functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.001, momentum=0.9, nesterov=True), 39, 128, 0]
Improved result: acc 0.515, best_hyper_params [functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.004, momentum=0.9, nesterov=True), 34, 128, 0]
Improved result: acc 0.522, best_hyper_params [functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.004, momentum=0.9, nesterov=True), 35, 128, 1]
Elapsed time (s): 2962.552
best optimizer functools.partial(<class 'torch.optim.sgd.SGD'>, lr=0.004, momentum=0.9, nesterov=True), best epoch 35, best hidden_width 128, best n_additional_hidden 1


Let's train on the full training set.

In [None]:
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [None]:
start = timer()
best_opt = best_hyper_params[0]
best_epochs = best_hyper_params[1]
_, best_params, best_epoch = train_loop(n_features=n_features,
        hidden_width=best_hyper_params[2], n_classes=n_classes,
        n_additional_hidden_layers=best_hyper_params[3], use_relu=True,
        train_dl=train_dl, epochs=best_epochs, partial_opt=best_opt, verbose=True)
end = timer()
print(f"Elapsed time (s): {end-start}")

Epoch 0: train loss 1.942 - train acc 0.311
Epoch 10: train loss 1.412 - train acc 0.503
Epoch 20: train loss 1.281 - train acc 0.548
Epoch 30: train loss 1.181 - train acc 0.583
Elapsed time (s): 311.61791934300027


And test on the full test set. To restore the parameters computed in training, we use the `load_state_dict` function.

In [None]:
nn = TwoPlusLayersNetwork(n_features, best_hyper_params[2], n_classes, best_hyper_params[3])
nn.load_state_dict(best_params)

start = timer()
test_samples = 0
test_acc = 0
for test_data in test_dl:
  test_scores = nn.forward(test_data[0])
  test_samples += test_data[0].shape[0]
  test_acc += ncorrect(test_scores, test_data[1]).item()
test_acc /= test_samples
end = timer()
print(f"Accuracy on full test set {test_acc:.3f}, elapsed time (s): {end-start:.3f}")


Accuracy on full test set 0.504, elapsed time (s): 1.580
