# AML 2022 Tutorial: Introduction to PyTorch

In [3]:
import torch
import numpy as np
import scipy
import matplotlib.pyplot as plt

## Tensors and Operations on Tensors

### Tensors

They are similar to numpy arrays with only one data type.

In [4]:
x = torch.tensor([[1,  2, 3], 
                  [4., 5, 6]])
x

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [5]:
x.shape

torch.Size([2, 3])

In [6]:
x.dtype

torch.float32

**Exercise 1:** Create a tensor `A` of shape `(4, 3)`, whose elements are drawn from the standard normal distribution.

In [7]:
# Answer to Exercise 1
A = torch.randn(4,3)

### Tensor operations

In [8]:
# Element-wise operations
# e.g., add 1 to all elements of x
x + 1

tensor([[2., 3., 4.],
        [5., 6., 7.]])

In [9]:
# Transpose of a matrix
x.T

tensor([[1., 4.],
        [2., 5.],
        [3., 6.]])

In [10]:
# Define another tensor
y = torch.tensor([[3., 4, 2], 
                  [1 , 3, 6]])

In [11]:
# Element-wise operations
# Must ensure x and y have the same shape
assert x.shape == y.shape
x / y

tensor([[0.3333, 0.5000, 1.5000],
        [4.0000, 1.6667, 1.0000]])

In [12]:
# Dot product/matrix multiplication
assert x.shape[1] == y.T.shape[0]
x @ y.T

tensor([[17., 25.],
        [44., 55.]])

**Exercise 2:** Create two tensors `a` and `b` each of shape `(3, 4, 2)` and tensor `c` with shape `(3, 2, 5)`, all of which are random numbers drawn from the standard normal distribution. Then perform the following:
1. Element-wise multiply `a` and `b` together. You should have a tensor of shape `(3, 4, 2)`.
2. Multiply the resulting matrix with `c`, keeping the first dimension unchanged. You should have a tensor of shape `(3, 4, 5)`.

In [13]:
# Answer to Exercise 2
a = torch.randn(3, 4, 2)
b = torch.randn(3, 4, 2)
c = torch.randn(3, 2, 5)
ab = torch.mul(a, b)
abc = torch.bmm(ab, c)
print(abc.shape)


torch.Size([3, 4, 5])


## Automatic differentiation

In [14]:
x = torch.tensor([[1., 2., 3.]], requires_grad=True)
x.grad is None

True

In [15]:
y = torch.sum(x ** 2)
y.backward()
x.grad

tensor([[2., 4., 6.]])

In [16]:
y2 = torch.sum(x ** 3)
y2.backward()
x.grad

tensor([[ 5., 16., 33.]])

In [17]:
# Equivalent way, although less used
from torch.autograd import grad
x = torch.tensor([[1., 2., 3.]], requires_grad=True)
y = torch.sum(x ** 2)
g = grad(y, x)
g

(tensor([[2., 4., 6.]]),)

Torch can also take the gradient with respect to more than one variable. In the below example, we have a function `z = f(x, y)` and can find the gradient of `z` with respect to `x` and to `y`.

In [18]:
x = torch.tensor([[1., 2, 3], [4, 5, 6]],
                     requires_grad=True)
y = torch.tensor([[6., 5, 4], [3, 2, 1]],
                     requires_grad=True)
# Frobenius norm: sum of squares of a matrix's entries
z = torch.linalg.norm(x @ y.T, ord="fro")

z.backward()

In [19]:
x.grad

tensor([[2.3671, 1.9128, 1.4585],
        [6.2404, 5.0330, 3.8256]])

In [20]:
y.grad

tensor([[3.8256, 5.0330, 6.2404],
        [1.4585, 1.9128, 2.3671]])

**Exercise 3:** You're given the matrix `A` of shape `(3, 4)` and two vectors `x` and `y` of shapes `(3,)` and `(4,)`, respectively. Consider the function of `x` and `y`:
$$
z = f(x, y) = x^\top A y.
$$
What is $\frac{\partial z}{\partial x}$ and $\frac{\partial z}{\partial y}$? Use the code below to check if your solution is correct?

In [21]:
# Do not change this code
torch.random.seed = 100
A = torch.rand(3, 4)
x = torch.rand(3, requires_grad=True)
y = torch.rand(4, requires_grad=True)
z = x.T @ A @ y
z.backward()

In [22]:
# Answer to Exercise 3
x_grad = x.grad  # Replace with the actual gradient
assert torch.allclose(x_grad, x.grad)

In [23]:
# Answer to Exercise 3
y_grad = y.grad  # Replace with the actual gradient
assert torch.allclose(y_grad, y.grad)

## Dataset and DataLoader

In [24]:
# Download the dataset and transform the features
from torchvision.datasets import USPS
from torchvision.transforms import Compose, ToTensor, Normalize
transform  = Compose([ToTensor(), lambda x: x / 255])
train_data = USPS(root="./", train=True, download=True, 
                  transform=transform)
train_data.data.shape

(7291, 16, 16)

In [25]:
train_data.targets[:5]

[6, 5, 4, 7, 3]

In [26]:
# Create a data loader
from torch.utils.data import DataLoader
trainloader = DataLoader(train_data, shuffle=True, batch_size=32)

**Exercise 4:** Download the _test_ data using the same transformation and create a `testloader` with the same batch size.

In [27]:
# Answer to Exercise 4
test_data = USPS(root="./", train=False, download=True, 
                  transform=transform)
print(test_data.data.shape)
testloader = DataLoader(train_data, shuffle=True, batch_size=32)

(2007, 16, 16)


## Model architecture

In [28]:
from torch import nn
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(16 * 16, 512)
        self.fc2 = nn.Linear(512, 1024)
        self.fc3 = nn.Linear(1024, 10)
    def forward(self, x):
        x = self.flatten(x)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
model = Net()
print(model)

Net(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=256, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=10, bias=True)
)


In [29]:
# Equivalent way, only for sequential architectures
from torch import nn
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(16 * 16, 512),
    nn.ReLU(),
    nn.Linear(512, 1024),
    nn.ReLU(),
    nn.Linear(1024, 10)
)
print(model)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=256, out_features=512, bias=True)
  (2): ReLU()
  (3): Linear(in_features=512, out_features=1024, bias=True)
  (4): ReLU()
  (5): Linear(in_features=1024, out_features=10, bias=True)
)


**Exercise 5:** Create a fully connected network with the layers in the following order:
1. Flatten the an image
2. Linear layer from 256 to 1,024
3. Hyperbolic tangent activation layer
4. Linear layer from 1,024 to 64
5. ReLU activation layer
6. Linear layer from 64 to 10

In [30]:
# Answer to Exercise 5
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(16 * 16, 1024),
    nn.Tanh(),
    nn.Linear(1024, 64),
    nn.ReLU(),
    nn.Linear(64, 10)
)

## Loss function and optimizer

In [31]:
# Loss function
from torch import nn
loss_fn = nn.CrossEntropyLoss()
outputs = torch.tensor([[1., 2, 3], [2, 2, 4]])
targets = torch.tensor([2, 1])
loss_fn(outputs, targets)

tensor(1.3236)

**Exercise 6:** Given two vectors `outputs` and `targets` like below, what is the mean squared error loss? Then find that loss function on the PyTorch documentation to verify if you're correct.

In [32]:
# Answer to Exercise 6
outputs = torch.tensor([1., 2, 3])
targets = torch.tensor([2 , 4, 5])
loss_fn = nn.MSELoss()  # Replace with appropriate loss function 
loss_fn(outputs, targets)

tensor(3.)

In [33]:
# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

## Training loop and evaluation

In [34]:
from tqdm import tqdm
# Train a model using the training set
def train(model, trainloader, loss_fn, optimizer, epochs=100,
          device="cpu"):
    # Move model to device
    model.to(device)
    # Turn on training mode
    model.train()

    for i in tqdm(range(epochs)):
        # One epoch = one full pass over data
        for x, y in trainloader:
            # Move data to device
            x, y = x.to(device), y.to(device)
            # Predict the classes
            outputs = model(x)
            # Compare against the ground truth
            loss = loss_fn(outputs, y)
            # Find the gradients
            optimizer.zero_grad()
            loss.backward()
            # Perform gradient descent step
            optimizer.step()

In [35]:
# Evaluate a model using the test set
def test(model, testloader, device="cpu"):
    # Move model to device
    model.to(device)
    # Turn on evaluation mode
    model.eval()
    # Disable gradient calculation
    with torch.no_grad():
        correct, total = 0, 0
        for x, y in testloader:
            # Move data to device
            x, y = x.to(device), y.to(device)
            # Make predictions
            preds = model(x)
            preds = preds.argmax(1)
            # Count the correct predictions
            correct += torch.sum(preds == y).item()
            total += len(y)
    accuracy = correct / total
    return accuracy

## Putting everything together

**Exercise 7**: Use the model we defined in Exercise 5 and train it using SGD for 50 epochs. After that, what is the accuracy of our classifier on the test set?

In [36]:
# Answer to Exercise 7
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(16 * 16, 1024),
    nn.Tanh(),
    nn.Linear(1024, 64),
    nn.ReLU(),
    nn.Linear(64, 10)
)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train(model, trainloader, loss_fn, optimizer, epochs=20)

100%|██████████| 20/20 [00:45<00:00,  2.28s/it]


In [37]:
test(model, testloader)

0.9325195446440817

## Using a GPU for faster computation

For environments that support GPU acceleration, you can change the device on which you store tensors.

In [38]:
device = "cuda:0"
x = torch.randn(4, 3, device=device)
x

AssertionError: Torch not compiled with CUDA enabled

Operations on tensors must be done on the same device. For example, you cannot add a tensor on a CPU with another tensor on a GPU.

In [None]:
x = torch.randn(4, 3, device="cuda:0")
y = torch.randn(4, 3, device="cpu")
x + y

RuntimeError: ignored

In [None]:
y = y.to("cuda:0")
x + y

tensor([[ 1.8644, -0.9710,  1.1157],
        [ 1.5925, -2.5518, -0.9779],
        [ 2.1114, -0.2590, -1.1856],
        [-0.5814, -0.3382, -1.0386]], device='cuda:0')

You can also port an model to a GPU easily. This basically places all trainable parameters to the GPU you choose.

In [None]:
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(256, 1024),
    nn.Tanh(),
    nn.Linear(1024, 64),
    nn.ReLU(),
    nn.Linear(64, 10)
)
model.to("cuda:0")
print(model)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=256, out_features=1024, bias=True)
  (2): Tanh()
  (3): Linear(in_features=1024, out_features=64, bias=True)
  (4): ReLU()
  (5): Linear(in_features=64, out_features=10, bias=True)
)


**Exercise 8**: Go back to Exercise 7. Identify what else needs to be transferred to a GPU to train the model. Then repeat the training process and observe how long training takes.

In [None]:
# Answer to Exercise 8
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(16 * 16, 1024),
    nn.Tanh(),
    nn.Linear(1024, 64),
    nn.ReLU(),
    nn.Linear(64, 10)
)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train(model, trainloader, loss_fn, optimizer, epochs=20,
      device="cuda:0")

100%|██████████| 20/20 [00:28<00:00,  1.40s/it]


In [None]:
test(model, testloader, device="cuda:0")

0.9374571389384172

## Avoiding overfitting: regularization

We will perform 2 ways to implement regularization: random dropout and $\ell_2$ regularization.

### Dropout

This will randomly zero out some neurons in a layer, and rescale the rest of the neurons.

In [None]:
# First, redefine the model
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(256, 1024),
    nn.Tanh(),
    nn.Linear(1024, 64),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(64, 10)
)
# Then, define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train(model, trainloader, loss_fn, optimizer, epochs=20, device="cuda:0")
test(model, testloader, device="cuda:0")

100%|██████████| 20/20 [00:29<00:00,  1.45s/it]


0.9465093951446989

**Exercise 9**: Implement another dropout layer at the end of the first hidden layer, with the probability of 50%. Observe the test error.

In [None]:
# Answer to Exercise 9

### $\ell_2$ regularization

Assume that the loss function is $f(w)$, where $w$ is the weights of our model. This regularization adds an additional term $\frac{\lambda}{2} \lVert w \rVert^2$ to $f(w)$. The effect of this is that $w$ will be driven closer to $0$.

In [None]:
# First, redefine the model
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(256, 1024),
    nn.Tanh(),
    nn.Linear(1024, 64),
    nn.ReLU(),
    nn.Linear(64, 10)
)
# Then, define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, 
                             weight_decay=1e-5)
train(model, trainloader, loss_fn, optimizer, epochs=20, 
      device="cuda:0")
test(model, testloader, device="cuda:0")

100%|██████████| 20/20 [00:28<00:00,  1.45s/it]


0.9343025648059251

**Exercise 10**: Try a few different values of `weight_decay` and observe the test accuracy.

In [None]:
weight_decay_values = [1, 3e-1, 1e-1, 3e-2, 1e-2]