In [104]:
import torch

## Creating PyTorch tensors

In [105]:
# 1 Creates a zero-dimensional tensor (scalar) from a Python integer
tensor0d = torch.tensor(1)

# 2 Creates a one-dimensional tensor (vector) from a Python list
tensor1d = torch.tensor([1, 2, 3])

# 3 Creates a two-dimensional tensor from a nested Python list
# (named tensor2d so it follows the tensor0d / tensor1d / tensor3d pattern)
tensor2d = torch.tensor([[1, 2],
                         [3, 4]])

# 4 Creates a three-dimensional tensor from a nested Python list
tensor3d = torch.tensor([
    [[1, 2], [3, 4]],
    [[5, 6], [7, 8]]
])

## Tensor Data Types

In [106]:
print(tensor1d.dtype)

torch.int64


In [107]:
floatvec = torch.tensor([1.0, 2.0, 3.0])
print(floatvec.dtype)

torch.float32


In [108]:
floatvec = tensor1d.to(torch.float32)
print(floatvec.dtype)

torch.float32


## Common PyTorch tensor operations

In [109]:
tensor2d = torch.tensor([
    [1, 2, 3],
    [4, 5, 6]
])
print(tensor2d)

tensor([[1, 2, 3],
        [4, 5, 6]])


In [110]:
print(tensor2d.shape)

torch.Size([2, 3])


In [111]:
print(tensor2d.reshape(3, 2))

tensor([[1, 2],
        [3, 4],
        [5, 6]])


In [112]:
print(tensor2d.view(3, 2))

tensor([[1, 2],
        [3, 4],
        [5, 6]])


In [113]:
print(tensor2d.T)

tensor([[1, 4],
        [2, 5],
        [3, 6]])


In [114]:
print(tensor2d.matmul(tensor2d.T))

tensor([[14, 32],
        [32, 77]])


In [115]:
print(tensor2d @ tensor2d.T)

tensor([[14, 32],
        [32, 77]])


## A logistic regression forward pass

In [116]:
# 1 This import statement is a common convention in PyTorch to prevent long lines of code.
import torch.nn.functional as F

# One training example: input feature x1 and its true label y.
x1 = torch.tensor([1.1])
y = torch.tensor([1.0])

# Model parameters: a single weight and a bias unit.
w1 = torch.tensor([2.2])
b = torch.tensor([0.0])

# Net input: weighted feature plus bias.
z = w1 * x1 + b

# The sigmoid turns the net input into the activation `a`, which
# binary cross-entropy then compares with the true label.
a = torch.sigmoid(z)
loss = F.binary_cross_entropy(input=a, target=y)

## Computing gradients via autograd

In [117]:
import torch.nn.functional as F
from torch.autograd import grad

# Training example (feature and label); no gradients are tracked for these.
x1 = torch.tensor([1.1])
y = torch.tensor([1.0])

# Trainable parameters: requires_grad=True makes autograd record the
# operations applied to them.
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

# Forward pass: net input -> sigmoid activation -> binary cross-entropy loss.
z = w1 * x1 + b
a = torch.sigmoid(z)
loss = F.binary_cross_entropy(input=a, target=y)

# 1 By default, PyTorch destroys the computation graph after calculating
#   the gradients to free memory. Because the graph is reused shortly,
#   retain_graph=True keeps it in memory.
#   grad() returns a tuple holding one gradient tensor per requested input.
grad_L_w1 = grad(outputs=loss, inputs=w1, retain_graph=True)
grad_L_b = grad(outputs=loss, inputs=b, retain_graph=True)

In [118]:
print(grad_L_w1)
print(grad_L_b)

(tensor([-0.0898]),)
(tensor([-0.0817]),)


In [119]:
# For instance, we can call .backward on the loss, and 
#   PyTorch will compute the gradients of all the leaf nodes in the graph, 
#   which will be stored via the tensors’ .grad attributes:
loss.backward()
print(w1.grad)
print(b.grad)

tensor([-0.0898])
tensor([-0.0817])


## Implementing multilayer neural networks

When implementing a neural network in PyTorch, we can subclass the torch.nn.Module class to define our own custom network architecture. This Module base class provides a lot of functionality, making it easier to build and train models. For instance, it allows us to encapsulate layers and operations and keep track of the model’s parameters.

Within this subclass, we define the network layers in the __init__ constructor and specify how the layers interact in the forward method. The forward method describes how the input data passes through the network and comes together as a computation graph. In contrast, the backward method, which we typically do not need to implement ourselves, is used during training to compute gradients of the loss function with respect to the model parameters.

In [120]:
class NeuralNetwork(torch.nn.Module):
    """Fully connected network with two hidden layers of 30 and 20 units.

    The numbers of inputs and outputs are constructor arguments, so the same
    class can be reused for datasets with different numbers of features and
    classes.
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        nn = torch.nn
        hidden_1, hidden_2 = 30, 20

        # Each Linear layer takes (inputs, outputs); the output width of one
        # layer must equal the input width of the next. A ReLU nonlinearity
        # sits between consecutive Linear layers.
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, hidden_1),   # 1st hidden layer
            nn.ReLU(),
            nn.Linear(hidden_1, hidden_2),     # 2nd hidden layer
            nn.ReLU(),
            nn.Linear(hidden_2, num_outputs),  # output layer
        )

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the last layer's outputs (logits)."""
        return self.layers(x)

In [121]:
model = NeuralNetwork(num_inputs=50, num_outputs=3)

In [122]:
print(model)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)


In [123]:
# The total number of trainable parameters of this model
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of trainable model parameters: ", num_params)

Total number of trainable model parameters:  2213


In the case of our neural network model with the preceding two hidden layers, these trainable parameters are contained in the torch.nn.Linear layers. A Linear layer multiplies the inputs with a weight matrix and adds a bias vector. This is sometimes referred to as a feedforward or fully connected layer.

In [124]:
print(model.layers[0].weight)

Parameter containing:
tensor([[ 0.1388,  0.0159,  0.1215,  ...,  0.1032,  0.0296,  0.0102],
        [ 0.0229,  0.0260, -0.0458,  ..., -0.0358,  0.0362,  0.0497],
        [-0.0896,  0.0113,  0.1370,  ...,  0.1037,  0.1230, -0.0929],
        ...,
        [-0.1362, -0.0713, -0.0010,  ...,  0.1176,  0.1054, -0.1012],
        [ 0.1226,  0.0937, -0.1409,  ...,  0.1321, -0.0613,  0.0086],
        [-0.0045, -0.0604,  0.0535,  ...,  0.0697,  0.0373,  0.0923]],
       requires_grad=True)


In [125]:
print(model.layers[0].weight.shape)

torch.Size([30, 50])


The model weights are initialized with small random numbers, which differ each time we instantiate the network. In deep learning, initializing model weights with small random numbers is desired to break symmetry during training. Otherwise, the nodes would be performing the same operations and updates during backpropagation, which would not allow the network to learn complex mappings from inputs to outputs.

In [126]:
torch.manual_seed(123)
model = NeuralNetwork(50, 3)
print(model.layers[0].weight)

Parameter containing:
tensor([[-0.0577,  0.0047, -0.0702,  ...,  0.0222,  0.1260,  0.0865],
        [ 0.0502,  0.0307,  0.0333,  ...,  0.0951,  0.1134, -0.0297],
        [ 0.1077, -0.1108,  0.0122,  ...,  0.0108, -0.1049, -0.1063],
        ...,
        [-0.0787,  0.1259,  0.0803,  ...,  0.1218,  0.1303, -0.1351],
        [ 0.1359,  0.0175, -0.0673,  ...,  0.0674,  0.0676,  0.1058],
        [ 0.0790,  0.1343, -0.0293,  ...,  0.0344, -0.0971, -0.0509]],
       requires_grad=True)


In [127]:
torch.manual_seed(123)
X = torch.rand((1, 50))
out = model(X)

# a score assigned to each of the three output nodes
# The <AddmmBackward0> part of grad_fn=<AddmmBackward0> specifies the operation performed. 
# In this case, it is an Addmm operation. 
# Addmm stands for matrix multiplication (mm) followed by an addition (Add).
print(out)

tensor([[-0.1262,  0.1080, -0.1792]], grad_fn=<AddmmBackward0>)


When we use a model for inference (for instance, making predictions) rather than training, the best practice is to use the torch.no_grad() context manager. This tells PyTorch that it doesn’t need to keep track of the gradients, which can result in significant savings in memory and computation:

In [128]:
with torch.no_grad():
    out = model(X)
print(out)

tensor([[-0.1262,  0.1080, -0.1792]])


In [129]:
# compute class-membership probabilities
with torch.no_grad():
    out = torch.softmax(model(X), dim = 1)
print(out)

tensor([[0.3113, 0.3934, 0.2952]])


## Setting up efficient data loaders

In [130]:
# Small toy dataset with two features per record and binary labels.
# Class-0 rows have a negative first feature and a positive second feature;
# class-1 rows are the other way around.

# Training split: five records.
X_train = torch.tensor([[-1.2, 3.1],
                        [-0.9, 2.9],
                        [-0.5, 2.6],
                        [2.3, -1.1],
                        [2.7, -1.5]])
y_train = torch.tensor([0, 0, 0, 1, 1])

# Test split: two records, one per class.
X_test = torch.tensor([[-0.8, 2.8],
                       [2.6, -1.6]])
y_test = torch.tensor([0, 1])

In [131]:
# Defining a custom Dataset class
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    """Dataset that pairs a collection of features with a collection of labels."""

    def __init__(self, X, y):
        self.features, self.labels = X, y

    def __getitem__(self, index):
        """Return exactly one record and its corresponding label as a tuple."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the dataset length, read from the first dimension of the labels."""
        n_records = self.labels.shape[0]
        return n_records

train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test) 

In [132]:
print(len(train_ds))

5


In [133]:
# Instantiating data loaders
from torch.utils.data import DataLoader

# Seed the global RNG so the shuffled batch order is reproducible.
torch.manual_seed(123)

# Training loader: wraps the ToyDataset instance created earlier, yields
# batches of two records, reshuffles the data, and uses no background
# worker processes (num_workers=0).
train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=0)

# Test loader: same batch size, but shuffling is not necessary for a test set.
test_loader = DataLoader(test_ds, batch_size=2, shuffle=False, num_workers=0)

In [134]:
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx + 1}: ", x, y)

Batch 1:  tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2:  tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])
Batch 3:  tensor([[ 2.7000, -1.5000]]) tensor([1])


In [135]:
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
)

In [136]:
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y)

Batch 1: tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])
Batch 2: tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])


## A typical training loop

In [137]:
# Neural Network training in PyTorch
import torch.nn.functional as F

torch.manual_seed(123)
# 1 The dataset has two features and two classes.
model = NeuralNetwork(num_inputs=2, num_outputs=2)

# 2 The optimizer needs to know which parameters to optimize.
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epochs = 3
for epoch in range(num_epochs):

    # Put the model in training mode for the whole epoch.
    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)

        loss = F.cross_entropy(logits, labels)

        # 3 Sets the gradients from the previous round to 0 to prevent unintended gradient accumulation
        optimizer.zero_grad()
        # 4 Computes the gradients of the loss given the model parameters
        loss.backward()
        # 5 The optimizer uses the gradients to update the model parameters.
        optimizer.step()

        ### Logging
        # Zero-padded counters (":03d"); a space inside the format spec would
        # print " 01" instead of "001".
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
              f" | Train Loss: {loss:.2f}")

    # Switch to evaluation mode once per epoch, after all batches have been
    # processed, so no training batch runs in eval mode.
    model.eval()
    # Insert optional model evaluation code

Epoch:  01/003 | Batch 000/002 | Train Loss: 0.75
Epoch:  01/003 | Batch 001/002 | Train Loss: 0.65
Epoch:  02/003 | Batch 000/002 | Train Loss: 0.44
Epoch:  02/003 | Batch 001/002 | Train Loss: 0.13
Epoch:  03/003 | Batch 000/002 | Train Loss: 0.03
Epoch:  03/003 | Batch 001/002 | Train Loss: 0.00


In [138]:
model.parameters()

<generator object Module.parameters at 0x000001DA48BEE490>

In [139]:
model.eval()
with torch.no_grad():
    outputs = model(X_train)
print(outputs)

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])


In [140]:
torch.set_printoptions(sci_mode=False)
probas = torch.softmax(outputs, dim = 1)
print(probas)

tensor([[    0.9991,     0.0009],
        [    0.9982,     0.0018],
        [    0.9949,     0.0051],
        [    0.0491,     0.9509],
        [    0.0307,     0.9693]])


In [141]:
predictions = torch.argmax(probas, dim = 1)
print(predictions)

tensor([0, 0, 0, 1, 1])


In [142]:
predictions == y_train

tensor([True, True, True, True, True])

In [143]:
torch.sum(predictions == y_train)

tensor(5)

In [144]:
# A function to compute the prediction accuracy
def compute_accuracy(model, dataloader, device=None):
    """Return the fraction of examples in ``dataloader`` that ``model`` predicts correctly.

    Args:
        model: Callable module that maps a batch of features to logits; the
            predicted class is the argmax over dim 1 of its output.
        dataloader: Iterable yielding ``(features, labels)`` batches.
        device: Optional device (or GPU index) to move each batch to before
            the forward pass. ``None`` (default) leaves batches where they are.

    Returns:
        A Python float between 0 and 1, or ``nan`` if the loader yields no
        examples (the accuracy is undefined in that case).
    """
    model = model.eval()
    correct = 0
    total_examples = 0

    for features, labels in dataloader:
        # Keep the batch on the same device as the model when one is given.
        if device is not None:
            features, labels = features.to(device), labels.to(device)

        with torch.no_grad():
            logits = model(features)

        predictions = torch.argmax(logits, dim=1)

        # 1 Tensor of True/False values depending on whether the labels match
        compare = labels == predictions

        # 2 The sum counts the True values; .item() converts it to a Python int
        correct += torch.sum(compare).item()
        total_examples += len(compare)

    # Avoid a ZeroDivisionError when the loader produced no batches
    # (e.g. drop_last=True with fewer records than one batch).
    if total_examples == 0:
        return float("nan")

    # 3 The fraction of correct predictions, a value between 0 and 1
    return correct / total_examples

In [145]:
print(compute_accuracy(model, train_loader))

1.0


In [146]:
print(compute_accuracy(model, test_loader))

1.0


## Saving and loading models

In [147]:
torch.save(model.state_dict(), "model.pth")

The torch.load("model.pth") function reads the file "model.pth" and reconstructs the Python dictionary object containing the model’s parameters while model.load_state_dict() applies these parameters to the model, effectively restoring its learned state from when we saved it.

The line model = NeuralNetwork(2, 2) is not strictly necessary if you execute this code in the same session where you saved a model. However, I included it here to illustrate that we need an instance of the model in memory to apply the saved parameters. Here, the NeuralNetwork(2, 2) architecture needs to match the original saved model exactly.

In [148]:
# An instance with the same architecture as the saved model must exist
# before the stored parameters can be applied to it.
model = NeuralNetwork(2, 2)
# weights_only=True limits unpickling to tensors and plain containers, which
# is all a state_dict needs; it avoids running arbitrary pickled code and the
# warning torch.load emits when the argument is left out.
model.load_state_dict(torch.load("model.pth", weights_only=True))

  model.load_state_dict(torch.load("model.pth"))


<All keys matched successfully>

## Optimizing training performance with GPUs

In [149]:
# PyTorch computations on GPU devices
print(torch.cuda.is_available())

True


In [150]:
tensor_1 = torch.tensor([1., 2., 3.])
tensor_2 = torch.tensor([4., 5., 6.])
print(tensor_1 + tensor_2)

tensor([5., 7., 9.])


In [151]:
tensor_1 = tensor_1.to("cuda")
tensor_2 = tensor_2.to("cuda")
print(tensor_1 + tensor_2)

tensor([5., 7., 9.], device='cuda:0')


In [152]:
tensor_1 = tensor_1.to("cpu")
# Adding a CPU tensor to a GPU tensor is the error being demonstrated here.
# Catch it and print the message so the cell shows the failure without
# aborting a "Run All" of the notebook.
try:
    print(tensor_1 + tensor_2)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [154]:
# Single-GPU training
torch.manual_seed(123)

# 1 Device variable pointing at the GPU; the model and every batch are moved to it.
device = torch.device("cuda")

# 2 Build the model and transfer it onto the GPU.
model = NeuralNetwork(num_inputs=2, num_outputs=2)
model = model.to(device)

optimizer = torch.optim.SGD(params=model.parameters(), lr=0.5)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()

    for batch_idx, (features, labels) in enumerate(train_loader):
        # 3 Transfer the batch onto the GPU before the forward pass.
        features = features.to(device)
        labels = labels.to(device)

        logits = model(features)
        loss = F.cross_entropy(logits, labels)  # Loss function

        # Clear old gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### LOGGING
        progress = (
            f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
            f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
            f" | Train/Val Loss: {loss:.2f}"
        )
        print(progress)

    model.eval()
    # Insert optional model evaluation code

Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75
Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65
Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44
Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13
Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03
Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00


In [155]:
# Best Practice
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

For multi-GPU training, we use PyTorch’s DistributedDataParallel (DDP) strategy. DDP enables parallelism by splitting the input data across the available devices and processing these data subsets simultaneously.

In [156]:
# Training with multiple GPUs
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

PyTorch’s multiprocessing submodule contains functions such as multiprocessing.spawn, which we will use to spawn multiple processes and apply a function to multiple inputs in parallel. We will use it to spawn one training process per GPU. If we spawn multiple processes for training, we will need a way to divide the dataset among these different processes. For this, we will use the DistributedSampler.

init_process_group and destroy_process_group are used to initialize and exit the distributed training mode. The init_process_group function should be called at the beginning of the training script to initialize a process group for each process in the distributed setup, and destroy_process_group should be called at the end of the training script to destroy a given process group and release its resources. The code in the following listing illustrates how these new components are used to implement DDP training for the NeuralNetwork model we implemented earlier.

In [157]:
# Model training with the DistributedDataParallel strategy
def ddp_setup(rank, world_size):
    """Initialize the distributed process group for one training process.

    Args:
        rank: Index of the GPU this process should use; also passed as the
            process rank to ``init_process_group``.
        world_size: Number of GPUs (processes) taking part in training.
    """
    # Imported locally: this function reads os.environ, but no import cell
    # in the notebook brings `os` into scope.
    import os

    # 1 Address of the main node
    os.environ["MASTER_ADDR"] = "localhost"
    # 2 Any free port on the machine
    os.environ["MASTER_PORT"] = "12345"

    init_process_group(
        # 3 nccl stands for NVIDIA Collective Communication Library.
        backend="nccl",
        # 4 rank refers to the index of the GPU we want to use.
        rank=rank,
        # 5 world_size is the number of GPUs to use.
        world_size=world_size
    )

    # 6 Sets the current GPU device on which tensors
    # will be allocated and operations will be performed
    torch.cuda.set_device(rank)

def prepare_dataset():
    """Build the DDP training loader and return it together with the test loader.

    Reads the notebook-level ``train_ds`` and ``test_loader`` objects.

    Returns:
        A ``(train_loader, test_loader)`` tuple.
    """
    # insert dataset preparation code

    # 9 Splits the dataset into distinct, non-overlapping subsets for each process (GPU)
    per_process_sampler = DistributedSampler(train_ds)

    ddp_train_loader = DataLoader(
        train_ds,
        batch_size=2,
        # 7 The DistributedSampler takes care of the shuffling now.
        shuffle=False,
        sampler=per_process_sampler,
        # 8 Enables faster memory transfer when training on GPU
        pin_memory=True,
        drop_last=True,
    )
    return ddp_train_loader, test_loader

In [164]:
# 10 The main function running the model training
def main(rank, world_size, num_epochs):
    """Train and evaluate the model in one DDP worker process.

    Args:
        rank: GPU index handled by this process.
        world_size: Total number of worker processes (one per GPU).
        num_epochs: Number of passes over this process's share of the data.

    NOTE(review): compute_accuracy is called with ``device=rank`` below, so it
    must accept a ``device`` keyword — confirm against its definition.
    """
    ddp_setup(rank, world_size)
    try:
        train_loader, test_loader = prepare_dataset()
        model = NeuralNetwork(num_inputs=2, num_outputs=2)
        model.to(rank)

        optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
        model = DDP(model, device_ids=[rank])

        for epoch in range(num_epochs):
            # A DistributedSampler derives its shuffling order from the epoch
            # number; without set_epoch every epoch reuses the same order.
            if isinstance(train_loader.sampler, DistributedSampler):
                train_loader.sampler.set_epoch(epoch)

            model.train()
            for features, labels in train_loader:
                # 11 rank is the GPU ID
                features, labels = features.to(rank), labels.to(rank)

                # Forward pass and loss; previously only a placeholder, which
                # left `loss` undefined for the log line below.
                logits = model(features)
                loss = F.cross_entropy(logits, labels)

                # Backpropagation and parameter update.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                print(f"[GPU{rank}] Epoch: {epoch+1:03d}/{num_epochs:03d}"
                      f" | Batchsize {labels.shape[0]:03d}"
                      f" | Train/Val Loss: {loss:.2f}")

        model.eval()
        train_acc = compute_accuracy(model, train_loader, device=rank)
        print(f"[GPU{rank}] Training accuracy", train_acc)
        test_acc = compute_accuracy(model, test_loader, device=rank)
        print(f"[GPU{rank}] Test accuracy", test_acc)
    finally:
        # 12 Cleans up resource allocation, even if training raised an error
        destroy_process_group()

In [165]:
if __name__ == "__main__":
    # Query the GPU count once and reuse it as the number of processes.
    world_size = torch.cuda.device_count()
    print("Number of GPUs available:", world_size)
    torch.manual_seed(123)
    num_epochs = 3

    if world_size < 1:
        # Spawning zero processes would return without training anything.
        print("No CUDA devices found; skipping DDP training.")
    else:
        # 13 Launches the main function using multiple processes,
        # where nprocs=world_size means one process per GPU.
        # NOTE(review): the recorded run of this cell ended in
        # ProcessExitedException. Presumably the spawned workers cannot
        # import functions defined in a notebook session (and/or the NCCL
        # backend is unavailable on this machine) — confirm, and consider
        # running this section as a standalone script.
        mp.spawn(main, args=(world_size, num_epochs), nprocs=world_size)

Number of GPUs available: 1


ProcessExitedException: process 0 terminated with exit code 1