## Mixed precision Training

* Ordinarily, “automatic mixed precision training” uses torch.autocast and torch.cuda.amp.GradScaler together.

In [1]:
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast

In [64]:
# Define your model architecture
class MyModel(nn.Module):
    """Small fully connected classifier for 28x28 inputs (e.g. MNIST).

    The input is flattened to 784 features and passed through a
    784 -> 128 -> 64 -> 10 stack of linear layers with ReLU activations
    between them. The output is raw logits (no softmax applied).
    """

    def __init__(self):
        super().__init__()
        # ``forward`` relies on this layer; it collapses every dimension after
        # the batch dimension so image batches become (batch, 784).
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(784, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 10))

    def forward(self, x):
        """Return the 10-way logits for a batch ``x``.

        Args:
            x: Tensor whose non-batch dimensions multiply to 784.

        Returns:
            Tensor of shape (batch, 10) holding unnormalized class scores.
        """
        x = self.flatten(x)
        logit = self.linear_relu_stack(x)
        return logit

In [None]:
# Prepare the MNIST training data.
# Each image is converted to a tensor and then normalized with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Training split stored under ./data (download=True is passed to the dataset).
train_data = datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)

# Shuffled mini-batches of 64 samples.
train_loader = DataLoader(
    train_data,
    batch_size=64,
    shuffle=True,
)

In [4]:
# Bare expression: the notebook shows the dataset's repr as the cell output.
train_data

Dataset MNIST
    Number of datapoints: 60000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.5,), std=(0.5,))
           )

In [18]:
# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

# Build the model on the chosen device, an Adam optimizer over its parameters,
# and the gradient scaler used by the mixed-precision loop.
model = MyModel()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
scaler = GradScaler()

cuda:0


In [19]:
# Bare expression: the notebook shows the DataLoader's repr as the cell output.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7ad7c8fa4880>

## torch.autocast

* Instances of `torch.autocast` serve as context managers that allow regions of your script to run in mixed precision.
* Inside these regions, CUDA ops run in a dtype chosen by autocast to improve performance while maintaining accuracy.

In [20]:
# Training loop
NUM_EPOCHS = 5  # Example: 5 epochs

# ``torch.autocast`` expects a bare device type ("cuda" / "cpu"), not an
# indexed device string such as "cuda:0".
amp_device_type = torch.device(device).type
# ``device`` may have fallen back to "cpu"; float16 autocast (and the dtype
# checks below) only apply on CUDA, so mixed precision is switched off there
# and the loop runs in ordinary float32.
amp_enabled = amp_device_type == "cuda"
# The dtype is ignored while autocast is disabled; bfloat16 is passed on CPU
# only because it is a dtype CPU autocast accepts without complaint.
amp_dtype = torch.float16 if amp_enabled else torch.bfloat16

for epoch in range(NUM_EPOCHS):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Move data to the selected device
        data, target = data.to(device), target.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass with autocast for mixed precision
        with torch.autocast(device_type=amp_device_type, dtype=amp_dtype, enabled=amp_enabled):
            output = model(data)
            loss = nn.functional.cross_entropy(output, target)
            if amp_enabled:
                # Linear layers autocast to float16, while cross_entropy
                # autocasts to float32 for numerical stability.
                assert output.dtype is torch.float16
                assert loss.dtype is torch.float32

        # Backward pass on the scaled loss
        scaler.scale(loss).backward()

        # Update model parameters
        scaler.step(optimizer)

        # Update scaler for next iteration
        scaler.update()

        # Print training progress
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Step [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item()}')

print("Training complete.")


Epoch [1/5], Step [1/938], Loss: 2.29779052734375
Epoch [1/5], Step [101/938], Loss: 0.6141790747642517
Epoch [1/5], Step [201/938], Loss: 0.4575044512748718
Epoch [1/5], Step [301/938], Loss: 0.1804271787405014
Epoch [1/5], Step [401/938], Loss: 0.2871459722518921
Epoch [1/5], Step [501/938], Loss: 0.39320069551467896
Epoch [1/5], Step [601/938], Loss: 0.17561626434326172
Epoch [1/5], Step [701/938], Loss: 0.2798739969730377
Epoch [1/5], Step [801/938], Loss: 0.2054775357246399
Epoch [1/5], Step [901/938], Loss: 0.20905619859695435
Epoch [2/5], Step [1/938], Loss: 0.0883687436580658
Epoch [2/5], Step [101/938], Loss: 0.3113597631454468
Epoch [2/5], Step [201/938], Loss: 0.08547207713127136
Epoch [2/5], Step [301/938], Loss: 0.07507798075675964
Epoch [2/5], Step [401/938], Loss: 0.04894807189702988
Epoch [2/5], Step [501/938], Loss: 0.19935643672943115
Epoch [2/5], Step [601/938], Loss: 0.11764001846313477
Epoch [2/5], Step [701/938], Loss: 0.3508676290512085
Epoch [2/5], Step [801/938

In [14]:
# Evaluates whether ``loss`` (as last assigned by the training loop) is float32;
# the boolean result is shown as the cell output.
loss.dtype is torch.float32

True

## Adding GradScaler
Gradient scaling helps prevent gradients with small magnitudes from flushing to zero (“underflowing”) when training with mixed precision.

In [None]:
# ``torch.autocast`` expects a bare device type ("cuda" / "cpu"), not "cuda:0".
amp_device_type = torch.device(device).type
# ``device`` may be "cpu"; float16 autocast and gradient scaling are only used on CUDA.
amp_enabled = amp_device_type == "cuda"
# The dtype is ignored while autocast is disabled; bfloat16 is passed on CPU
# only because it is a dtype CPU autocast accepts without complaint.
amp_dtype = torch.float16 if amp_enabled else torch.bfloat16

# Constructs a ``scaler`` once, at the beginning of the convergence run.
# The same ``GradScaler`` instance should be used for the entire convergence run.
# If you perform multiple convergence runs in the same script, each run should use
# a dedicated fresh ``GradScaler`` instance. ``GradScaler`` instances are lightweight.
scaler = torch.cuda.amp.GradScaler(enabled=amp_enabled)

# Clear any gradients left over from earlier cells before the first backward pass.
optimizer.zero_grad(set_to_none=True)

for epoch in range(5):  # Example: 5 epochs
    for batch_idx, (data, target) in enumerate(train_loader):
        # Move data to the selected device
        data, target = data.to(device), target.to(device)

        with torch.autocast(device_type=amp_device_type, dtype=amp_dtype, enabled=amp_enabled):
            output = model(data)
            loss = nn.functional.cross_entropy(output, target)

        # Scales loss. Calls ``backward()`` on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # ``scaler.step()`` first unscales the gradients of the optimizer's assigned parameters.
        # If these gradients do not contain ``inf``s or ``NaN``s, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        # Reset gradients once per iteration; set_to_none=True can modestly improve performance.
        optimizer.zero_grad(set_to_none=True)

In [None]:
# Master switch: set to False to run the same loop in default precision.
use_amp = True

# ``torch.autocast`` expects a bare device type ("cuda" / "cpu"); passing an
# indexed device string such as "cuda:0" is rejected.
amp_device_type = torch.device(device).type
# float16 autocast and gradient scaling are only used on CUDA, so both are
# turned off when ``device`` fell back to the CPU.
amp_enabled = use_amp and amp_device_type == "cuda"
# The dtype is ignored while autocast is disabled; bfloat16 is passed in that
# case only because it is a dtype CPU autocast accepts without complaint.
amp_dtype = torch.float16 if amp_enabled else torch.bfloat16

opt = torch.optim.SGD(model.parameters(), lr=0.001)
scaler = torch.cuda.amp.GradScaler(enabled=amp_enabled)

# Clear gradients left over from earlier cells so the first step is not polluted.
opt.zero_grad(set_to_none=True)

for epoch in range(5):
    # Iterate over mini-batches from the loader (not over the samples of a
    # single leftover batch) and move each batch to the selected device.
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        with torch.autocast(device_type=amp_device_type, dtype=amp_dtype, enabled=amp_enabled):
            output = model(data)
            loss = nn.functional.cross_entropy(output, target)
        # With the scaler disabled these calls reduce to a plain
        # backward() / opt.step() / no-op sequence.
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()
        opt.zero_grad(set_to_none=True)  # set_to_none=True can modestly improve performance
