# Mixed Precision PyTorch

In [1]:
import torch
import numpy as np
import time
from IPython import display

In [2]:
# Benchmark problem size: N samples fed through a single Linear(D_in -> D_out) layer.
N, D_in, D_out = 64_000, 10240, 5120
# Number of full-batch optimisation steps timed by the training loop.
epochs = 100

# Apex.amp (deprecated; use PyTorch native AMP instead)

DeprecatedFeatureWarning: apex.amp is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)

In [3]:
from apex import amp

In [4]:
# Random regression inputs and targets, created directly on the GPU.
x = torch.randn(N, D_in, device="cuda")
y = torch.randn(N, D_out, device="cuda")

# One linear layer trained with plain SGD; both start out in default precision.
model = torch.nn.Linear(D_in, D_out)
model = model.cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Apex-specific change: hand model and optimizer to amp so it can patch them
# for mixed precision. The opt_level selects the casting policy.
model, optimizer = amp.initialize(
    model,
    optimizer,
    opt_level="O1",
)


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


  optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);


In [5]:
# Time `epochs` full-batch training steps with Apex AMP.
# CUDA kernels are launched asynchronously, so the device is synchronised
# before each clock read; otherwise work that is still queued on the GPU
# would not be included in the measurement.
torch.cuda.synchronize()
st = time.perf_counter()

for t in range(epochs):
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)
    optimizer.zero_grad()

    # Apex-specific change: backward() is called on the scaled loss yielded
    # by amp.scale_loss instead of on `loss` directly.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()

    optimizer.step()

torch.cuda.synchronize()
ed = time.perf_counter()
print(f'time: {np.round((ed-st),2)} sec')

# Recorded results per Apex opt_level (letter "O", not zero):
# 13G GPU, 22 sec
# O1/O2 (mixed) --> 22 sec
# O3 (fp16) --> 9 sec
# O0 (fp32) --> error

time: 22.38 sec


# Automatic Mixed Precision package - torch.amp

References:

https://pytorch.org/docs/stable/amp.html

https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html

https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples

torch.autocast("cuda", args...)   ==   torch.cuda.amp.autocast(args...).

torch.autocast("cpu", args...) == torch.cpu.amp.autocast(args...). 

In [1]:
import torch
import numpy as np
import time
from IPython import display

In [2]:
# Benchmark problem size: N samples fed through a single Linear(D_in -> D_out) layer.
N, D_in, D_out = 64_000, 10240, 5120
# Number of full-batch optimisation steps timed by the training loop.
epochs = 100

In [3]:
from torch.cuda.amp import GradScaler #, autocast
from torch import autocast

# Gradient-norm threshold passed to torch.nn.utils.clip_grad_norm_ in the training loop.
max_norm = 1.0

In [4]:
# Use the GPU when one is present, otherwise fall back to the CPU.
# Kept as a plain string rather than a torch.device object because it is
# later passed as `device_type` to autocast.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using device:', device)

Using device: cuda


Reference: https://pytorch.org/docs/stable/amp.html

Ordinarily, “automatic mixed precision training” with datatype of torch.float16 uses torch.autocast and torch.cuda.amp.GradScaler together, as shown in the CUDA Automatic Mixed Precision examples and CUDA Automatic Mixed Precision recipe. However, torch.autocast and torch.cuda.amp.GradScaler are modular, and may be used separately if desired. As shown in the CPU example section of torch.autocast, “automatic mixed precision training/inference” on CPU with datatype of torch.bfloat16 only uses torch.autocast.

torch.float16  -->      uses torch.autocast and torch.cuda.amp.GradScaler together,

torch.bfloat16 --> only uses torch.autocast

In [5]:
# Low-precision dtype used inside autocast: torch.bfloat16 or torch.float16.
# float16 is chosen only for CUDA devices that lack bfloat16 support; it is the
# case that needs a GradScaler. Everywhere else, including the CPU fallback,
# bfloat16 is used, matching the CPU autocast usage described in the note above
# (the previous fallback selected float16 on CPU).
if torch.cuda.is_available() and not torch.cuda.is_bf16_supported():
    dtype = torch.float16
else:
    dtype = torch.bfloat16
dtype

torch.bfloat16

In [6]:
# hint: a torch.dtype does not compare equal to its string name, so checks
# must be made against the torch.bfloat16 object itself.
(dtype == 'bfloat16', dtype == torch.bfloat16)

(False, True)

In [7]:
# Loss scaling is only switched on for float16; with bfloat16 the scaler is
# constructed in its disabled state.
scaler = GradScaler(
    enabled=(dtype == torch.float16),
)

In [8]:
# Example: autocast can be applied as a decorator instead of a `with` block,
# either on a plain function:
#   @autocast(device)
#   def func():
# or on a module's forward method, as sketched below.

class AutocastModel(torch.nn.Module):
    """Illustrative skeleton of a module whose forward is decorated with autocast."""

    @autocast(device)
    def forward(self, input):
        """Placeholder forward pass; intentionally left without an implementation."""

In [9]:
# Random regression inputs and targets, allocated directly on the target
# device. This avoids first materialising the (N, D_in) and (N, D_out)
# tensors in host memory and then copying them over.
x = torch.rand(N, D_in, device=device)
y = torch.rand(N, D_out, device=device)

# Creates model and optimizer in default precision
model     = torch.nn.Linear(D_in, D_out).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [10]:
# Time `epochs` full-batch training steps with native PyTorch AMP.
# CUDA kernels are launched asynchronously, so on GPU the device is
# synchronised before each clock read; otherwise work that is still queued
# would not be included in the measurement.
if device == 'cuda':
    torch.cuda.synchronize()
st = time.perf_counter()

for t in range(epochs):

    optimizer.zero_grad()

    # Enables autocasting for the forward pass (model + loss)
    with autocast(device_type=device, enabled=True, dtype=dtype): # torch.float16,  torch.bfloat16
        y_pred = model(x)
        # The prediction is cast back to float32 before computing the loss.
        loss = torch.nn.functional.mse_loss(y_pred.float(), y)

    # Exits the context manager before backward().
    # Without a scaler this would simply be: loss.backward(); optimizer.step()
    scaler.scale(loss).backward()
    # Unscale the gradients first so that max_norm is applied to their true
    # magnitudes rather than to the loss-scaled values.
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)
    scaler.update()

if device == 'cuda':
    torch.cuda.synchronize()
ed = time.perf_counter()
print(f'time: {np.round((ed-st),2)} sec')

# 12.8G GPU, 21.8 sec
# disable fp16 --> 38.2 sec

time: 20.62 sec


# Monitor

======== Warning: nvprof is not supported on devices with compute capability 8.0 and higher.

Check your NVIDIA GPU's compute capability:
https://developer.nvidia.com/cuda-gpus#compute

In [11]:
# Profile the script with nvprof. Per the warning above, nvprof is not supported
# on GPUs with compute capability 8.0 or higher; there it only prints a pointer
# to NVIDIA Nsight Systems / Nsight Compute.
!nvprof python pytorch_mixed.py

                  Use NVIDIA Nsight Systems for GPU tracing and CPU sampling and NVIDIA Nsight Compute for GPU profiling.
                  Refer https://developer.nvidia.com/tools-overview for more details.



In [13]:
# Print nvprof's command-line usage and option reference.
!nvprof --help

Usage: nvprof [options] [application] [application-arguments]
Options:
       --aggregate-mode <on|off>
                        Turn on/off aggregate mode for events and metrics specified
                        by subsequent "--events" and "--metrics" options. Those
                        event/metric values will be collected for each domain instance,
                        instead of the whole device. Allowed values:
                        	on - turn on aggregate mode (default)
                        	off - turn off aggregate mode

       --analysis-metrics
                        Collect profiling data that can be imported to Visual Profiler's
                        "analysis" mode. Note: Use "--export-profile" to specify
                        an export file.

       --annotate-mpi <off|openmpi|mpich>
                        Automatically annotate MPI calls with NVTX markers. Specify
                        the MPI implementation installed on your machine. Currently,
        