This notebook shows how to run the forward (training) pass and the backward pass of an adaptive layernorm operation using cuDNN.

$$\text{Adaptive\_LayerNorm}(x) = \frac{x-\mu}{\sqrt{\sigma^2 + \epsilon}}\cdot\gamma+\beta$$
Where $\mu = E[x]$ and $\sigma^2 = Var[x]$ are computed over the embedding dimension, separately for each token of each input. $\gamma$ and $\beta$ are learnable parameters that vary for each input in a batch, in contrast to regular layernorm, where $\gamma$ and $\beta$ are shared across all inputs in a batch.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cudnn-frontend/blob/main/samples/python/01_matmul_bias.ipynb)

## Prerequisites and Setup
This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected.

In [1]:
# get_ipython().system('nvidia-smi')

If running on Colab, you will need to install the cudnn python interface.

In [2]:
# get_ipython().system('pip install nvidia-cudnn-cu12')
# get_ipython().system('pip install nvidia-cudnn-frontend')
# get_ipython().system('pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128')

#### General Setup
Create a cuDNN handle, which is a per-device handle used to initialize the cuDNN context.

In [3]:
import cudnn
import torch
import torch.nn as nn
import sys

# Check for a CUDA device before touching cuDNN, so that a missing GPU is
# reported by this assertion rather than by a later cuDNN call.
assert torch.cuda.is_available(), "This notebook requires an NVIDIA GPU with CUDA."

# Seed PyTorch's RNG so the randomly initialised tensors are reproducible.
torch.manual_seed(1)

# Handle passed to every cuDNN graph created in this notebook.
handle = cudnn.create_handle()

print("Running with cudnn backend version:", cudnn.backend_version())

Running with cudnn backend version: 90900


### Adaptive LayerNorm Training
Problem sizes:
- Batch Size: 4
- Sequence Size: 1024
- Embedding Dimension: 768

In [4]:
# Problem dimensions: x is allocated as (batch, seq_size, embedding_dim).
batch = 4
seq_size = 1024
embedding_dim = 768

# Element type of the x, scale and bias tensors.
input_type = torch.float16

# Small constant added to the variance so the normalization never divides by 0.
epsilon_value = 1e-3

Create input tensor GPU buffers. We use PyTorch to allocate GPU tensors so we can reuse them easily when we calculate reference outputs.

In [5]:
# Options shared by the three trainable GPU tensors below.
gpu_leaf_opts = dict(dtype=input_type, requires_grad=True, device="cuda")

# Activations, filled with random values.
x_gpu = torch.randn(batch, seq_size, embedding_dim, **gpu_leaf_opts)

# Scale and bias carry one vector per batch entry (middle dimension is 1).
scale_gpu = torch.randn(batch, 1, embedding_dim, **gpu_leaf_opts)
bias_gpu = torch.randn(batch, 1, embedding_dim, **gpu_leaf_opts)

# Epsilon is a single float32 element holding epsilon_value; it is allocated on
# the CPU and does not take part in autograd.
epsilon_cpu = torch.full(
    (1, 1, 1),
    epsilon_value,
    device="cpu",
    dtype=torch.float32,
    requires_grad=False,
)

Create reference computation and output tensor GPU buffers using PyTorch

In [6]:
# PyTorch reference implementation of adaptive layernorm
class AdaptiveLayerNorm(nn.Module):
    """Layer normalization whose scale and shift are supplied per call.

    The input is normalized with a non-affine ``nn.LayerNorm`` and then scaled
    by ``gamma`` and shifted by ``beta``; both are passed to ``forward`` and
    hold one vector per batch entry.

    Args:
        normalized_shape: Size of the trailing dimension to normalize over,
            given as an int or as a one-element sequence such as ``(768,)``.
        eps: Value added to the variance for numerical stability.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # elementwise_affine=False: the affine step is applied in forward()
        # using the caller-provided gamma and beta.
        self.layer_norm = nn.LayerNorm(
            normalized_shape, eps=eps, elementwise_affine=False
        )
        self.normalized_shape = normalized_shape

    def forward(self, x, gamma, beta):
        """Return ``gamma * layer_norm(x) + beta``.

        Args:
            x: 3-D input tensor; the last dimension is normalized.
            gamma: Scale of shape ``(x.size(0), 1, normalized_shape)``.
            beta: Shift with the same shape as ``gamma``.

        Raises:
            ValueError: If ``x``, ``gamma`` or ``beta`` is not 3-D, or if
                ``gamma``/``beta`` do not have the expected shape.
        """
        # Dimension checks
        if x.dim() != 3:
            raise ValueError(
                f"Expected input x to have 3 dimensions, but got {x.dim()} dimensions."
            )
        if gamma.dim() != 3 or beta.dim() != 3:
            raise ValueError(
                f"Expected gamma and beta to have 3 dimensions, but got gamma: {gamma.dim()} dimensions, beta: {beta.dim()} dimensions."
            )
        # nn.LayerNorm stores normalized_shape as a tuple whether it was given
        # as an int or as a sequence, so unpacking it yields a flat shape in
        # both cases (embedding the raw argument would nest a tuple).
        expected_shape = (x.size(0), 1, *self.layer_norm.normalized_shape)
        if gamma.shape != expected_shape or beta.shape != expected_shape:
            raise ValueError(
                f"Expected gamma and beta to have shape {expected_shape}, but got gamma: {gamma.shape}, beta: {beta.shape}."
            )

        # Normalize without any learned affine parameters.
        normalized_x = self.layer_norm(x)
        # Apply adaptive scaling and shifting; gamma and beta are usually
        # produced from the input by some other layer.
        return gamma * normalized_x + beta


# Compute the PyTorch reference before the cuDNN computation so that
# torch.empty_like() can size the cuDNN output buffers from it.
adaptive_layer_norm = AdaptiveLayerNorm(embedding_dim, eps=epsilon_value)
out_expected = adaptive_layer_norm(x_gpu, scale_gpu, bias_gpu)

# Reference statistics, computed in float32 over the embedding dimension.
x_fp32 = x_gpu.to(torch.float32)
mean_expected = x_fp32.mean(dim=2, keepdim=True)
# Layer normalization uses the biased variance, so disable torch.var's default
# Bessel correction (correction=1) to match what is being normalized with.
inv_var_expected = torch.rsqrt(
    torch.var(x_fp32, dim=2, correction=0, keepdim=True) + epsilon_value
)

# Allocate output tensor memory using PyTorch
out_gpu = torch.empty_like(out_expected)
mean_gpu = torch.empty_like(mean_expected)
inv_var_gpu = torch.empty_like(inv_var_expected)
print(inv_var_gpu.shape)

torch.Size([4, 1024, 1])


#### Create cuDNN Forward Graph and tensors

In [7]:
# Forward graph, with FLOAT as both the intermediate and the compute data type.
fwd_graph = cudnn.pygraph(
    compute_data_type=cudnn.data_type.FLOAT,
    intermediate_data_type=cudnn.data_type.FLOAT,
    handle=handle,
)

# Graph-side tensor handles modelled on the PyTorch buffers (detached from autograd).
x = fwd_graph.tensor_like(x_gpu.detach()).set_name("X")
scale = fwd_graph.tensor_like(scale_gpu.detach()).set_name("scale")
bias = fwd_graph.tensor_like(bias_gpu.detach()).set_name("bias")
epsilon = fwd_graph.tensor_like(epsilon_cpu).set_name("epsilon")

# Adaptive layernorm node in training phase; it returns the output together
# with the mean and inverse variance.
out, mean, inv_var = fwd_graph.adalayernorm(
    input=x,
    scale=scale,
    bias=bias,
    epsilon=epsilon,
    norm_forward_phase=cudnn.norm_forward_phase.TRAINING,
    name="ADALN",
)

# Name each result, mark it as a graph output and give it the dtype of the
# matching PyTorch reference tensor.
for out_tensor, label, reference in (
    (out, "output", out_expected),
    (mean, "mean", mean_expected),
    (inv_var, "inv_var", inv_var_expected),
):
    out_tensor.set_name(label).set_output(True).set_data_type(reference.dtype)

Validate and build the forward graph

In [8]:
# Heuristic modes handed to build() for the forward graph.
fwd_heur_modes = [cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]
fwd_graph.build(fwd_heur_modes)

# The same fwd_graph instance should not be built twice: to run this block
# again, first re-run the previous block to obtain a new fwd_graph.

Execute the forward graph

Instead of mapping UIDs to memory (as in 20_layernorm.ipynb), we can directly map handles to memory. This is simpler but slightly slower to execute.

In [9]:
# Scratch memory for the forward graph, sized in bytes as reported by the graph.
workspace = torch.empty(
    fwd_graph.get_workspace_size(), dtype=torch.uint8, device="cuda"
)

# Bind every graph tensor handle to the buffer backing it.
variant_pack = {
    # inputs
    x: x_gpu.detach(),
    scale: scale_gpu.detach(),
    bias: bias_gpu.detach(),
    epsilon: epsilon_cpu,
    # outputs
    out: out_gpu,
    mean: mean_gpu,
    inv_var: inv_var_gpu,
}

fwd_graph.execute(variant_pack, workspace)

Test cuDNN's output against PyTorch's and check correctness

In [10]:
# Wait for outstanding GPU work before inspecting the results.
torch.cuda.synchronize()

# Tolerances shared by all three forward comparisons against the reference.
fwd_tolerances = {"rtol": 5e-3, "atol": 5e-3}
torch.testing.assert_close(out_gpu, out_expected, **fwd_tolerances)
torch.testing.assert_close(inv_var_gpu, inv_var_expected, **fwd_tolerances)
torch.testing.assert_close(mean_gpu, mean_expected, **fwd_tolerances)

#### Adaptive LayerNorm Backward Pass

Compute reference values for the backward graph

In [11]:
# PyTorch reference for the backward pass: regress the forward output onto a
# random target with an MSE loss and let autograd produce the gradients.
target = torch.randn_like(out_expected)

# Keep .grad on every tensor whose gradient is read later. out_expected is the
# result of an operation, so its gradient is only kept when requested.
for tracked in (out_expected, x_gpu, scale_gpu, bias_gpu):
    tracked.retain_grad()

criterion = torch.nn.MSELoss()
loss = criterion(out_expected, target)
loss.backward()

Build backward graph

In [12]:
# Backward graph, configured like the forward one (FLOAT intermediate and compute types).
bwd_graph = cudnn.pygraph(
    compute_data_type=cudnn.data_type.FLOAT,
    intermediate_data_type=cudnn.data_type.FLOAT,
    handle=handle,
)

# Tensors that belong to the backward graph. Handles from the forward graph
# must NOT be reused here; they only serve as templates for tensor_like().
# The incoming gradient is described with x_gpu's dims, strides and dtype.
d_out = bwd_graph.tensor(
    name="d_out",
    dim=x_gpu.shape,
    stride=x_gpu.stride(),
    data_type=x_gpu.dtype,
)
x_bwd = bwd_graph.tensor_like(x, name="x")
scale_bwd = bwd_graph.tensor_like(scale, name="scale")
mean_bwd = bwd_graph.tensor_like(mean, name="mean")
inv_var_bwd = bwd_graph.tensor_like(inv_var, name="inv_var")

# Adaptive layernorm backward node; returns the gradients for x, scale and bias.
d_x, d_scale, d_bias = bwd_graph.adalayernorm_backward(
    grad=d_out,
    input=x_bwd,
    scale=scale_bwd,
    mean=mean_bwd,
    inv_variance=inv_var_bwd,
    name="DADALN",
)

# Mark the three gradients as graph outputs, all stored with x_gpu's dtype.
grad_dtype = x_gpu.dtype
d_x.set_output(True).set_data_type(grad_dtype)
d_scale.set_output(True).set_data_type(grad_dtype)
d_bias.set_output(True).set_data_type(grad_dtype)

[{"data_type":"HALF","dim":[],"is_pass_by_value":false,"is_virtual":false,"name":"DADALN::DBIAS","pass_by_value":null,"reordering_type":"NONE","stride":[],"uid":0,"uid_assigned":false}]

In [13]:
# Heuristic modes handed to build() for the backward graph.
bwd_heur_modes = [cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]
bwd_graph.build(bwd_heur_modes)

Execute the graph and check correctness against PyTorch

In [14]:
# Destination buffers for the gradients, shaped like the tensors they belong to.
d_x_gpu = torch.empty_like(x_gpu)
d_scale_gpu = torch.empty_like(scale_gpu)
d_bias_gpu = torch.empty_like(bias_gpu)

# Scratch memory for the backward graph, sized in bytes as reported by the graph.
workspace = torch.empty(
    bwd_graph.get_workspace_size(), dtype=torch.uint8, device="cuda"
)

# Bind the backward graph's tensor handles to memory. mean and inv_var are the
# buffers written by the forward graph; d_out is the gradient PyTorch's
# autograd stored on out_expected.
bwd_variant_pack = {
    # inputs
    d_out: out_expected.grad,
    x_bwd: x_gpu.detach(),
    scale_bwd: scale_gpu.detach(),
    mean_bwd: mean_gpu.detach(),
    inv_var_bwd: inv_var_gpu.detach(),
    # outputs
    d_x: d_x_gpu,
    d_scale: d_scale_gpu,
    d_bias: d_bias_gpu,
}

bwd_graph.execute(bwd_variant_pack, workspace, handle=handle)

Compare results and check correctness

In [15]:
# Wait for outstanding GPU work before inspecting the results.
torch.cuda.synchronize()

# Compare the cuDNN gradients with the PyTorch autograd reference.
# assert_close takes (actual, expected): the cuDNN result goes first, matching
# the forward check, so the relative tolerance is scaled by the reference.
# NOTE(review): the reference loss is an MSELoss with its default reduction, so
# the gradients appear to be scaled by 1 / (batch * seq_size * embedding_dim);
# atol=2e-4 may then exceed the gradient magnitudes themselves - confirm that
# this check is tight enough to catch a wrong result.
torch.testing.assert_close(d_x_gpu, x_gpu.grad, atol=2e-4, rtol=2e-4)
torch.testing.assert_close(d_scale_gpu, scale_gpu.grad, atol=2e-4, rtol=2e-4)
torch.testing.assert_close(d_bias_gpu, bias_gpu.grad, atol=2e-4, rtol=2e-4)

Perform Cleanup

In [16]:
# Destroy the cuDNN handle created with cudnn.create_handle() in the setup cell.
cudnn.destroy_handle(handle)