# 梯度裁剪

防止梯度过大

## norm: calculate the total gradient norm; if total_norm > max_norm, multiply all gradients by (max_norm / total_norm)

```python
# Clip by norm: if the total L2 norm of the gradients exceeds max_norm,
# scale all gradients by max_norm / total_norm
torch.nn.utils.clip_grad.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
```

## value: clamp all gradients between (min=-clip_value, max=clip_value)

```python
# Clip by value: clamp every gradient element into [-clip_value, clip_value]
torch.nn.utils.clip_grad.clip_grad_value_(parameters=model.parameters(), clip_value=0.1)
```

In [36]:
import torch
from torch import nn, optim, Tensor

In [37]:
# Linear layer with one input feature and one output feature
model = nn.Linear(1, 1)
model

Linear(in_features=1, out_features=1, bias=True)

In [38]:
# SGD over the model's parameters with a learning rate of 10
optimizer = optim.SGD(params=model.parameters(), lr=10)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 10
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [39]:
# Mean squared error loss
loss_fn = nn.MSELoss()
loss_fn

MSELoss()

In [40]:
# Show the parameter values before the training loop runs
dict(model.state_dict())

{'weight': tensor([[0.7450]]), 'bias': tensor([-0.0779])}

In [41]:
# Run one SGD step per sample on three points of y = 2x + 1, clipping the
# gradients before every optimizer step.
for x, y in ((0., 1.), (1., 3.), (2., 5.)):
    # Turn each Python scalar into a (1, 1) tensor
    x = torch.tensor(x).reshape(-1, 1)
    y = torch.tensor(y).reshape(-1, 1)

    optimizer.zero_grad()
    y_pred: Tensor = model(x)
    # nn.MSELoss expects (input, target): prediction first, label second
    loss: Tensor = loss_fn(y_pred, y)
    loss.backward()
    # Clip gradient by norm: rescale so the total L2 norm is at most max_norm
    nn.utils.clip_grad.clip_grad_norm_(parameters=model.parameters(), max_norm=1.0, norm_type=2)
    # Clip gradient by value: clamp each gradient element into [-0.01, 0.01],
    # so with lr=10 each parameter moves by at most 0.1 per step
    nn.utils.clip_grad.clip_grad_value_(parameters=model.parameters(), clip_value=0.01)
    optimizer.step()

In [42]:
# Show the parameter values after the three clipped SGD steps
dict(model.state_dict())

{'weight': tensor([[0.9450]]), 'bias': tensor([0.2221])}