<a href="https://colab.research.google.com/github/Mustafa-AbdulRazek/NLP-Transformers/blob/master/001_pytorch_AutoGrad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torchvision.models import resnet18, ResNet18_Weights


In [2]:
# ResNet-18 initialised with the default pretrained weights.
model = resnet18(weights=ResNet18_Weights.DEFAULT)

# One random 3-channel 64x64 input and a random (1, 1000) tensor used as its target.
data = torch.rand((1, 3, 64, 64))
labels = torch.rand((1, 1000))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 83.4MB/s]


In [3]:
# Forward pass: run the input through the model to obtain its raw output.
prediction = model(data)

In [4]:
# Reduce the element-wise error to a single scalar so backward() can be called
# without an explicit gradient argument.
loss = torch.sum(prediction - labels)

# Backward pass: autograd computes the gradient of the loss for every
# parameter that requires it and stores it in that parameter's .grad.
loss.backward()

In [5]:
# SGD optimizer registered on every model parameter
# (learning rate 0.01, momentum 0.9).
optim = torch.optim.SGD(
    params=model.parameters(),
    lr=0.01,
    momentum=0.9,
)

In [6]:
# Gradient-descent step: update each registered parameter from its stored .grad.
optim.step()

In [7]:
# Differentiation in Autograd

In [8]:
# Two leaf tensors with gradient tracking switched on, used to look at how
# autograd collects gradients.
a = torch.tensor([2.0, 3.0]).requires_grad_(True)
b = torch.tensor([6.0, 4.0]).requires_grad_(True)

In [9]:
# Build a new tensor Q from a and b:
#   Q = 3a^3 - b^2
Q = 3 * a.pow(3) - b.pow(2)

In [10]:
# Think of a and b as the parameters of a network and of Q as its error:
# training needs the gradients of that error w.r.t. the parameters.
# Q is a vector, so backward() must be given dQ/dQ explicitly, i.e. a tensor
# of ones with the same shape as Q.
external_grad = torch.ones_like(Q)
Q.backward(gradient=external_grad)

In [11]:
# Compare the gradients autograd collected with the analytic derivatives:
#   dQ/da = 9a^2   and   dQ/db = -2b
expected_grad_a = 9 * a.pow(2)
expected_grad_b = -2 * b

print(expected_grad_a == a.grad)
print(expected_grad_b == b.grad)

tensor([True, True])
tensor([True, True])


In [12]:
# Sample a 5x5 tensor of uniform random values and display it.
x = torch.rand((5, 5))
x

tensor([[0.3421, 0.6238, 0.4024, 0.2161, 0.7112],
        [0.8367, 0.1234, 0.0099, 0.3951, 0.8252],
        [0.1315, 0.6146, 0.4437, 0.7350, 0.7663],
        [0.0560, 0.5275, 0.0302, 0.4980, 0.0032],
        [0.7017, 0.5863, 0.7491, 0.1555, 0.7265]])

In [13]:
# Three random 5x5 tensors; only z has gradient tracking enabled.
x = torch.rand((5, 5))
y = torch.rand((5, 5))
z = torch.rand(5, 5, requires_grad=True)

# A result requires gradients as soon as at least one of its inputs does.
a = torch.add(x, y)
b = torch.add(x, z)

print(f"Does `a` require gradients? : {a.requires_grad}")
print(f"Does `b` require gradients?: {b.requires_grad}")

Does `a` require gradients? : False
Does `b` require gradients?: True


In [14]:
# Freeze a pretrained ResNet-18, e.g. as the starting point for fine-tuning.

from torch import nn, optim

model = resnet18(weights=ResNet18_Weights.DEFAULT)

# Switch off gradient tracking for every parameter of the network.
for param in model.parameters():
    param.requires_grad_(False)

In [15]:
# Fine-tune the model on 10 labels.
# In ResNet the classifier is the last linear layer, model.fc. Replace it with
# a fresh 10-way head. The input width is read from the layer being replaced
# rather than hard-coded as 512, so the cell keeps working if the backbone is
# swapped for a ResNet variant with a different feature width.
model.fc = nn.Linear(model.fc.in_features, 10)

# A newly constructed nn.Linear has requires_grad=True on its weight and bias.
# With every other parameter frozen, these are the only parameters of the
# model that compute gradients.

In [17]:
# Optimize only the classifier.

# Use the fully qualified torch.optim.SGD, as the first optimizer cell does.
# The bare name `optim` is bound to an optimizer instance in that earlier cell
# and rebound to the torch.optim module by a later import, so `optim.SGD`
# would only resolve when the cells happen to run in one particular order.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)


# Notice that although all the parameters are registered in the optimizer,
# the only parameters that are computing gradients
# (and hence updated in gradient descent) are the weights and bias of the classifier.