In [3]:
import torch

import torch.nn as nn
import torch.optim as optim
import numpy as np

In [18]:
import random

# Seed every RNG this notebook relies on so that the layer initialisations and
# the random tensors created below are reproducible after a kernel restart.
seed = 42
random.seed(seed)
np.random.seed(seed)
# torch.manual_seed is the same function as torch.random.manual_seed, so one
# call is enough.
torch.manual_seed(seed)

# The determinism switches live under torch.backends.cudnn. Assigning to
# torch.backends.deterministic / torch.backends.benchmark only creates unused
# attributes on the module and changes nothing.
torch.backends.cudnn.deterministic = True
# Benchmark mode lets cuDNN auto-tune and pick different kernels from run to
# run, which works against reproducibility, so it is switched off.
torch.backends.cudnn.benchmark = False

In [19]:
# Record the installed PyTorch build next to the results for provenance.
torch.__version__

'2.3.1+cu121'

In [20]:
# Check whether a CUDA device is usable; the cells in this notebook never move
# anything to a GPU, so everything shown here runs on CPU tensors.
torch.cuda.is_available()

True

In [21]:
# Two separate 6 -> 2 linear layers. The generator builds them one after the
# other (l1 first, then l2), so each gets its own freshly initialised weights.
l1, l2 = (nn.Linear(in_features=6, out_features=2) for _ in range(2))

In [22]:
# Snapshot of l1's initial tensors: the 2 x 6 weight matrix and the 2-element bias.
l1.state_dict()

OrderedDict([('weight',
              tensor([[ 0.3121,  0.3388, -0.0956,  0.3750, -0.0894,  0.0824],
                      [-0.1988,  0.2398,  0.3599, -0.2995,  0.3548,  0.0764]])),
             ('bias', tensor([0.3016, 0.0553]))])

In [23]:
# Initial parameters of l1 (weight, then bias), one per line.
print(*l1.parameters(), sep="\n")

Parameter containing:
tensor([[ 0.3121,  0.3388, -0.0956,  0.3750, -0.0894,  0.0824],
        [-0.1988,  0.2398,  0.3599, -0.2995,  0.3548,  0.0764]],
       requires_grad=True)
Parameter containing:
tensor([0.3016, 0.0553], requires_grad=True)


In [24]:
# Initial parameters of l2 (weight, then bias), one per line.
print(*l2.parameters(), sep="\n")

Parameter containing:
tensor([[ 0.1969, -0.0576,  0.3147,  0.0603, -0.1906,  0.1041],
        [-0.1881, -0.0479, -0.1658,  0.2708, -0.3223, -0.1882]],
       requires_grad=True)
Parameter containing:
tensor([-0.1153, -0.2455], requires_grad=True)


In [25]:
# Fixed toy batch: 10 samples with 6 features each, plus a matching 10 x 2
# target. The input is drawn before the target, both uniformly from [0, 1).
rand_ip, random_tar = torch.rand((10, 6)), torch.rand((10, 2))

In [26]:
# One mean-squared-error criterion per layer.
loss_l1, loss_l2 = nn.MSELoss(), nn.MSELoss()

In [27]:
# Each layer gets its own SGD optimizer (learning rate 0.01): optimizer1 is
# built over l1's parameters and optimizer2 over l2's.
optimizer1, optimizer2 = (
    optim.SGD(layer.parameters(), lr=1e-2) for layer in (l1, l2)
)

In [28]:
# One SGD update of l1 on the fixed random batch, using l1's own optimizer.
l1.train()
# Clear any gradients left from an earlier backward() so they do not accumulate.
optimizer1.zero_grad()
# Forward pass: (10, 6) input -> (10, 2) predictions.
preds = l1(rand_ip)
# Mean-squared error between the predictions and the random targets.
loss_1 = loss_l1(preds, random_tar)
# Back-propagate; this fills .grad on l1's weight and bias.
loss_1.backward()
# Apply the update to the parameters optimizer1 was built with (l1's).
optimizer1.step()

In [29]:
# l1's parameters after optimizer1.step(), for comparison with the initial values.
print(*l1.parameters(), sep="\n")

Parameter containing:
tensor([[ 0.3101,  0.3374, -0.0980,  0.3739, -0.0905,  0.0817],
        [-0.1989,  0.2395,  0.3598, -0.2993,  0.3548,  0.0764]],
       requires_grad=True)
Parameter containing:
tensor([0.2986, 0.0551], requires_grad=True)


In [30]:
# l2's parameters after the l1 update, for comparison with the initial values.
print(*l2.parameters(), sep="\n")

Parameter containing:
tensor([[ 0.1969, -0.0576,  0.3147,  0.0603, -0.1906,  0.1041],
        [-0.1881, -0.0479, -0.1658,  0.2708, -0.3223, -0.1882]],
       requires_grad=True)
Parameter containing:
tensor([-0.1153, -0.2455], requires_grad=True)


In [15]:
# Same forward/backward pass through l1 as the previous training cell, but the
# update is issued through optimizer2.
l1.train()
optimizer1.zero_grad()
preds = l1(rand_ip)
loss_1 = loss_l1(preds, random_tar)
# Gradients are computed for l1's parameters only.
loss_1.backward()
# NOTE(review): optimizer2 was constructed over l2.parameters(), so this step
# does not apply the gradients just computed for l1. Presumably a deliberate
# experiment showing that an optimizer only touches the parameters it was
# given - confirm. If a real second update of l1 was intended, this should be
# optimizer1.step().
optimizer2.step()

In [16]:
# l1's parameters after optimizer2.step(), one per line.
print(*l1.parameters(), sep="\n")

Parameter containing:
tensor([[ 0.3121,  0.3388, -0.0956,  0.3750, -0.0894,  0.0824],
        [-0.1988,  0.2398,  0.3599, -0.2995,  0.3548,  0.0764]],
       requires_grad=True)
Parameter containing:
tensor([0.3016, 0.0553], requires_grad=True)


In [17]:
# l2's parameters after optimizer2.step(), one per line.
print(*l2.parameters(), sep="\n")

Parameter containing:
tensor([[ 0.1969, -0.0576,  0.3147,  0.0603, -0.1906,  0.1041],
        [-0.1881, -0.0479, -0.1658,  0.2708, -0.3223, -0.1882]],
       requires_grad=True)
Parameter containing:
tensor([-0.1153, -0.2455], requires_grad=True)


In [31]:
# No backward pass through l2 appears in this notebook, so its weight is
# expected to have no gradient (None, which the notebook displays as nothing).
l2.weight.grad

In [32]:
# Gradient of loss_1 with respect to l1.weight left by the most recent
# loss_1.backward() call; it has the same 2 x 6 shape as the weight.
l1.weight.grad

tensor([[ 2.0515e-01,  1.4023e-01,  2.3210e-01,  1.1071e-01,  1.0792e-01,
          6.7658e-02],
        [ 1.8303e-02,  2.2207e-02,  1.2788e-02, -2.0245e-02,  2.0396e-04,
          4.8681e-03]])