# Chapter 9: Builders' Guide

## 1. Layers and Modules

In [1]:
import torch
from torch import nn
from torch.nn import functional as F

A network with one fully connected hidden layer with 256 units and ReLU activation, followed by a fully connected output layer with 10 units.

In [2]:
net = nn.Sequential(nn.LazyLinear(256),
                    nn.ReLU(),
                    nn.LazyLinear(10))



In [3]:
X = torch.rand(2, 20)
X

tensor([[0.1714, 0.5922, 0.9716, 0.8877, 0.1616, 0.3221, 0.3684, 0.8593, 0.8961,
         0.2321, 0.4708, 0.0401, 0.8050, 0.2761, 0.1219, 0.8953, 0.7558, 0.3511,
         0.9674, 0.1824],
        [0.0176, 0.6110, 0.5584, 0.9284, 0.8360, 0.0197, 0.9225, 0.6392, 0.0234,
         0.0672, 0.4087, 0.3111, 0.4165, 0.8705, 0.3570, 0.0436, 0.7906, 0.1467,
         0.9183, 0.2950]])

In [4]:
net(X).shape

torch.Size([2, 10])

### Custom Module

The basic functionality that each module must provide:
1. Ingest input data as arguments to its forward propagation method.
2. Generate an output by having the forward propagation method return a value.
3. Calculate the gradient of its output with respect to its input (this happens automatically via backpropagation).
4. Store and provide access to those parameters necessary to execute the forward propagation computation.
5. Initialize model parameters as needed.

In [6]:
class MLP(nn.Module):
    """Multilayer perceptron with a single hidden layer.

    A 256-unit fully connected hidden layer with ReLU activation feeds a
    10-unit fully connected output layer. Both layers are created with
    ``nn.LazyLinear``, so no input size is given here.
    """

    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.out = nn.LazyLinear(10)

    def forward(self, X):
        """Run ``X`` through the hidden layer, ReLU, and output layer.

        Returns the raw output-layer values (logits); no softmax is applied.
        """
        hidden_activations = F.relu(self.hidden(X))
        logits = self.out(hidden_activations)
        return logits

In [7]:
net = MLP()
net(X).shape



torch.Size([2, 10])

## 2. Parameter Management

We sometimes want to extract parameters.

In [8]:
import torch
from torch import nn

In [10]:
net = nn.Sequential(nn.LazyLinear(8),
                    nn.ReLU(),
                    nn.LazyLinear(1))

In [11]:
X = torch.rand(size=(2, 4))
net(X).shape

torch.Size([2, 1])

In [12]:
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

In [14]:
# inspect second fc layer
net[2].state_dict()

OrderedDict([('weight',
              tensor([[ 0.0584, -0.1340, -0.0589,  0.0950, -0.1684,  0.3472, -0.2438, -0.2696]])),
             ('bias', tensor([-0.0610]))])

In [17]:
# bias
net[2].bias.data

tensor([-0.0610])

In [19]:
# get the gradient
net[2].weight.grad # currently None because we have not invoked backprop yet

In [20]:
# all parameters at once
[(name, param.shape) for name, param in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

## 3. Parameter Initialization

In [21]:
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
X = torch.rand(size=(2, 4))
net(X).shape



torch.Size([2, 1])

### Built-in Initialization

In [22]:
# initialize all weight parameters as gaussian random variables
def init_normal(module):
    """Give a fully connected layer small Gaussian weights and a zero bias.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 0.01. A module whose type is not exactly ``nn.Linear`` is left
    untouched, so the function can be handed to ``net.apply``.
    """
    # Guard clause: only plain nn.Linear layers are re-initialized.
    if type(module) is not nn.Linear:
        return
    nn.init.normal_(module.weight, mean=0, std=0.01)
    nn.init.zeros_(module.bias)

In [23]:
net.apply(init_normal)
print(net[0].weight.data[0])
print(net[0].bias.data[0])

tensor([-0.0090, -0.0083,  0.0070, -0.0081])
tensor(0.)


In [24]:
# initialize all parameters to a given constant value
def init_constant(module):
    """Set every weight of a fully connected layer to 1 and its bias to 0.

    A module whose type is not exactly ``nn.Linear`` is left untouched, so
    the function can be handed to ``net.apply``.
    """
    # Guard clause: only plain nn.Linear layers are re-initialized.
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 1)
    nn.init.zeros_(module.bias)

In [25]:
net.apply(init_constant)
print(net[0].weight.data[0])
print(net[0].bias.data[0])

tensor([1., 1., 1., 1.])
tensor(0.)


In [27]:
# xavier initializer for block 0 
def init_xavier(module):
    """Apply Xavier uniform initialization to a fully connected layer's weight.

    The bias is not modified. A module whose type is not exactly
    ``nn.Linear`` is left untouched.
    """
    # Guard clause: only plain nn.Linear layers are re-initialized.
    if type(module) is not nn.Linear:
        return
    nn.init.xavier_uniform_(module.weight)

# initialize another for block 2
def init_42(module):
    """Fill a fully connected layer's weight with the constant 42.

    The bias is not modified. A module whose type is not exactly
    ``nn.Linear`` is left untouched.
    """
    # Guard clause: only plain nn.Linear layers are re-initialized.
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 42)

In [28]:
net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.6924, -0.2867, -0.6265,  0.2923])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


## 4. Custom Layers

### Layers with Parameters

In [31]:
class MyLinear(nn.Module):
    """Fully connected layer with a built-in ReLU activation.

    Args:
        in_units: Number of input features.
        units: Number of output features.

    Both ``weight`` (shape ``(in_units, units)``) and ``bias`` (shape
    ``(units,)``) are registered as trainable parameters and start from
    standard normal samples.
    """

    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units))

    def forward(self, X):
        """Return ``relu(X @ weight + bias)``."""
        # Use the parameters themselves rather than their ``.data``:
        # ``.data`` bypasses autograd, so the output would be detached from
        # ``weight`` and ``bias`` and backpropagation could never reach them.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

In [32]:
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[-0.0602,  0.6252,  1.9651],
        [-0.1552,  0.0113,  0.7562],
        [ 0.0508,  0.5172, -1.1513],
        [-0.3747, -0.4529,  1.3648],
        [ 0.6436, -1.2735,  1.9681]], requires_grad=True)

## 5. File I/O

### Loading and Saving Tensors

In [34]:
x = torch.arange(4)
torch.save(x, 'x-file')

In [35]:
x2 = torch.load('x-file')
x2

tensor([0, 1, 2, 3])

In [36]:
# store a list of tensors
y = torch.zeros(4)
torch.save([x, y],'x-files')

x2, y2 = torch.load('x-files')
(x2, y2)

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

### Loading and Saving Model Parameters

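We don't have to save individual weight vectors one by one. Instead, we can save all of a model's parameters at once and later access each tensor through the model. Note that "saving the entire model" here means saving the model's parameters (its `state_dict`), not the architecture itself, so the architecture must be re-created in code before the saved parameters can be loaded into it.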

In [2]:
class MLP(nn.Module):
    """Two-layer perceptron used to demonstrate saving and loading parameters.

    A 256-unit lazy linear layer (``hidden``) is followed by ReLU and a
    10-unit lazy linear layer (``output``).
    """

    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.output = nn.LazyLinear(10)

    def forward(self, x):
        """Compute the output-layer values for the batch ``x``."""
        h = self.hidden(x)
        h = F.relu(h)
        return self.output(h)

In [3]:
net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)



In [5]:
X

tensor([[-0.4629,  2.4151, -0.1586, -1.7417, -0.4055,  0.9542, -0.6938,  0.8547,
          1.2549,  1.0926, -0.0708, -0.2871,  0.7541, -2.1041,  0.8322, -0.2614,
         -2.2919,  2.9135,  1.4602, -0.3889],
        [ 0.8625,  0.7499,  0.3026, -0.7247, -1.8654, -0.7768,  0.5060, -2.1754,
          0.5526, -0.0147, -0.4160, -0.7490, -0.5715, -0.6287, -0.9951, -0.2225,
         -1.6961, -0.5626,  0.7582, -1.0857]])

In [6]:
Y

tensor([[ 0.2728, -0.0819, -0.1610, -0.2981,  0.4970,  0.0867,  0.2954, -0.1085,
         -0.2120,  0.1068],
        [ 0.3364, -0.1034, -0.5188, -0.2197,  0.3971,  0.0862,  0.1626, -0.1668,
          0.0093,  0.0308]], grad_fn=<AddmmBackward0>)

In [7]:
# store parameters with name 'mlp.params'
torch.save(net.state_dict(), 'mlp.params')

In [8]:
# load
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()



MLP(
  (hidden): LazyLinear(in_features=0, out_features=256, bias=True)
  (output): LazyLinear(in_features=0, out_features=10, bias=True)
)

In [10]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

## 6. GPUs

In PyTorch, every array has a device. By default, all variables and associated computation have been assigned to the CPU. When we train a neural network on a server with GPU, we prefer the model's parameters to live on the GPU.

In [11]:
!nvidia-smi

Mon Apr 15 16:55:52 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 527.99       Driver Version: 527.99       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   38C    P8    11W /  40W |    887MiB /  6144MiB |      7%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [13]:
def cpu():
    """Return the ``torch.device`` object that designates the CPU."""
    cpu_device = torch.device('cpu')
    return cpu_device

def gpu(i=0):
    """Return the ``torch.device`` object for CUDA device number ``i``.

    Only the device handle is built from the string ``cuda:<i>``; the
    function does not check that such a GPU is actually present.
    """
    device_spec = f'cuda:{i}'
    return torch.device(device_spec)

In [14]:
cpu()

device(type='cpu')

In [15]:
gpu()

device(type='cuda', index=0)

In [16]:
gpu(1)

device(type='cuda', index=1)

In [17]:
### Check the number of available GPUs
def num_gpus():
    """Return the GPU count reported by ``torch.cuda.device_count()``."""
    gpu_count = torch.cuda.device_count()
    return gpu_count

In [18]:
num_gpus()

1

### Tensors and GPUs

By default, all tensors are created on the CPU.

In [19]:
# check where is the tensor located
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

Whenever we want to operate on multiple terms, they need to be on the same device!

In [22]:
X = torch.ones(2, 3,device=gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

When we print tensors or convert them to the NumPy format, if the data is not in main memory, the framework will first copy it to main memory. This results in additional transmission overhead.

### Neural Networks and GPUs

As long as all the data and parameters are on the same device, we can learn models efficiently.

In [23]:
net = nn.Sequential(nn.LazyLinear(1))
net = net.to(device=gpu())



In [24]:
net(X)

tensor([[-0.0787],
        [-0.0787]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [29]:
net[0].weight.data.device

device(type='cuda', index=0)