# 1 Model Construction
## 1.1 Module class

In [1]:
import torch
from torch import nn

class MLP(nn.Module):
    """Multilayer perceptron: 784 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self, **kwargs):
        # Extra keyword arguments are forwarded to nn.Module's constructor.
        super().__init__(**kwargs)
        self.hidden = nn.Linear(784, 256)
        self.act = nn.ReLU()
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        """Apply the hidden layer, the activation and the output layer to ``x``."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

In [2]:
# Instantiate the network; printing a Module lists its registered sub-modules.
net = MLP()
print(net)

MLP(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)


## 1.2 Implement a class that has the same function as the Sequential class 

In [3]:
from collections import OrderedDict


class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Accepts either a single ``OrderedDict`` mapping names to modules, or any
    number of modules as positional arguments (named "0", "1", ... in order).
    ``forward`` feeds the input through the registered modules one by one.
    """

    # NOTE: OrderedDict is imported above the class rather than in the class
    # body. A name bound in a class body is not visible from inside methods,
    # so the isinstance() check would raise NameError for single-argument calls.
    def __init__(self, *args):
        super(MySequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            # Named form: keep the caller's keys as module names.
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            # Positional form: enumerate yields indices starting from 0,
            # which are used (as strings) for the module names.
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def forward(self, x):
        """Pass ``x`` through every registered module in registration order."""
        # self._modules is an ordered mapping of name -> module.
        for module in self._modules.values():
            x = module(x)
        return x

In [4]:
# Build the 784 -> 256 -> 10 network from positional layers; the names
# "0", "1", "2" in the printout are the indices assigned in __init__.
net = MySequential(
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10)
)
print(net)


MySequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


## 1.3 Difference between `Sequential` and `ModuleList`
`Sequential` and `ModuleList` can both be used to hold a network consisting of multiple layers, much like a list.

`Sequential` builds a network in order and computes the output from the input by calling the layers one after another, so the output shape of each layer must be compatible with the input shape of the next. `forward()` is defined automatically.

`ModuleList` can NOT be used to calculate the output by itself. Its only purpose is to store a list of layers, so we need to define `forward()` ourselves.

In [5]:
class MyModule(nn.Module):
    """Ten 10x10 linear layers kept in an ``nn.ModuleList`` with a hand-written ``forward``."""

    def __init__(self):
        super().__init__()
        layers = [nn.Linear(10, 10) for _ in range(10)]
        self.linears = nn.ModuleList(layers)

    def forward(self, x):
        """At step ``idx`` add the outputs of layer ``idx // 2`` and layer ``idx``."""
        # A ModuleList supports len() and integer indexing like a Python list.
        for idx in range(len(self.linears)):
            first = self.linears[idx // 2](x)
            second = self.linears[idx](x)
            x = first + second
        return x

## 1.4 `ModuleDict`
We also need to define `forward()` by ourselves.

In [6]:
# Build a ModuleDict from a plain dict, then add one more entry by key.
net = nn.ModuleDict({
    "linear": nn.Linear(784, 256),
    "act": nn.ReLU(),
})
net["output"] = nn.Linear(256, 10)
# Entries can be read either as attributes or by key.
print(net.linear)
print(net["output"])
print(net)

Linear(in_features=784, out_features=256, bias=True)
Linear(in_features=256, out_features=10, bias=True)
ModuleDict(
  (act): ReLU()
  (linear): Linear(in_features=784, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)


# 2 Parameter Management

In [7]:
import torch
from torch import nn
from torch.nn import init

net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))  # PyTorch has already applied its default initialization

print(net)

Sequential(
  (0): Linear(in_features=4, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)


## 2.1 Parameter Access

In [8]:
# Access to parameters of all layers
# named_parameters() returns a generator, not a list.
print(type(net.named_parameters()))
# parameters() yields only the parameter tensors ...
for param in net.parameters():
    print(param.size())
print()
# ... while named_parameters() yields (name, parameter) pairs.
for name, param in net.named_parameters():
    print(name, param.size())

<class 'generator'>
torch.Size([3, 4])
torch.Size([3])
torch.Size([1, 3])
torch.Size([1])

0.weight torch.Size([3, 4])
0.bias torch.Size([3])
2.weight torch.Size([1, 3])
2.bias torch.Size([1])


In [9]:
# Access to parameters of one specific layer:
# net[0] selects the first layer of the Sequential, and the names it
# reports are relative to that layer ("weight", "bias").
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))

weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'>
bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'>


`torch.nn.Parameter` is a subclass of `Tensor`. All `Parameter` objects assigned to a module are automatically registered and returned by `net.named_parameters()`.

## 2.2 Parameter Initialization

In [10]:
from torch.nn import init

# Re-initialize every parameter of `net` in place:
# weights are drawn from a normal distribution (mean 0, std 0.01),
# biases are set to the constant 0.
for name, param in net.named_parameters():
    if "weight" in name:
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)
    if "bias" in name:
        init.constant_(param, val=0)
        print(name, param.data)

0.weight tensor([[-0.0162, -0.0005, -0.0259, -0.0084],
        [-0.0125, -0.0123,  0.0036,  0.0029],
        [-0.0108, -0.0059,  0.0003,  0.0111]])
0.bias tensor([0., 0., 0.])
2.weight tensor([[ 0.0044, -0.0031,  0.0052]])
2.bias tensor([0.])


## 2.3 Custom Initialization
Note: Parameter initialization should NOT be tracked. 2 Methods:
- `with torch.no_grad():`
- using `param.data`
### 2.3.1 Implement a custom `init.normal_`

In [11]:
def normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with samples from a normal distribution.

    Args:
        tensor: the tensor (or Parameter) to overwrite.
        mean: mean of the distribution.
        std: standard deviation of the distribution.

    Returns:
        The same ``tensor`` object, after being filled.
    """
    # Run under no_grad so the in-place fill is not recorded by autograd.
    with torch.no_grad():
        # Tensor.normal_ takes (mean, std); the tensor is the receiver of the
        # call and must not be passed again as an argument.
        return tensor.normal_(mean, std)

### 2.3.2 Implement a custom probability distribution
With probability 1/2 a weight is initialized to 0, and with probability 1/2 it is drawn uniformly from the two intervals $[-10,-5]$ and $[5,10]$.

And we set bias as 1

In [12]:
def init_weight_(tensor):
    """Fill ``tensor`` in place: uniform on [-10, 10], then zero entries with magnitude below 5."""
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # 1.0 where |value| >= 5, else 0.0; multiplying zeroes the small entries.
        keep_mask = tensor.abs().ge(5).float()
        tensor.mul_(keep_mask)

# Apply the custom initializer to every weight of `net` (biases are left untouched here).
for name, param in net.named_parameters():
    if "weight" in name:
        init_weight_(param)
        print(name, param.data)

0.weight tensor([[ 0.0000, -0.0000, -5.5244,  0.0000],
        [ 0.0000, -6.3243, -0.0000,  8.9183],
        [ 0.0000,  0.0000,  0.0000, -6.6984]])
2.weight tensor([[-5.9246,  0.0000,  7.7134]])


In [13]:
# Add 1 to every bias through `.data`, so the update is not tracked by autograd.
# The biases were set to 0 earlier, so they end up at 1.
# NOTE: this is an increment, so re-running the cell adds 1 again.
for name, param in net.named_parameters():
    if "bias" in name:
        param.data += 1 
        print(name, param.data)

0.bias tensor([1., 1., 1.])
2.bias tensor([1.])


## 2.4 Tied Parameters
Share parameters across multiple layers

$h = w_1x,\ y = w_2h = w_1w_2x,\ dy/dw_2 = h,\ dh/dw_1 = x,\ dy/dw_1 = w_2x$ (which equals $h$ when $w_1 = w_2$, as in the example below where both weights are 3)
- Sharing Parameters: `net[0].weight.grad = h + h`
- Not Sharing Parameters:  `net[0].weight.grad = h, net[1].weight.grad = h`

In [14]:
# net_shared uses the very same Linear object twice (tied weights);
# net_unshared uses two independent Linear objects.
linear = nn.Linear(1, 1, bias=False)
linear2 = nn.Linear(1, 1, bias=False)
linear3 = nn.Linear(1, 1, bias=False)
net_shared = nn.Sequential(linear, linear)
net_unshared = nn.Sequential(linear2, linear3)
print(net_shared)

# network sharing parameters
# (named_parameters() reports the shared weight only once, as "0.weight")
print("Sharing Parameters: ")
for name, param in net_shared.named_parameters():
    init.constant_(param, val=3)
    print(name, param.data)
    

print("Not Sharing Parameters: ")
# network not sharing parameters
for name, param in net_unshared.named_parameters():
    init.constant_(param, val=3)
    print(name, param.data)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
Sharing Parameters: 
0.weight tensor([[3.]])
Not Sharing Parameters: 
0.weight tensor([[3.]])
1.weight tensor([[3.]])


In [15]:
# In net_shared both positions hold the same Linear object, so the module
# and its weight have identical ids (same object in memory).
print(id(net_shared[0]) == id(net_shared[1]))
print(id(net_shared[0].weight) == id(net_shared[1].weight))

# In net_unshared the two layers are distinct objects with distinct weights.
print(id(net_unshared[0]) == id(net_unshared[1]))
print(id(net_unshared[0].weight) == id(net_unshared[1].weight))


True
True
False
False


In [16]:
# Gradients from every use of a shared parameter are accumulated into the
# same .grad tensor, so the tied weight receives the sum of both contributions.
x1 = torch.ones((1, 1), requires_grad=True)
x2 = torch.ones((1, 1), requires_grad=True)
y_shared = net_shared(x1).sum()
y_shared.backward()
y_unshared = net_unshared(x2).sum()
y_unshared.backward()

# Shared: both entries show the same accumulated gradient.
# Unshared: each weight holds only its own gradient.
print(net_shared[0].weight.grad, net_shared[1].weight.grad)
print(net_unshared[0].weight.grad, net_unshared[1].weight.grad)

tensor([[6.]]) tensor([[6.]])
tensor([[3.]]) tensor([[3.]])


# 3 Custom layers
## 3.1 Layers without Parameters
A layer that subtracts the mean from the input

In [17]:
import torch
from torch import nn

class CenteredLayer(nn.Module):
    """Parameter-free layer that shifts its input so that its overall mean becomes zero."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x):
        """Return ``x`` minus the mean taken over all of its elements."""
        offset = x.mean()
        return x - offset

# The mean of [1, 2, 3, 4, 5] is 3, so the result is centred around 0.
layer = CenteredLayer()
layer(torch.tensor([1,2,3,4,5], dtype=torch.float))

tensor([-2., -1.,  0.,  1.,  2.])

## 3.2 Layers with Parameters
We can use `ParameterList` and `ParameterDict` to define the parameters we need.

In [18]:
class MyDense(nn.Module):
    """Chain of matrix products whose weights live in an ``nn.ParameterList``.

    Holds three 4x4 weights followed by one 4x1 weight.
    """

    def __init__(self):
        super().__init__()
        square_weights = [nn.Parameter(torch.randn(4, 4)) for _ in range(3)]
        self.params = nn.ParameterList(square_weights)
        # A ParameterList can be extended after construction, like a list.
        self.params.append(nn.Parameter(torch.randn(4, 1)))

    def forward(self, x):
        """Multiply ``x`` by each stored weight in order."""
        for weight in self.params:
            x = torch.mm(x, weight)
        return x

# Printing the module lists each entry of the ParameterList with its size.
net = MyDense()
print(net)

MyDense(
  (params): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 4x4]
      (1): Parameter containing: [torch.FloatTensor of size 4x4]
      (2): Parameter containing: [torch.FloatTensor of size 4x4]
      (3): Parameter containing: [torch.FloatTensor of size 4x1]
  )
)


In [19]:
class MyDictDense(nn.Module):
    """Chain of matrix products whose weights live in an ``nn.ParameterDict``.

    Keys "0", "1", "2" map to 4x4 weights and key "3" maps to a 4x1 weight.
    """

    def __init__(self):
        super().__init__()
        square_weights = {
            str(k): nn.Parameter(torch.randn(4, 4)) for k in range(3)
        }
        self.params = nn.ParameterDict(square_weights)
        # A ParameterDict can be extended after construction with update().
        last_weight = nn.Parameter(torch.randn(4, 1))
        self.params.update({"3": last_weight})

    def forward(self, x):
        """Multiply ``x`` by each stored weight, following the dict's key order."""
        for key in self.params.keys():
            weight = self.params[key]
            x = torch.mm(x, weight)
        return x

# Printing the module lists each key of the ParameterDict with its size.
net = MyDictDense()
print(net)

MyDictDense(
  (params): ParameterDict(
      (0): Parameter containing: [torch.FloatTensor of size 4x4]
      (1): Parameter containing: [torch.FloatTensor of size 4x4]
      (2): Parameter containing: [torch.FloatTensor of size 4x4]
      (3): Parameter containing: [torch.FloatTensor of size 4x1]
  )
)


# 4 File I/O
## 4.1 Read/Write Tensor

In [20]:
import torch
from torch import nn

# Write
import os

# torch.save does not create missing parent directories, so make sure the
# target folder exists first (otherwise this cell fails on a fresh checkout).
os.makedirs("./data/FileIO", exist_ok=True)

x = torch.ones(3)
torch.save(x, "./data/FileIO/x.pt")

In [21]:
# Read the tensor back from the file written in the previous cell
x2 = torch.load("./data/FileIO/x.pt")
x2

tensor([1., 1., 1.])

In [22]:
# Save multiple tensors at once by wrapping them in a list;
# loading the file returns the list again.
x  = torch.ones(3)
y = torch.zeros(4)
torch.save([x, y], "./data/FileIO/xy.pt")
xy_list = torch.load("./data/FileIO/xy.pt")
xy_list

[tensor([1., 1., 1.]), tensor([0., 0., 0., 0.])]

In [23]:
# Save multiple tensors at once in a dictionary;
# loading the file returns a dict with the same keys.
torch.save({"x": x, "y": y}, "./data/FileIO/xy_dict.pt")
xy = torch.load("./data/FileIO/xy_dict.pt")
xy

{'x': tensor([1., 1., 1.]), 'y': tensor([0., 0., 0., 0.])}

## 4.2 Read/Write Model
### 4.2.1 Transform
`state_dict()` describes the **network** as a dictionary whose keys are the parameter names and whose values are the parameter data (Tensors).

In [24]:
# transform the model with Dictionary
class MLP(nn.Module):
    """Tiny MLP (3 -> 2 -> 1 with ReLU) used to demonstrate ``state_dict()``.

    NOTE: this re-uses the name ``MLP`` from section 1.1; once this cell has
    run, ``MLP`` refers to this smaller network.
    """

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(3, 2)
        self.act = nn.ReLU()
        self.output = nn.Linear(2, 1)

    def forward(self, x):
        """Apply the hidden layer, the activation and the output layer to ``x``."""
        hidden_out = self.hidden(x)
        return self.output(self.act(hidden_out))
net = MLP()
# Set every parameter to 1 so the saved and reloaded values are easy to recognize.
for param in net.parameters():
    torch.nn.init.constant_(param, val=1)
# state_dict() maps "<layer>.<parameter>" names to tensors; `act` has no
# parameters and therefore contributes no entries.
net.state_dict()

OrderedDict([('hidden.weight',
              tensor([[1., 1., 1.],
                      [1., 1., 1.]])),
             ('hidden.bias', tensor([1., 1.])),
             ('output.weight', tensor([[1., 1.]])),
             ('output.bias', tensor([1.]))])

In [25]:
# An optimizer exposes state_dict() as well: 'state' holds per-parameter
# buffers (empty here, no step has been taken yet) and 'param_groups'
# holds the hyper-parameters together with parameter identifiers.
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.001,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [140621300908392,
    140621300789536,
    140621300910048,
    140621300909976]}]}

### 4.2.2 Save and Load
Two methods to save and load:
- with `state_dict()`: 
    ```
    torch.save(model.state_dict(), PATH)
    model = TheModelClass(*args, **kwargs)
    model.load_state_dict(torch.load(PATH))
    ```
- save directly
    ```
    torch.save(model, PATH)
    model = torch.load(PATH)
    ```

In [26]:
# Save
# The model is stored through its state_dict (parameters only).
torch.save(net.state_dict(), "./data/FileIO/net.pt")
# The optimizer is pickled here as a whole object.
# NOTE(review): optimizers do provide state_dict()/load_state_dict(); saving
# optimizer.state_dict() and restoring it into a freshly built optimizer is
# the approach the PyTorch documentation recommends.
torch.save(optimizer, "./data/FileIO/optim.pt")

In [27]:
# Load
# Rebuild the architecture first, then copy the saved parameters into it.
net2 = MLP()
net2.load_state_dict(torch.load("./data/FileIO/net.pt"))

# The optimizer was pickled whole, so torch.load returns the object directly.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which may refuse to unpickle a full optimizer object --
# confirm against the installed version.
optimizer2 = torch.load("./data/FileIO/optim.pt")

# Every parameter should print as all ones, matching what was saved.
for param in net2.parameters():
    print(param)

print(optimizer2)

Parameter containing:
tensor([[1., 1., 1.],
        [1., 1., 1.]], requires_grad=True)
Parameter containing:
tensor([1., 1.], requires_grad=True)
Parameter containing:
tensor([[1., 1.]], requires_grad=True)
Parameter containing:
tensor([1.], requires_grad=True)
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.001
    momentum: 0.9
    nesterov: False
    weight_decay: 0
)


# 5 Use GPU
## 5.1 Device

In [28]:
# Check whether a CUDA-capable GPU is available to PyTorch
import torch
from torch import nn
torch.cuda.is_available()


True

In [29]:
# Check how many GPUs are visible to PyTorch
torch.cuda.device_count()

1

In [30]:
# Check the index of the currently selected GPU
torch.cuda.current_device()

0

In [31]:
# check the name of our GPU with the index
#torch.cuda.get_device_name(0)

## 5.2 Tensor on GPU

In [32]:
# By default a tensor is created in main memory, i.e. on the CPU device
x = torch.tensor([1, 2, 3])
print(x)
print(x.device)

tensor([1, 2, 3])
cpu


In [33]:
# `cuda(i)` copies a tensor to GPU number i and returns the copy;
# here the name `x` is rebound to the GPU copy.
x = x.cuda(0)
print(x)
print(x.device)

tensor([1, 2, 3], device='cuda:0')
cuda:0


In [34]:
# Or specify the device at creation time.
# `device` falls back to the CPU when no GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the tensor directly on `device` ...
x = torch.tensor([1, 2, 3], device=device)
# ... or create it on the CPU and move it with .to(device)
y = torch.tensor([1, 2, 3]).to(device)
x, y

(tensor([1, 2, 3], device='cuda:0'), tensor([1, 2, 3], device='cuda:0'))

In [35]:
# Operations on a GPU tensor run on the GPU, and the result stays on the same device
z = x ** 2
z

tensor([1, 4, 9], device='cuda:0')

In [36]:
# Tensors on different devices can not be calculated together:
# y lives on `device` while x.cpu() is on the CPU, so when `device` is a GPU
# the addition raises RuntimeError. The error is caught and printed so that
# this demonstration does not abort a "Run All" of the notebook.
try:
    z = y + x.cpu()
except RuntimeError as err:
    print("RuntimeError:", err)

RuntimeError: expected device cuda:0 and dtype Long but got device cpu and dtype Long

## 5.3 Model on GPU

In [37]:
# A freshly built model is on the CPU by default;
# inspect the device of its first parameter to confirm.
net = nn.Linear(3, 1)
list(net.parameters())[0].device

device(type='cpu')

In [38]:
# Use `cuda()` to transfer the model to the GPU; the module is modified in
# place, so `net` itself now reports a CUDA device for its parameters.
net.cuda()
list(net.parameters())[0].device

device(type='cuda', index=0)

In [39]:
# The input is moved to the GPU as well so that it is on the same device as
# the model's parameters; the output is then produced on that device.
x = torch.rand(2,3).cuda()
net(x)

tensor([[0.2200],
        [0.1605]], device='cuda:0', grad_fn=<AddmmBackward>)