# 初始化模型参数

下面以多层感知机（MLP）为例，演示如何初始化模型参数。

In [2]:
from mxnet.gluon import nn
from mxnet import nd

def get_net():
    """Build and return a small two-layer MLP.

    The network is a ``Sequential`` container holding a 4-unit ``Dense``
    layer with ReLU activation followed by a 2-unit ``Dense`` layer.
    No input size is given and no parameters are initialized here.
    """
    mlp = nn.Sequential()
    # Layers are created inside the container's name scope, in order.
    with mlp.name_scope():
        for layer in (nn.Dense(4, activation='relu'), nn.Dense(2)):
            mlp.add(layer)
    return mlp

# Sample input of shape (3, 5) filled with uniform random values; reused by the cells below.
x = nd.random.uniform(shape=(3,5))

如果不先初始化参数就直接调用网络，系统会报错，因此必须先进行初始化。

In [3]:
import sys
# Calling a freshly built network before initialize() is expected to fail.
# Catch the error and echo its message to stderr instead of aborting the cell.
try:
    net = get_net()
    net(x)
except RuntimeError as exc:
    sys.stderr.write(str(exc))

Parameter 'sequential0_dense0_bias' has not been initialized. Note that you should initialize parameters and create Trainer with Block.collect_params() instead of Block.params because the later does not include Parameters of nested child Blocks

In [4]:
# Notebook echo of the network structure; in the recorded output the input sizes print as None.
net

Sequential(
  (0): Dense(None -> 4, Activation(relu))
  (1): Dense(None -> 2, linear)
)

In [5]:
# Initialize the parameters with the default initializer, then run a forward pass on x.
net.initialize()
net(x)


[[0.00212593 0.00365805]
 [0.00161272 0.00441845]
 [0.00204872 0.00352518]]
<NDArray 3x2 @cpu(0)>

## 访问模型参数

之前我们提到过，可以通过`weight`和`bias`访问`Dense`层的参数，它们都是`Parameter`类的实例：

In [6]:
# Each Dense layer exposes its parameters through the `weight` and `bias` attributes.
w = net[0].weight
b = net[0].bias
# A single pre-formatted string keeps this call valid on both Python 2 and
# Python 3; the spacing reproduces what the old comma-separated print
# statement emitted (one extra space between items).
print('name:  %s \nweight:  %s \nbias:  %s' % (net[0].name, w, b))

name:  sequential0_dense0 
weight:  Parameter sequential0_dense0_weight (shape=(4L, 5L), dtype=float32) 
bias:  Parameter sequential0_dense0_bias (shape=(4L,), dtype=float32)


然后通过`data()`访问参数的值，通过`grad()`访问对应的梯度：

In [7]:
# data() returns a parameter's current values and grad() its gradient buffer.
# Each line is printed as one pre-formatted string so the cell runs on both
# Python 2 and Python 3; the double space after each label matches the output
# of the original comma-separated print statements.
print('weight:  %s' % (w.data(),))
print('weight gradient:  %s' % (w.grad(),))
print('bias:  %s' % (b.data(),))
print('bias gradient:  %s' % (b.grad(),))

weight:  
[[-0.06206018  0.06491279 -0.03182812 -0.01631819 -0.00312688]
 [ 0.0408415   0.04370362  0.00404529 -0.0028032   0.00952624]
 [-0.01501013  0.05958354  0.04705103 -0.06005495 -0.02276454]
 [-0.0578019   0.02074406 -0.06716943 -0.01844618  0.04656678]]
<NDArray 4x5 @cpu(0)>
weight gradient:  
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
<NDArray 4x5 @cpu(0)>
bias:  
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>
bias gradient:  
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>


我们可以通过`collect_params`来访问`Block`中的所有参数（包含所有子Block的参数）。它会返回一个从名字到对应`Parameter`的dict。既可以用常规的`[]`按完整名字访问，也可以用`get()`访问；`get()`不需要填写名字前缀。

In [8]:
# collect_params() gathers the parameters of this block together with those of its child layers.
params = net.collect_params()
print(params)
# Look a parameter up by its full name with [] ...
print(params['sequential0_dense0_bias'].data())
# ... or with get(), passing the name without the container prefix.
print(params.get('dense0_weight').data())

sequential0_ (
  Parameter sequential0_dense0_weight (shape=(4L, 5L), dtype=float32)
  Parameter sequential0_dense0_bias (shape=(4L,), dtype=float32)
  Parameter sequential0_dense1_weight (shape=(2L, 4L), dtype=float32)
  Parameter sequential0_dense1_bias (shape=(2L,), dtype=float32)
)

[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>

[[-0.06206018  0.06491279 -0.03182812 -0.01631819 -0.00312688]
 [ 0.0408415   0.04370362  0.00404529 -0.0028032   0.00952624]
 [-0.01501013  0.05958354  0.04705103 -0.06005495 -0.02276454]
 [-0.0578019   0.02074406 -0.06716943 -0.01844618  0.04656678]]
<NDArray 4x5 @cpu(0)>


## 使用不同的初始化函数初始化

我们一直使用默认的`initialize`来初始化权重。它会把所有权重初始化为`[-0.07, 0.07]`之间均匀分布的随机数。我们也可以采用别的初始化方法，比如均值为0、标准差为0.02的正态分布（`sigma`是标准差，而不是方差）：

In [9]:
from mxnet import init
# Re-initialize the already-initialized parameters with init.Normal(sigma=0.02);
# force_reinit=True is passed because they already hold values.
# In the recorded output only the weight changes; the bias is still all zeros.
params.initialize(init=init.Normal(sigma=0.02), force_reinit=True)
print(net[0].weight.data(), net[0].bias.data())

(
[[ 0.02804598  0.00220872  0.00701151  0.02721515  0.00500832]
 [ 0.00112992  0.03227538 -0.01813176 -0.00385197 -0.01286032]
 [ 0.03360647 -0.02855298 -0.03083278 -0.02110904 -0.02623655]
 [-0.00293494  0.01282986 -0.01476416  0.04062728  0.01186533]]
<NDArray 4x5 @cpu(0)>, 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>)


也可以把所有权重都初始化为1：

In [10]:
# Re-initialize again, this time with init.One(); the recorded output shows
# the weight filled with ones while the bias is still all zeros.
params.initialize(init=init.One(), force_reinit=True)
print(net[0].weight.data(), net[0].bias.data())

(
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
<NDArray 4x5 @cpu(0)>, 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>)


自定义一个初始化方法。注意：从下面的输出可以看到，`_init_bias`并没有被调用，偏置仍然是0；这大概是因为`Dense`层为偏置单独指定了初始化方式。

In [19]:
class Myinit(init.Initializer):
    """Custom initializer that reports the shape of every array it fills.

    Weights are filled with uniform random values between 5 and 10 and
    biases are set to 2. Each hook prints the shape of the array it was
    handed, which makes it easy to see when initialization really happens.

    NOTE(review): in the recorded runs in this file no 'init_bias' line is
    ever printed and the biases stay at 0, so ``_init_bias`` does not appear
    to be invoked for ``Dense`` biases -- presumably the layer supplies its
    own bias initializer; confirm against the Gluon documentation.
    """

    def __init__(self):
        super(Myinit, self).__init__()
        # NOTE(review): sets a private flag on the base initializer;
        # presumably enables its verbose reporting -- confirm.
        self._verbose = True

    def _init_weight(self, _, arr):
        """Fill ``arr`` in place with uniform samples (low=5, high=10)."""
        # Single pre-formatted string: valid on Python 2 and 3, and on
        # Python 2 it prints exactly what the old print statement did.
        print('init_weight:  %s' % (arr.shape,))
        nd.random.uniform(low=5, high=10, out=arr)

    def _init_bias(self, _, arr):
        """Set every element of ``arr`` to 2."""
        arr[:] = 2
        print('init_bias:  %s' % (arr.shape,))
        
        
# Re-initialize the existing parameters with the custom initializer and show
# the first layer's weight and bias afterwards.
params.initialize(init=Myinit(), force_reinit=True)
# print(...) with a single argument behaves identically on Python 2 and 3.
print(net[0].weight.data())
print(net[0].bias.data())

init_weight:  (4L, 5L)
init_weight:  (2L, 4L)

[[6.8274803 9.877607  5.08314   9.2790165 6.153712 ]
 [5.0585704 8.824558  6.7998905 9.720617  8.649953 ]
 [8.749996  5.8581486 6.697019  7.605183  7.447745 ]
 [5.27169   6.6949253 5.999983  5.8974514 5.092609 ]]
<NDArray 4x5 @cpu(0)>

[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>


## 延后初始化

Gluon的一个优势是创建网络时不需要指定输入大小，之后做forward时会自动推断参数的形状。

如下：

In [20]:
# Build a fresh, uninitialized network and list its parameters; in the recorded
# output the not-yet-known input dimension of each weight shows up as 0.
net = get_net()
print(net.collect_params())

sequential1_ (
  Parameter sequential1_dense0_weight (shape=(4, 0), dtype=float32)
  Parameter sequential1_dense0_bias (shape=(4,), dtype=float32)
  Parameter sequential1_dense1_weight (shape=(2, 0), dtype=float32)
  Parameter sequential1_dense1_bias (shape=(2,), dtype=float32)
)


然后我们初始化，但并没有看到`Myinit`打印的内容，这是因为此时仍然不知道参数的形状。真正的初始化会推迟到网络第一次看到数据时才进行。

In [22]:
# Request initialization with the custom initializer; no 'init_weight' output
# is recorded for this cell, i.e. nothing has actually been filled yet.
net.initialize(init=Myinit())

In [23]:
# First forward pass on x; the 'init_weight' lines from Myinit appear in this cell's output.
net(x)

init_weight:  (4L, 5L)
init_weight:  (2L, 4L)



[[864.09644 684.8019 ]
 [871.0519  691.282  ]
 [696.2539  554.8877 ]]
<NDArray 3x2 @cpu(0)>

可以看到，参数的形状中已经包含了输入大小：

In [24]:
# List the parameters again; the recorded output now shows complete weight shapes.
print(net.collect_params())

sequential1_ (
  Parameter sequential1_dense0_weight (shape=(4L, 5L), dtype=float32)
  Parameter sequential1_dense0_bias (shape=(4L,), dtype=float32)
  Parameter sequential1_dense1_weight (shape=(2L, 4L), dtype=float32)
  Parameter sequential1_dense1_bias (shape=(2L,), dtype=float32)
)


## 避免延后初始化

有时候我们不想延后初始化，这时候需要创建网络的时候指定输入大小

In [26]:
# Every layer is given an explicit in_units, so all parameter shapes are known
# at construction time; the recorded output shows Myinit printing as soon as
# initialize() is called.
net = nn.Sequential()
with net.name_scope():
    for layer in (nn.Dense(4, in_units=5, activation='relu'),
                  nn.Dense(2, in_units=4)):
        net.add(layer)

net.initialize(Myinit())

init_weight:  (4L, 5L)
init_weight:  (2L, 4L)


## 共享模型参数

有时候我们想在层之间共享同一份参数。这时可以在构造层时通过`params`参数传入另一层的`params`，手动指定要使用的参数，而不是让系统自动生成。

In [32]:
# Build a network whose second layer reuses the parameters of the first one.
net = nn.Sequential()
with net.name_scope():
    net.add(nn.Dense(4, in_units=5, activation='relu'))
    # params=net[-1].params hands the previous layer's parameters to the new
    # layer instead of letting it create its own.
    # NOTE(review): layer 0 outputs 4 units while this layer is declared with
    # in_units=5, so a forward pass through this network would presumably fail
    # with a shape mismatch. The net is only inspected here, never called --
    # confirm before reusing it (e.g. use in_units=4 for both shared layers).
    net.add(nn.Dense(4, in_units=5, activation='relu', params=net[-1].params))
    net.add(nn.Dense(2, in_units=4))

net.initialize(Myinit())
# print(...) with a single argument behaves identically on Python 2 and 3.
print(net)
# Both layers print the same values in the recorded output.
print(net[0].weight.data())
print(net[1].weight.data())

init_weight:  (4L, 5L)
init_weight:  (2L, 4L)
Sequential(
  (0): Dense(5 -> 4, Activation(relu))
  (1): Dense(5 -> 4, Activation(relu))
  (2): Dense(4 -> 2, linear)
)

[[7.598556  8.144909  5.0002766 9.363254  6.5593014]
 [6.36771   7.127258  8.990234  9.426688  5.9281797]
 [8.399397  9.763958  7.2806487 8.437441  7.417043 ]
 [6.0775385 8.943697  9.736853  6.147209  8.65428  ]]
<NDArray 4x5 @cpu(0)>

[[7.598556  8.144909  5.0002766 9.363254  6.5593014]
 [6.36771   7.127258  8.990234  9.426688  5.9281797]
 [8.399397  9.763958  7.2806487 8.437441  7.417043 ]
 [6.0775385 8.943697  9.736853  6.147209  8.65428  ]]
<NDArray 4x5 @cpu(0)>


In [31]:
# Block.params does not include the parameters of nested child blocks, so for
# this container the recorded output is empty; use collect_params() to see them.
net.params

sequential5_ (

)