In [3]:
#性能提升
from mxnet import nd,sym
from mxnet.gluon import nn
import time

#符号式编程
#符号式编程通常在计算流程完全定义好后才被执⾏
#使用HybridBlock 进行优化
def get_net():
    net = nn.HybridSequential()
    net.add(nn.Dense(256,activation='relu'),
           nn.Dense(128,activation='relu'),
           nn.Dense(2))
    net.initialize()
    return net
x = nd.random.normal(shape=(1,512))
net = get_net()
net(x)


[[-0.00648382  0.00534949]]
<NDArray 1x2 @cpu(0)>

In [4]:
#通过符号式编程进行优化
net.hybridize()
net(x)


[[-0.00648382  0.00534949]]
<NDArray 1x2 @cpu(0)>

In [8]:
def benchmark(net,x):
    start =time.time()
    for i in range(1000):
        _ = net(x)
    nd.waitall()  
    return time.time() - start
net = get_net()
print('before hybridizing: %.4f sec' % (benchmark(net, x)))
net.hybridize()
print('after hybridizing: %.4f sec' % (benchmark(net, x)))

before hybridizing: 0.3013 sec
after hybridizing: 0.1825 sec


In [None]:
#获取参数
net.export('my_mlp')

In [None]:
#输⼊⼀个Symbol类型的变量，net(x)会返回Symbol类型的结果。
x = sym.var('data')
net(x)

In [13]:
from mxnet import nd,sym
from mxnet.gluon import nn
import time

class HybridNet(nn.HybridBlock):
    def __init__(self,**kwargs):
        super(HybridNet,self).__init__(**kwargs)
        self.hidden = nn.Dense(10)
        self.output = nn.Dense(2)
    
    def hybrid_forward(self,F,x):
        print('F:',F)
        print('x:',x)
        x = F.relu(self.hidden(x))    #激活
        print('hidden:' ,x)
        return self.output(x)
    

In [14]:
net = HybridNet()
net.initialize()
x = nd.random.normal(shape=(1,4))
net(x)

F: <module 'mxnet.ndarray' from 'D:\\anaconda\\Anaconda3\\lib\\site-packages\\mxnet\\ndarray\\__init__.py'>
x: 
[[-0.9130886  -0.02453975  0.40365326  0.60531294]]
<NDArray 1x4 @cpu(0)>
hidden: 
[[0.06474365 0.05111783 0.         0.07812573 0.         0.
  0.         0.         0.02673142 0.0459147 ]]
<NDArray 1x10 @cpu(0)>



[[ 0.00401807 -0.0040347 ]]
<NDArray 1x2 @cpu(0)>

In [15]:
net(x)

F: <module 'mxnet.ndarray' from 'D:\\anaconda\\Anaconda3\\lib\\site-packages\\mxnet\\ndarray\\__init__.py'>
x: 
[[-0.9130886  -0.02453975  0.40365326  0.60531294]]
<NDArray 1x4 @cpu(0)>
hidden: 
[[0.06474365 0.05111783 0.         0.07812573 0.         0.
  0.         0.         0.02673142 0.0459147 ]]
<NDArray 1x10 @cpu(0)>



[[ 0.00401807 -0.0040347 ]]
<NDArray 1x2 @cpu(0)>

In [16]:
net.hybridize()
net(x)

F: <module 'mxnet.symbol' from 'D:\\anaconda\\Anaconda3\\lib\\site-packages\\mxnet\\symbol\\__init__.py'>
x: <Symbol data>
hidden: <Symbol hybridnet2_relu0>



[[ 0.00401807 -0.0040347 ]]
<NDArray 1x2 @cpu(0)>

In [17]:
#异步运算
from mxnet import autograd,gluon ,nd
from mxnet.gluon import loss as gloss,nn
import os
import subprocess
import time

In [18]:
class Benchmark(): # 本类已保存在d2lzh包中⽅便以后使⽤
    def __init__(self, prefix=None):
        self.prefix = prefix + ' ' if prefix else ''
    def __enter__(self):
        self.start = time.time()
    def __exit__(self, *args):
        print('%stime: %.4f sec' % (self.prefix, time.time() - self.start))

In [20]:
with Benchmark('workloads are queued.'):
    x = nd.random.uniform(shape = (2000,2000))
    y = nd.dot(x,x).sum()

with Benchmark('workloads are finished'):
    print('sum = ',y)

workloads are queued. time: 0.1064 sec
sum =  
[1.9997107e+09]
<NDArray 1 @cpu(0)>
workloads are finished time: 0.1143 sec


In [21]:
#使用 wait_to_read()函数进行等待前端计算
#的wait_to_read函数、waitall函数、asnumpy函数、asscalar函数和print函数
#会触发让前端等待后端计算结果的⾏为。这类函数通常称为同步函数。
with Benchmark():
    y = nd.dot(x, x)
    y.wait_to_read()


time: 0.1339 sec


In [23]:
with Benchmark():
    y = nd.dot(x, x)
    z = nd.dot(x, x)
    nd.waitall()

time: 0.2634 sec


In [24]:
with Benchmark():
    y = nd.dot(x, x)
    y.asnumpy()


time: 0.1578 sec


In [25]:
with Benchmark():
    y = nd.dot(x, x)
    y.asnumpy()


time: 0.1505 sec


In [41]:
with Benchmark('synchronous.'):
    for _ in range(1000):
        y = x+1
        y.wait_to_read()

with Benchmark('asynchronous.'):
    for _ in range(100):
         y = x+1
    nd.waitall()

synchronous. time: 2.9804 sec
asynchronous. time: 0.4260 sec


In [42]:
#数据调取 调取时进行计时
def data_iter():
    start = time.time()
    num_batches, batch_size = 100, 1024
    for i in range(num_batches):
        X = nd.random.normal(shape=(batch_size, 512))
        y = nd.ones((batch_size,))
        yield X, y
        if (i + 1) % 50 == 0:
            print('batch %d, time %f sec' % (i + 1, time.time() - start))

In [43]:
net= nn.Sequential()
net.add(nn.Dense(2048,activation='relu'),
       nn.Dense(512,activation='relu'),
       nn.Dense(1))
net.initialize()
trainer = gluon.Trainer(net.collect_params(),'sgd',{'learning_rate':0.005})
loss= gloss.L2Loss()

In [44]:
def get_mem():
#     res = subprocess.check_output(['ps', 'u', '-p', str(os.getpid())])
#     return int(str(res).split()[15]) / 1e3
    return 1

In [45]:
#初始化net参数
for X, y in data_iter():
    break
loss(y, net(X)).wait_to_read()


In [49]:
l_sum, mem = 0, get_mem()
for X, y in data_iter():
    with autograd.record():
        l = loss(y, net(X))
#     l_sum += l.mean().asscalar() # 使⽤同步函数asscalar
    l.backward()
    trainer.step(X.shape[0])
nd.waitall()
print('increased memory: %f MB' % (get_mem() - mem))
    

batch 50, time 0.102590 sec
batch 100, time 0.208885 sec
increased memory: 0.000000 MB


In [48]:
#异步
for X,y in data_iter():
    with autograd.record():
        l = loss(y,net(X))
    l.backward()
    trainer.step(X.shape[0])
nd.waitall()


batch 50, time 0.090140 sec
batch 100, time 0.190224 sec


In [50]:
#并行计算
import d2lzh as d2l
import mxnet as mx
from mxnet import nd

def run(x):
    return [nd.dot(x,x) for _ in range(10)]


In [51]:
x_cpu = nd.random.uniform(shape = (2000,2000))
x_gpu = nd.random.uniform(shape = (6000,6000),ctx=mx.gpu(0))

In [52]:
run(x_cpu)
run(x_gpu)
nd.waitall()

with d2l.Benchmark('Run on CPU.'):
    run(x_cpu)
    nd.waitall()

with d2l.Benchmark('Then run on GPU.'):
    run(x_gpu)
    nd.waitall()

Run on CPU. time: 1.7137 sec
Then run on GPU. time: 2.2291 sec


In [53]:
#并行运行
with d2l.Benchmark('Run on both CPU and GPU in parallel.'):
    run(x_cpu)
    run(x_gpu)
    nd.waitall()

Run on both CPU and GPU in parallel. time: 2.5091 sec


In [55]:
import d2lzh as d2l
import mxnet as mx
from mxnet import autograd, nd
from mxnet.gluon import loss as gloss
import time

In [None]:
# 初始化模型参数
scale = 0.01
W1 = nd.random.normal(scale=scale, shape=(20, 1, 3, 3))
b1 = nd.zeros(shape=20)
W2 = nd.random.normal(scale=scale, shape=(50, 20, 5, 5))
b2 = nd.zeros(shape=50)
W3 = nd.random.normal(scale=scale, shape=(800, 128))
b3 = nd.zeros(shape=128)
W4 = nd.random.normal(scale=scale, shape=(128, 10))
b4 = nd.zeros(shape=10)
params = [W1, b1, W2, b2, W3, b3, W4, b4]