# MXNET 

MXNet ("mix-net") combines (mixes) the best of both worlds: imperative and declarative programming. The Gluon API is extremely friendly to use and production ready.

In [1]:
# pip3 install --no-cache-dir --pre mxnet -f https://dist.mxnet.io/python

In [2]:
import mxnet as mx 
# Activate MXNet's NumPy-compatible semantics; the rest of the notebook uses
# the `mx.np` array interface.
mx.npx.set_np()

### Resources 
#### Tutorials
    1. https://gluon.mxnet.io/index.html
    2. http://d2l.ai/
    3. https://mxnet.incubator.apache.org/versions/master/tutorials/index.html 

#### Awesome mxnet
    1. https://github.com/chinakook/Awesome-MXNet

#### Community (forum)
    1. https://discuss.mxnet.io/
#### API Reference
    1. https://mxnet.incubator.apache.org/versions/master/api/python/docs/api/

## Array creation routines

In [3]:
# Build a 2x3 array from nested tuples; the integer inputs are stored as floats.
mx.np.array(((1,2,3),(5,6,7)))

array([[1., 2., 3.],
       [5., 6., 7.]])

In [4]:
# A 2x3 array filled with ones.
x = mx.np.ones((2, 3))
print(x)

[[1. 1. 1.]
 [1. 1. 1.]]


In [5]:
# Random integers in [low, high): `high` is exclusive, so only -1 and 0 can appear.
y = mx.np.random.randint(low=-1,high=1,size=(2,3))
# Floating-point alternative: y = mx.np.random.uniform(-1, 1, size=(2, 3))
print(y)

[[ 0 -1  0]
 [ 0  0 -1]]


In [6]:
# A 2x3 array where every element is the given fill value (2.0).
x = mx.np.full((2,3), 2.0)
print(x)

[[2. 2. 2.]
 [2. 2. 2.]]


In [7]:
# Basic metadata: dimensions, total number of elements, element type, and the
# device (context) the array lives on.
x.shape, x.size, x.dtype, x.context

((2, 3), 6, dtype('float32'), cpu(0))

## Operations

In [8]:
# Element-wise product (not a matrix product); combining the float `x` with the
# integer `y` gives a floating-point result.
x*y

array([[ 0., -2.,  0.],
       [ 0.,  0., -2.]])

In [9]:
# Element-wise sum of two arrays with the same shape.
x+y

array([[2., 1., 2.],
       [2., 2., 1.]])

In [10]:
# Element-wise exponential; the integer input yields a floating-point result.
mx.np.exp(y)

array([[1.        , 0.36787945, 1.        ],
       [1.        , 1.        , 0.36787945]])

## Indexing/slicing

In [11]:
# Single element at row 1, column 2 (zero-based). The result is a 0-d array,
# not a plain Python int.
y[1,2]

array(-1, dtype=int64)

In [12]:
# All rows, columns 1 and 2 (the stop index 3 is exclusive).
y[:,1:3]

array([[-1,  0],
       [ 0, -1]], dtype=int64)

In [13]:
# Slice assignment modifies `y` in place: the scalar 2 is written to every
# selected element.
y[:,1:3]=2
print(y)

[[0 2 2]
 [0 2 2]]


In [14]:
# First row only (stop index 1 is exclusive), first three columns; the result
# is still two-dimensional.
y[:1,:3]

array([[0, 2, 2]], dtype=int64)

## Broadcasting

Operating between two arrays (tensors) of different dimensionality:
1. Follows numpy semantics
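2. Shapes are compared dimension by dimension, starting from the trailing (last) one; two dimensions are compatible when they are equal or when one of them is 1 (missing leading dimensions are treated as 1)
3. A dimension of size 1 is repeated (broadcast) along that axis to match the other array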

In [15]:
# Broadcasting a (1, 3) row against a (3, 3) matrix: the single row of `y`
# is reused for every row of `x`.
separator = "-----------------"
x = mx.np.ones(shape=(3, 3))
y = mx.np.array([[0, 1, 2]])  # shape (1, 3)

labelled_values = (
    ('x = ', x),
    ('y = ', y),
    ('y.shape= ', y.shape),
    ('x + y = ', x + y),
    ('x * y = ', x * y),
)
for label, value in labelled_values:
    print(separator)
    print(label, value)
print(separator)

-----------------
x =  [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
-----------------
y =  [[0. 1. 2.]]
-----------------
y.shape=  (1, 3)
-----------------
x + y =  [[1. 2. 3.]
 [1. 2. 3.]
 [1. 2. 3.]]
-----------------
x * y =  [[0. 1. 2.]
 [0. 1. 2.]
 [0. 1. 2.]]
-----------------


Make explicit the dimension you want to broadcast along: here `y` has shape (3, 1), so its single column is repeated across the columns of `x`.

In [16]:
# Broadcasting a (3, 1) column against a (3, 3) matrix: the single column of
# `y` is reused for every column of `x`.
separator = "-----------------"
x = mx.np.ones(shape=(3, 3))
y = mx.np.array([[0], [1], [2]])  # shape (3, 1)

labelled_values = (
    ('x = ', x),
    ('y = ', y),
    ('y.shape= ', y.shape),
    ('x + y = ', x + y),
    ('x * y = ', x * y),
)
for label, value in labelled_values:
    print(separator)
    print(label, value)

-----------------
x =  [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
-----------------
y =  [[0.]
 [1.]
 [2.]]
-----------------
y.shape=  (3, 1)
-----------------
x + y =  [[1. 1. 1.]
 [2. 2. 2.]
 [3. 3. 3.]]
-----------------
x * y =  [[0. 0. 0.]
 [1. 1. 1.]
 [2. 2. 2.]]


## Back and forth to numpy

In [17]:
# MXNet array -> NumPy array.
a = x.asnumpy()
print(a)

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


In [18]:
# NumPy array -> MXNet array.
x = mx.np.array(a)
print(x)

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


## Managing context (device) - CPU/GPU

### Find out how many available gpus exist

In [19]:
# Print the number of available GPUs
print(mx.util.get_gpu_count())
# This is a list of the available indices that correspond to GPU devices
print(list(mx.test_utils.list_gpus()))

1
[0]


In [20]:
# Create an array directly on a GPU; the index can be different if more than
# one GPU is available.
a = mx.np.ones((2, 2), ctx=mx.gpu(0))
# Without `ctx` the array is created on the default device/context, mx.cpu().
# NOTE: this rebinds `a`, so only the CPU array is printed below.
a = mx.np.ones((2, 2))
print(a)
print(a.context)

[[1. 1.]
 [1. 1.]]
cpu(0)


In [21]:
# Alternative way to place an array on a given context after creation.
# Here `a` already lives on the CPU, so the printed context is unchanged;
# pass mx.gpu(0) instead to move it to the GPU.
a = a.as_in_context(mx.cpu())
print(a)
print(a.context)

[[1. 1.]
 [1. 1.]]
cpu(0)


### In order to perform operations between tensors (vectors), both of them must live in the same device (context)

In [22]:
# Two random 3x3 arrays, deliberately placed on different devices.
a = mx.np.random.rand(3, 3, ctx=mx.cpu())  # lives on CPU
print(a)
b = mx.np.random.rand(3, 3, ctx=mx.gpu())  # lives on GPU
print(b)

[[0.5488135  0.5928446  0.71518934]
 [0.84426576 0.60276335 0.8579456 ]
 [0.5448832  0.8472517  0.4236548 ]]
[[0.74021935 0.9209938  0.03902049]
 [0.9689629  0.92514056 0.4463501 ]
 [0.6673192  0.10993068 0.4702186 ]] @gpu(0)


In [23]:
# Correct operation: bring both operands to the same device first.
print(a.as_in_context(mx.gpu()) + b)  # copy a to the GPU, result lives on GPU
print(a + b.as_in_context(mx.cpu()))  # copy b to the CPU, result lives on CPU

[[1.2890329 1.5138384 0.7542098]
 [1.8132286 1.5279039 1.3042958]
 [1.2122023 0.9571824 0.8938734]] @gpu(0)
[[1.2890329 1.5138384 0.7542098]
 [1.8132286 1.5279039 1.3042958]
 [1.2122023 0.9571824 0.8938734]]


In [24]:
# To err is to learn: adding arrays that live on different devices fails.
# The error is caught and printed so that the demonstration does not abort
# "Restart & Run All" for the cells that follow.
try:
    print(a + b)
except mx.base.MXNetError as err:
    print("MXNetError:", err)

MXNetError: Traceback (most recent call last):
  File "../src/imperative/./imperative_utils.h", line 136
MXNetError: Check failed: inputs[i]->ctx().dev_mask() == ctx.dev_mask() (2 vs. 1) : Operator _npi_add require all inputs live on the same context. But the first argument is on cpu(0) while the 2-th argument is on gpu(0)

## Some basic linear algebra

### Scalars

In [25]:
# A one-element array ...
a = mx.np.array([3])
print("This is a as an array:= ", a)
# ... and the same value extracted as a standard Python float.
a = a.item()
print("this is a as a float:= ", a)

This is a as an array:=  [3.]
this is a as a float:=  3.0


### matrix-vector product (not broadcasting!)

In [26]:
# A (2, 5) matrix and a vector declared explicitly as a (5, 1) column.
a = mx.np.random.rand(2, 5)
print("a= ", a)
b = mx.np.random.rand(5, 1)  # compare with the 1-D declaration further below
print("b= ", b)

a=  [[0.6235637  0.6458941  0.3843817  0.4375872  0.2975346 ]
 [0.891773   0.05671298 0.96366274 0.2726563  0.3834415 ]]
b=  [[0.47766513]
 [0.79172504]
 [0.8121687 ]
 [0.5288949 ]
 [0.47997716]]


In [27]:
# Tip: there also exists batch_dot, for a dot product per batch element.
# (2, 5) dot (5, 1) -> (2, 1)
result = mx.np.dot(a, b)
print("result= ", result)
print("shape of output matrix: ", result.shape)

result=  [[1.4956555]
 [1.5817764]]
shape of output matrix:  (2, 1)


### Compare with declaring matrix b only with its first dimension 

In [28]:
print("initial b.shape= ", b.shape)
# Just like np.squeeze: drops the redundant length-1 dimension.
# Alternative spelling: b.squeeze()
b = mx.np.squeeze(b)
print("new shape of b: ", b.shape)

initial b.shape=  (5, 1)
new shape of b:  (5,)


In [29]:
# Same product as before, but with the 1-D `b`: the result is 1-D as well.
result = mx.np.dot(a, b)
print("result= ", result)
print("shape of output matrix: ", result.shape)

result=  [1.4956555 1.5817764]
shape of output matrix:  (2,)


### matrix-matrix product

In [30]:
# Two matrices whose inner dimensions agree: (3, 5) and (5, 2).
a = mx.np.random.rand(3, 5)
print("a= ", a)
b = mx.np.random.rand(5, 2)  # both dimensions declared explicitly
print("b= ", b)

a=  [[0.56804454 0.3927848  0.92559665 0.83607876 0.07103606]
 [0.33739617 0.08712929 0.6481719  0.0202184  0.36824155]
 [0.83261985 0.95715517 0.77815676 0.14035077 0.87001216]]
b=  [[0.87008727 0.9786183 ]
 [0.47360805 0.7991586 ]
 [0.8009108  0.46147937]
 [0.5204775  0.7805292 ]
 [0.67887956 0.11827442]]


In [31]:
# (3, 5) dot (5, 2) -> (3, 2)
result = mx.np.dot(a, b)
print("result= ", result)
print("shape of output matrix: ", result.shape)

result=  [[1.9049797 1.9579254]
 [1.1144719 0.7582647]
 [2.4646852 2.1512873]]
shape of output matrix:  (3, 2)


**Tip**: A handy operation in MXNet is `reshape`, which changes the shape of a tensor/layer/array.

In [32]:
print("a.shape before:= ", a.shape)
# -1 lets reshape infer the length, collapsing the array to 1-D
# (the same result as flattening it).
a = a.reshape(-1)
print("a.shape after:= ", a.shape)

a.shape before:=  (3, 5)
a.shape after:=  (15,)


## Automatic differentiation

We train neural networks with (modified versions of) gradient descent. Therefore we need a mechanism that automatically evaluates derivatives for us. 

In [33]:
from mxnet import autograd

In [34]:
# x = {x1=0, x2=1, x3=2, x4=3}, stored as a 1-D array of shape (4,)
# (not an explicit (4, 1) column vector)
x = mx.np.array([0.,1.,2.,3.])

In [35]:
# Confirm that x is one-dimensional with four elements.
print(x.shape)

(4,)


x.grad stores the value of the derivatives of functions that take x as input, with respect to x

In [36]:
# Before a gradient buffer is attached, x.grad is None.
type(x.grad)

NoneType

We need to explicitly declare that we require gradient evaluation

In [37]:
# Attach a gradient buffer to x so that autograd can store results in x.grad.
x.attach_grad() # default grad_req='write': each backward() overwrites previous gradients

In [38]:
# After attach_grad(), x.grad is an array rather than None.
type(x.grad)

mxnet.numpy.ndarray

In [39]:
# The freshly attached gradient buffer starts out as all zeros.
print(x.grad)

[0. 0. 0. 0.]


$y=f(x)=x^2 \rightarrow \frac{dy}{dx}=2x$

In [40]:
# Record the forward computation so that it can be backpropagated
# (chain rule) to evaluate the derivatives.
with autograd.record(): # operations inside this scope are tracked
    y = x**2 # element-wise square
# Run the actual derivative evaluation; the result is stored in x.grad.
y.backward()

In [41]:
# The gradient of x^2 must equal 2*x; print both side by side.
print(x.grad, 2 * x)

[0. 2. 4. 6.] [0. 2. 4. 6.]


In [42]:
# A bit more complex: a two-step computation, y = (x^2)^2 = x^4.
# The intermediate z is recorded too, so the chain rule is applied through it.
with autograd.record():
    z = x**2
    y = z*z
# Overwrites the gradient left in x.grad by the previous backward() call.
y.backward()

In [43]:
# The gradient of x^4 must equal 4 * x^3; print both side by side.
print(x.grad, 4 * x ** 3)

[  0.   4.  32. 108.] [  0.   4.  32. 108.]


#### Attention: 
x.grad contains the gradient of a function f(x) with respect to x, for an arbitrary function f, i.e. 
$\frac{df(x)}{dx} = x.grad$. Make sure to note that $x.grad \neq \frac{dx}{dx} = 1$

**Tip**: the operation detach() removes the current tensor from the computation graph, handy when we train simultaneously multiple networks (e.g. GANs)

In [44]:
# detach() returns a new array that is cut off from the computation graph;
# rebinding `x` to it means `x` no longer has the gradient buffer attached above.
x = x.detach()# Now this is a new object

In [45]:
# The detached array has no gradient buffer, so this prints None.
print(x.grad)

None


**MAYDAY**: In mxnet, by default (`grad_req='write'`), every subsequent call of f(x).backward() overwrites the previous gradients. So we don't need to manually set the values of x.grad to zero after each computation. Use `x.attach_grad(grad_req='add')` if you want the gradients to accumulate instead.

## Suggested reading: autograd, head gradients: 
http://d2l.ai/chapter_crashcourse/autograd.html#