# MXNet - Gluon Code Snippets

#### Index:


## 1. Import Libraries

In [7]:
from mxnet import autograd, nd

# #Gluon data module to read data
from mxnet.gluon import data as gdata

# #Neural Network Layers
from mxnet.gluon import nn

# #Model Parameter Initializer
from mxnet import init

# #Gluon module to define loss functions
from mxnet.gluon import loss as gloss

# #Optimization Algorithm
from mxnet.gluon import Trainer

## 2. Reading Data

In [None]:
"""
X: features
y: labels
"""
# #Pair every feature row with its label to form the training set
dataset = gdata.ArrayDataset(features, labels)

# #Mini-batch reader: each pass over `data_iter` yields shuffled
# #(features, labels) batches of at most `batch_size` examples
batch_size = 10
data_iter = gdata.DataLoader(dataset, batch_size=batch_size, shuffle=True)

## 3. Model Definition

In [None]:
# ###########################
# #         INPUT           #
# ###########################
# #Sequential container: layers are chained in the order they are added
net = nn.Sequential()

# ###########################
# #      HIDDEN LAYERS      #
# ###########################
# #Fully connected layer: 256 hidden units, ReLU activation
net.add(nn.Dense(256, activation='relu'))

# #DROPOUT
# #A dropout layer follows each fully connected hidden layer; its argument
# #is the dropout probability.
# #NOTE(review): drop_prob1 / drop_prob2 are not defined in this snippet -
# #they have to be set before this cell runs.
dropout_stack = (
    nn.Dense(256, activation="relu"),
    nn.Dropout(drop_prob1),  # dropout after the first fully connected layer
    nn.Dense(256, activation="relu"),
    nn.Dropout(drop_prob2),  # dropout after the second fully connected layer
    nn.Dense(10),
)
net.add(*dropout_stack)


# ###########################
# #         OUTPUT          #
# ###########################
# #Dense layer producing a single (scalar) output
net.add(nn.Dense(1))

# #Dense layer producing 10 outputs
net.add(nn.Dense(10))

### Parameter Initialization

In [None]:
"""
Default Method - Each weight parameter element is randomly sampled from
a uniform distribution U[-0.07,0.07], with the bias
parameter equal to 0
"""
net.initialize()

# #Normal initialization: each weight element is sampled from a normal
# #distribution with zero mean and standard deviation `sigma`.
# #The bias parameter is initialized to zero by default.
net.initialize(init=init.Normal(sigma=0.01))


# #Re-initialize every parameter in the network.
# #`force_reinit=True` makes sure the parameters are initialized again even
# #if they had already been initialized before.
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
net[0].weight.data()[0]

# #Re-initialize all parameters to the constant value 1
net.initialize(init.Constant(1), force_reinit=True)
net[0].weight.data()[0]

# #Re-initialize the parameters of one specific layer only
net[1].initialize(init.Constant(42), force_reinit=True)

In [None]:
"""
Custom Initialization
Sometimes the initialization methods we need are not provided
in the `init` module; in such cases we can implement a subclass
of the `Initializer` class
"""
class MyInit(init.Initializer):
    """Initializer that sets each weight according to a three-way mixture.

        w ~ U[5, 10]     with probability 1/4
        w = 0            with probability 1/2
        w ~ U[-10, -5]   with probability 1/4
    """

    def _init_weight(self, name, data):
        print('Init', name, data.shape)
        # Draw every element uniformly from [-10, 10] ...
        data[:] = nd.random.uniform(low=-10, high=10, shape=data.shape)
        # ... then zero out the elements whose magnitude is below 5, which
        # leaves the two outer bands U[-10, -5] and U[5, 10] untouched.
        keep_mask = data.abs() >= 5
        data *= keep_mask


net.initialize(init=MyInit(), force_reinit=True)
net[0].weight.data()[0]

### Parameter Access

In [None]:
# #Sequential class - each layer of the network can be selected via
# #indexing (net[i]); `.params` is that layer's parameter collection
net[0].params

In [None]:
# #Each layer of the network can be selected via indexing - net[i]
net[0]

# #The weights and biases of the first layer of the network
w = net[0].weight.data()
b = net[0].bias.data()

# #The above is equivalent to looking the parameters up by name.
# #NOTE(review): the 'dense0_' prefix assumes this layer was auto-named
# #'dense0' - confirm against the names shown by net[0].params
w = net[0].params['dense0_weight'].data()
b = net[0].params['dense0_bias'].data()

# #All the parameters of the first layer only
net[0].collect_params()
# #All the parameters of the entire network.
# #Each of the two lines below produces differently formatted output:
# #the first displays the bound method itself, the second calls it
net.collect_params
net.collect_params()

# #A regular expression can also be passed to filter parameters by name
net.collect_params('.*weight')
net.collect_params('dense0.*')

In [None]:
def relu(X):
    """Rectified linear unit: the element-wise maximum of ``X`` and 0."""
    floor = 0
    return nd.maximum(X, floor)

### Layers and Blocks

A Block consists of one or more layers.

Requirements for a Block are:
1. Input data
2. `forward` method produces the output
3. `backward` method produces the gradient - this is typically handled automatically by `autograd`, so it rarely needs to be written by hand
4. Initialize and store block specific parameters

In fact, the Sequential class is derived from the Block class

In [None]:
class MLP(nn.Block):
    """Multilayer perceptron built from two fully connected layers.

    A 256-unit hidden layer with ReLU activation feeds a 10-unit output
    layer.
    """

    def __init__(self, **kwargs):
        # Hand all keyword arguments to the parent Block constructor so it
        # can do its own setup; this also lets callers pass Block options
        # (such as `params`) when creating an MLP instance.
        super(MLP, self).__init__(**kwargs)
        # Layers that own model parameters are declared as attributes.
        self.hidden = nn.Dense(units=256, activation='relu')
        self.output = nn.Dense(units=10)

    def forward(self, x):
        """Forward propagation: hidden layer first, then the output layer."""
        hidden_out = self.hidden(x)
        return self.output(hidden_out)
    
# #Create the MLP, initialize its parameters and run one forward pass.
# #NOTE(review): `x` is not defined in this snippet - presumably a batch of
# #input examples; confirm before running
net = MLP()
net.initialize()
net(x)

## 4. Define Loss Functions

In [None]:
# #Squared loss, also known as L2-norm loss
loss = gloss.L2Loss()

In [None]:
# #Softmax cross-entropy loss from the Gluon loss module
loss = gloss.SoftmaxCrossEntropyLoss()

In [None]:
def softmax(X):
    """Numerically stable softmax along axis 1.

    Every entry is exponentiated and divided by the sum of its row
    (axis 1), so each row of the result is non-negative and sums to 1.

    The row maximum is subtracted before exponentiating. Softmax does not
    change when a constant is added to a whole row, so the result is the
    same mathematically, but the largest exponent becomes exp(0) = 1. This
    stops large inputs from overflowing to inf and producing NaN rows.

    Args:
        X: array with at least two dimensions that provides ``max``,
            ``exp`` and ``sum`` methods accepting ``axis``/``keepdims``.

    Returns:
        Array of the same shape as ``X`` holding the softmax of each row.
    """
    # keepdims=True keeps the reduced axis so the result broadcasts over X
    row_max = X.max(axis=1, keepdims=True)
    X_exp = (X - row_max).exp()
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition

In [None]:
def cross_entropy(y_hat, y):
    """Negative log of the entries of ``y_hat`` picked out by ``y``.

    ``nd.pick`` selects from ``y_hat`` the entries indexed by ``y``; the
    result is the negated logarithm of those entries.
    """
    picked = nd.pick(y_hat, y)
    return -picked.log()

## 5. Define the Optimization Algorithm

In [None]:
# #Algorithm: mini-batch stochastic gradient descent (SGD).
# #The optimization algorithm iterates over all parameters present in the
# #network, which are handed over via net.collect_params()
trainer = Trainer(
    net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.03},
)

## 6. Model Training

In [None]:
"""
* For a pre-defined number of epochs, we make a pass
over the dataset that has been sampled via mini-batching
the features(X) and the labels(y)
* Then, for each mini-batch:
- make prediction via `net(X)` and compare it to the label
y and compute the loss function in the forward pass
- compute gradients via backward pass
- update the model parameters via SGD by calling `trainer.step()`
"""

# #Number of full passes over the training data
num_epochs = 3
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        # #Forward pass, recorded so that gradients can be computed
        with autograd.record():
            l = loss(net(X), y)
        # #Backward pass: compute the gradients of the loss
        l.backward()
        # #Update the parameters. step() normalizes the gradient by the
        # #number passed in, so use the actual number of examples in this
        # #batch; the last batch may be smaller than the nominal batch size
        trainer.step(X.shape[0])
    # #Loss over the whole dataset at the end of the epoch
    l = loss(net(features), labels)
    # #asscalar() turns the one-element mean into a plain Python number,
    # #so '%f' does not rely on implicit array-to-float conversion
    print('epoch %d, loss: %f' % (epoch, l.mean().asscalar()))

In [None]:
"""
Compute the error in estimating the weights and biases
"""
# #NOTE(review): true_w and true_b are not defined in this snippet -
# #presumably the ground-truth parameters used to generate the data; confirm
# #Learned weights of the first layer; true_w is reshaped to the same shape
# #before the element-wise difference is taken
w = net[0].weight.data()
print('Error in estimating w', true_w.reshape(w.shape) - w)
# #Learned bias of the first layer
b = net[0].bias.data()
print('Error in estimating b', true_b - b)