In [1]:
from __future__ import print_function
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
from mxnet.gluon.block import HybridBlock
from mxnet.gluon import nn

Load MNIST training and validation data from gluon.data.vision

In [2]:
# Mini-batch size handed to the training and validation DataLoaders.
batch_size = 64

def transform(data, label):
    """Prepare one (image, label) sample for the network.

    The image is cast to float32, its axes are permuted with (2, 0, 1) so
    that the channel axis comes first, and the values are divided by 255.
    The label is cast to float32.

    Returns a (image, label) tuple.
    """
    image = data.astype(np.float32)
    channel_first = nd.transpose(image, (2, 0, 1))
    return channel_first / 255, label.astype(np.float32)

# Training split of gluon.data.vision.MNIST, reshuffled on every pass.
train_data = gluon.data.DataLoader(
    dataset=gluon.data.vision.MNIST(train=True, transform=transform),
    batch_size=batch_size,
    shuffle=True,
)

# Validation split of gluon.data.vision.MNIST, read in a fixed order.
test_data = gluon.data.DataLoader(
    dataset=gluon.data.vision.MNIST(train=False, transform=transform),
    batch_size=batch_size,
    shuffle=False,
)

We break LeNet into two parts:
- `LenetPart1` contains the convolutional layers and will be placed on the first GPU
- `LenetPart2` contains the fully connected layers and will be placed on the second GPU


In [3]:
class LenetPart1(HybridBlock):
    """Convolutional half of LeNet.

    Two 5x5 convolution + ReLU stages (20 channels, then 50), each followed
    by 2x2 max pooling with stride 2, and a final Flatten layer.
    """

    def __init__(self, **kwargs):
        super(LenetPart1, self).__init__(**kwargs)
        with self.name_scope():
            self.layers = nn.HybridSequential(prefix='')
            with self.layers.name_scope():
                # Layers are constructed inside the name scope, in network
                # order, and then registered one by one.
                stack = [
                    nn.Conv2D(channels=20, kernel_size=5, activation='relu'),
                    nn.MaxPool2D(pool_size=2, strides=2),
                    nn.Conv2D(channels=50, kernel_size=5, activation='relu'),
                    nn.MaxPool2D(pool_size=2, strides=2),
                    nn.Flatten(),
                ]
                for layer in stack:
                    self.layers.add(layer)

    def hybrid_forward(self, F, x):
        """Return the output of the layer stack for `x`; `F` is not used here."""
        return self.layers(x)
    

class LenetPart2(HybridBlock):
    """Fully connected half of LeNet.

    A 512-unit Dense layer with ReLU followed by a Dense output layer with
    `classes` units (10 by default).
    """

    def __init__(self, classes=10, **kwargs):
        super(LenetPart2, self).__init__(**kwargs)
        with self.name_scope():
            self.layers = nn.HybridSequential(prefix='')
            with self.layers.name_scope():
                # Hidden layer first, then the output layer.
                stack = [
                    nn.Dense(512, activation="relu"),
                    nn.Dense(classes),
                ]
                for layer in stack:
                    self.layers.add(layer)

    def hybrid_forward(self, F, x):
        """Return the output of the layer stack for `x`; `F` is not used here."""
        return self.layers(x)

Initialize `LenetPart1` on the first GPU

In [4]:
# Build the convolutional half and initialize its parameters on GPU 0.
net1 = LenetPart1()
ctx1 = mx.gpu(0)
net1.collect_params().initialize(init=mx.init.Xavier(magnitude=2.24), ctx=ctx1)

Initialize `LenetPart2` on the second GPU

In [5]:
# Build the fully connected half and initialize its parameters on GPU 1.
ctx2 = mx.gpu(1)
net2 = LenetPart2()
net2.collect_params().initialize(init=mx.init.Xavier(magnitude=2.24), ctx=ctx2)

We create two trainers, one for each part of the network, since a trainer can only handle parameters from one device.

In [6]:
# One SGD trainer per network half, both with a learning rate of 0.1.
trainer1 = gluon.Trainer(
    params=net1.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.1},
)
trainer2 = gluon.Trainer(
    params=net2.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.1},
)

We'll use the SoftmaxCrossEntropyLoss

In [7]:
# Loss applied to the final network output and the labels during training.
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

When evaluating the network, we need to pass the data through the first part of the network, capture the output and pass it through the second part of the network to get the final output.

Data is loaded into the context where the first part of the network resides. The label is loaded into the context where the second part of the network resides.

In [8]:
def evaluate_accuracy(data_iterator, net1, net2):
    """Compute classification accuracy of the two-part network.

    Each batch is moved to the module-level context `ctx1`, run through
    `net1`, copied to the module-level context `ctx2`, and run through
    `net2`. The arg-max over axis 1 of the final output is compared with
    the labels (also moved to `ctx2`).

    Parameters
    ----------
    data_iterator : iterable yielding (data, label) batches
    net1 : first part of the network
    net2 : second part of the network

    Returns
    -------
    The value reported by `mx.metric.Accuracy` after all batches.
    """
    metric = mx.metric.Accuracy()
    for data, label in data_iterator:
        # First half runs where its parameters live.
        features = net1(data.as_in_context(ctx1))
        # Hand the intermediate activations over to the second device.
        scores = net2(features.as_in_context(ctx2))

        # Labels are compared on the device holding the final output.
        metric.update(preds=nd.argmax(scores, axis=1),
                      labels=label.as_in_context(ctx2))
    _, accuracy = metric.get()
    return accuracy

During training, data is passed through the two parts of the network just as in `evaluate_accuracy` above. For the backward pass, MXNet takes care of copying the gradients from the second GPU to the first.

In [9]:
epochs = 1
# Weight of the newest batch loss in the exponential moving average.
# It never changes, so it is set once instead of on every iteration.
smoothing_constant = .01
# No loss observed yet; seeded with the first batch's loss inside the loop.
moving_loss = None

for e in range(epochs):
    for i, (data, label) in enumerate(train_data):

        # Inputs go to the device holding net1, labels to the device holding net2.
        data  = data.as_in_context(ctx1)
        label = label.as_in_context(ctx2)

        with autograd.record():
            output1 = net1(data)
            # Copy the intermediate activations to the second device.
            input2  = output1.as_in_context(ctx2)
            output  = net2(input2)

            loss = softmax_cross_entropy(output, label)

        loss.backward()

        # Normalize gradients by the number of samples actually in this batch.
        # The DataLoader is created without a `last_batch` policy, so the final
        # batch of an epoch can hold fewer than `batch_size` samples; dividing
        # by `batch_size` there would under-scale that update.
        num_samples = data.shape[0]
        trainer1.step(num_samples)
        trainer2.step(num_samples)

        curr_loss = nd.mean(loss).asscalar()

        if moving_loss is None:
            moving_loss = curr_loss
        else:
            moving_loss = ((1 - smoothing_constant) * moving_loss
                           + smoothing_constant * curr_loss)

        if i % 100 == 0:
            print("Batch[%d] Moving loss: %f" % (i, moving_loss))

    test_accuracy = evaluate_accuracy(test_data, net1, net2)
    train_accuracy = evaluate_accuracy(train_data, net1, net2)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, moving_loss, train_accuracy, test_accuracy))

Batch[0] Moving loss: 2.293158
Batch[100] Moving loss: 1.369541
Batch[200] Moving loss: 0.641024
Batch[300] Moving loss: 0.339356
Batch[400] Moving loss: 0.205750
Batch[500] Moving loss: 0.153345
Batch[600] Moving loss: 0.111642
Batch[700] Moving loss: 0.106622
Batch[800] Moving loss: 0.088289
Batch[900] Moving loss: 0.084093
Epoch 0. Loss: 0.0827988462407, Train_acc 0.976866666667, Test_acc 0.9786
