# MXNet Performance tricks

In [None]:
import multiprocessing
import time
 
import mxnet as mx
from mxnet import gluon, nd, autograd
import numpy as np
import random 
import mxboard as mxb

import skimage
from skimage.transform import resize

In [None]:
# Seed every random number generator used in this notebook (MXNet, NumPy and
# the Python standard library) with the same value.
SEED = 42
for seed_rng in (mx.random.seed, np.random.seed, random.seed):
    seed_rng(SEED)


In [None]:
!rm -rf logs && mkdir logs
sw = mxb.SummaryWriter(logdir='logs', flush_secs=3)

## Model

We take a relatively simple and common model architecture from the model zoo: ResNet50.

In [None]:
# Device on which the network lives and to which every batch is copied.
ctx = mx.gpu()

# ResNet50 v2 from the Gluon model zoo, randomly initialised (no pretrained weights).
net = gluon.model_zoo.vision.resnet50_v2(ctx=ctx, pretrained=False)
net.initialize(
    init=mx.init.Xavier(magnitude=2.3),
    ctx=ctx,
)
print(net)

## Data

In [None]:
# Number of samples per batch served by the training DataLoader.
BATCH_SIZE = 48

In [None]:
def transform(x, y):
    """Resize one sample to 224x224 and move its last axis to the front.

    Parameters
    ----------
    x : NDArray
        Image sample; converted to NumPy with ``asnumpy()`` before resizing.
        Presumably laid out as height x width x channels -- TODO confirm.
    y :
        Label, passed through unchanged.

    Returns
    -------
    tuple
        ``(image, label)`` where ``image`` is a float32 NumPy array whose
        axes were reordered with ``(2, 0, 1)``.
    """
    resized = resize(x.asnumpy(), (224, 224), anti_aliasing=False, mode='constant')
    channels_first = np.transpose(resized, (2, 0, 1))
    return channels_first.astype('float32'), y


# CIFAR10 training split; `transform` is applied to every (image, label) sample.
dataset_train = gluon.data.vision.CIFAR10(train=True, transform=transform)

In [None]:
# Baseline loader: no `num_workers` argument is given, no shuffling, and the
# trailing incomplete batch is discarded.
dataloader_train = gluon.data.DataLoader(
    dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=False,
    last_batch="discard",
)

## Loss

In [None]:
# Softmax cross-entropy loss, applied to the network output and the labels in every training loop.
loss_fn = gluon.loss.SoftmaxCELoss()

## Metric

In [None]:
# Running accuracy metric; the training loops update it and reset it at the end of each epoch.
accuracy = mx.metric.Accuracy()

## Optimizer

In [None]:
# SGD with momentum and a small weight decay over all network parameters.
sgd_params = {
    'learning_rate': 0.001,
    'momentum': 0.9,
    'wd': 1e-5,
}
trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_params)

## Training Loop

### 1) Naive Attempt

In [None]:
%%time

epoch = 1
print_n = 5

for e in range(epoch):
    tick = time.time()
    for i, (data, label) in enumerate(dataloader_train):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        if i == 0:
            tick_0 = time.time()
            
        # Forward pass and loss computation
        with autograd.record():
            output = net(data)
            loss = loss_fn(output, label)
        
        # Compute gradient
        loss.backward()
        
        # Update network weights
        trainer.step(data.shape[0])
        
        # Update metric
        accuracy.update(label, output)
        
        # Print batch metrics
        if i % print_n == 0 and i > 0:
            sw.add_scalar(tag='Accuracy', value={'naive':accuracy.get()[1]}, global_step=i-print_n)
            sw.add_scalar(tag='Speed', value={'naive':data.shape[0]*(print_n)/(time.time()-tick)}, global_step=i-print_n)
            print('Batch [{}], Accuracy {:.4f}, Samples/sec: {:.4f}'.format(
                i, accuracy.get()[1], data.shape[0]*(print_n)/(time.time()-tick))
            )
            tick = time.time()
        
        if i == 200:
            break
            
    print('Epoch [{}], Accuracy {:.4f}'.format(e, accuracy.get()[1]))
    print('~Samples/Sec {:.4f}'.format(data.shape[0]*(i+1)/(time.time()-tick_0)))
    accuracy.reset()
    sw.flush()

Notice that we are I/O constrained: GPU utilization is very low and drops intermittently, while CPU utilization is high.

### 2) Using multiprocessing workers

In [None]:
# Load and transform batches in worker processes. Three cores are left free
# for the rest of the system, but at least one worker is always requested so
# that machines with three cores or fewer still use multiprocessing loading.
dataloader_train = gluon.data.DataLoader(
    dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=False,
    last_batch="discard",
    num_workers=max(1, multiprocessing.cpu_count() - 3),
)

In [None]:
# Start this experiment from freshly initialised weights.
net.initialize(mx.init.Xavier(magnitude=2.3), ctx=ctx, force_reinit=True)

# Re-create the trainer too, with the same settings as before, so that
# optimizer state (e.g. SGD momentum) from the previous experiment is not
# carried over onto the new weights.
trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate':0.001, 'momentum':0.9, 'wd':0.00001})

In [None]:
%%time

epoch = 1
print_n = 5

for e in range(epoch):
    tick = time.time()
    for i, (data, label) in enumerate(dataloader_train):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        if i == 0:
            tick_0 = time.time()
        
        # Forward pass and loss computation
        with autograd.record():
            output = net(data)
            loss = loss_fn(output, label)
        
        # Compute gradient
        loss.backward()
        
        # Update network weights
        trainer.step(data.shape[0])
        
        # Update metric
        accuracy.update(label, output)
        
        # Print batch metrics
        if i % print_n == 0 and i > 0:
            sw.add_scalar(tag='Accuracy', value={'multi':accuracy.get()[1]}, global_step=i-print_n)
            sw.add_scalar(tag='Speed', value={'multi':data.shape[0]*(print_n)/(time.time()-tick)}, global_step=i-print_n)
            print('Batch [{}], Accuracy {:.4f}, Samples/sec: {:.4f}'.format(
                i, accuracy.get()[1], data.shape[0]*(print_n)/(time.time()-tick))
            )
            tick = time.time()
        
        if i == 200:
            break
            
    print('Epoch [{}], Accuracy {:.4f}'.format(e, accuracy.get()[1]))
    print('~Samples/Sec {:.4f}'.format(data.shape[0]*(i+1)/(time.time()-tick_0)))
    accuracy.reset()
    sw.flush()

We notice now that CPU utilization seems to be less than 100%, so data fetching and resizing are no longer the bottleneck.

### 3) Hybridization

In [None]:
# Start this experiment from freshly initialised weights.
net.initialize(mx.init.Xavier(magnitude=2.3), ctx=ctx, force_reinit=True)

# Hybridize the network, requesting static memory allocation and static shapes.
net.hybridize(static_alloc=True, static_shape=True)

# Run one forward pass on a dummy input right after hybridizing.
out = net(mx.nd.ones((1, 3, 224, 224), ctx))

# Re-create the trainer, with the same settings as before, so that optimizer
# state (e.g. SGD momentum) from the previous experiment is not carried over
# onto the new weights.
trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate':0.001, 'momentum':0.9, 'wd':0.00001})

In [None]:
%%time

epoch = 1
print_n = 5

for e in range(epoch):
    tick = time.time()
    for i, (data, label) in enumerate(dataloader_train):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        if i == 0:
            tick_0 = time.time()
            
        # Forward pass and loss computation
        with autograd.record():
            output = net(data)
            loss = loss_fn(output, label)
        
        # Compute gradient
        loss.backward()
        
        # Update network weights
        trainer.step(data.shape[0])
        
        # Update metric
        accuracy.update(label, output)
        
        # Print batch metrics
        if i % print_n == 0 and i > 0:
            sw.add_scalar(tag='Accuracy', value={'hybrid':accuracy.get()[1]}, global_step=i-print_n)
            sw.add_scalar(tag='Speed', value={'hybrid':data.shape[0]*(print_n)/(time.time()-tick)}, global_step=i-print_n)
            print('Batch [{}], Accuracy {:.4f}, Samples/sec: {:.4f}'.format(
                i, accuracy.get()[1], data.shape[0]*(print_n)/(time.time()-tick))
            )
            tick = time.time()
        
        if i == 200:
            break
            
    print('Epoch [{}], Accuracy {:.4f}'.format(e, accuracy.get()[1]))
    print('~Samples/Sec {:.4f}'.format(data.shape[0]*(i+1)/(time.time()-tick_0)))
    accuracy.reset()
    sw.flush()

### 4) Tweaking hyper-parameters

In [None]:
# Double the batch size and rebuild the loader accordingly.
BATCH_SIZE = 96

# Three cores are left free for the rest of the system, but at least one
# worker is always requested so that machines with three cores or fewer still
# use multiprocessing loading.
dataloader_train = gluon.data.DataLoader(
    dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=False,
    last_batch="discard",
    num_workers=max(1, multiprocessing.cpu_count() - 3),
)

In [None]:
# Fresh weights and a fresh trainer for the larger batch size; the learning
# rate is the earlier 0.001 multiplied by 2.
net.initialize(
    init=mx.init.Xavier(magnitude=2.3),
    ctx=ctx,
    force_reinit=True,
)
sgd_params = {
    'learning_rate': 0.001 * 2,
    'momentum': 0.9,
    'wd': 1e-5,
}
trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_params)

In [None]:
%%time

epoch = 1
print_n = 5

for e in range(epoch):
    tick = time.time()
    for i, (data, label) in enumerate(dataloader_train):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        if i == 0:
            tick_0 = time.time()
            
        # Forward pass and loss computation
        with autograd.record():
            output = net(data)
            loss = loss_fn(output, label)
        
        # Compute gradient
        loss.backward()
        
        # Update network weights
        trainer.step(data.shape[0])
        
        # Update metric
        accuracy.update(label, output)
        
        # Print batch metrics
        if i % print_n == 0 and i > 0:
            sw.add_scalar(tag='Accuracy', value={'hybrid':accuracy.get()[1]}, global_step=i-print_n)
            sw.add_scalar(tag='Speed', value={'hybrid':data.shape[0]*(print_n)/(time.time()-tick)}, global_step=i-print_n)
            print('Batch [{}], Accuracy {:.4f}, Samples/sec: {:.4f}'.format(
                i, accuracy.get()[1], data.shape[0]*(print_n)/(time.time()-tick))
            )
            tick = time.time()
        
        if i == 100:
            break
            
    print('Epoch [{}], Accuracy {:.4f}'.format(e, accuracy.get()[1]))
    print('~Samples/Sec {:.4f}'.format(data.shape[0]*(i+1)/(time.time()-tick_0)))
    accuracy.reset()
    sw.flush()

### 5) Synchronization calls

In [None]:
# Use the second GPU when the machine has one (the original choice), otherwise
# fall back to the first GPU so the notebook still runs on single-GPU machines.
ctx = mx.gpu(1) if mx.context.num_gpus() > 1 else mx.gpu(0)

# Build a fresh hybridized ResNet50 v2 on that device.
net = gluon.model_zoo.vision.resnet50_v2(pretrained=False, ctx=ctx)
net.initialize(mx.init.Xavier(magnitude=2.3), ctx=ctx)
net.hybridize(static_alloc=True, static_shape=True)

# Run one forward pass on a dummy input right after hybridizing.
out = net(mx.nd.ones((1, 3, 224, 224), ctx))

# New trainer and metric bound to the new network.
trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate':0.001*2, 'momentum':0.9, 'wd':0.00001})
accuracy = mx.metric.Accuracy()

In [None]:
%%time

epoch = 1
print_n_sync = 2
print_n = 6
tick_0 = time.time()

for e in range(epoch):
    tick = time.time()
    for i, (data, label) in enumerate(dataloader_train):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        
        if i == 0:
            tick_0 = time.time()

        # Update metric
        if i % print_n_sync == 0 and i > 0:
            accuracy.update(old_label, output)
            
        old_label = label
        
        # Print batch metrics            
        if i % print_n == 0 and i > 0:
            sw.add_scalar(tag='Accuracy', value={'hybrid_sync':accuracy.get()[1]}, global_step=i-print_n)
            sw.add_scalar(tag='Speed', value={'hybrid_sync':data.shape[0]*(print_n)/(time.time()-tick)}, global_step=i-print_n)

            print('Batch [{}], Accuracy {:.4f}, Samples/sec: {:.4f}'.format(
                i, accuracy.get()[1], data.shape[0]*(print_n)/(time.time()-tick))
            )
            tick = time.time()
            
        # Forward pass and loss computation
        with autograd.record():
            output = net(data)
            loss = loss_fn(output, label)
            
        # Compute gradient
        loss.backward()
        
        # Update network weights
        trainer.step(data.shape[0])
        
        
        if i == 100:
            break
    print('Epoch [{}], Accuracy {:.4f}'.format(e, accuracy.get()[1]))
    print('~Samples/Sec {:.4f}'.format(data.shape[0]*(i+1)/(time.time()-tick_0)))
    accuracy.reset()
    sw.flush()

### 6) Float16

In [None]:
# Re-initialise the weights, then cast the network to half precision.
net.initialize(
    init=mx.init.Xavier(magnitude=2.3),
    ctx=ctx,
    force_reinit=True,
)
net.cast('float16')

In [None]:
# New trainer for the float16 network, with the optimizer's `multi_precision`
# option switched on; the metric is cleared before the run.
sgd_params = {
    'learning_rate': 0.001 * 2,
    'momentum': 0.9,
    'wd': 1e-5,
    'multi_precision': True,
}
trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_params)
accuracy.reset()

In [None]:
%%time

epoch = 1
print_n_sync = 2
print_n = 5
tick_0 = time.time()

for e in range(epoch):
    tick = time.time()
    for i, (data, label) in enumerate(dataloader_train):
        data = data.as_in_context(ctx).astype('float16')
        label = label.as_in_context(ctx).astype('float16')

        # Update metric
        if i % print_n_sync == 0 and i > 0:
            accuracy.update(old_label, output)
        
        old_label = label
        
        if i == 0:
            tick_0 = time.time()
            
        # Print batch metrics            
        if i % print_n == 0 and i > 0:
            sw.add_scalar(tag='Accuracy', value={'float_16':accuracy.get()[1]}, global_step=i-print_n)
            sw.add_scalar(tag='Speed', value={'float_16':data.shape[0]*(print_n)/(time.time()-tick)}, global_step=i-print_n)

            print('Batch [{}], Accuracy {:.4f}, Samples/sec: {:.4f}'.format(
                i, accuracy.get()[1], data.shape[0]*(print_n)/(time.time()-tick))
            )
            tick = time.time()
            
        # Forward pass and loss computation
        with autograd.record():
            output = net(data)
            loss = loss_fn(output, label)
            
        # Compute gradient
        loss.backward()
        
        # Update network weights
        trainer.step(data.shape[0])
        
        # Update metric
        if i % print_n_sync == 0:
            accuracy.update(label, output)

        
        if i == 100:
            break
    print('Epoch [{}], Accuracy {:.4f}'.format(e, accuracy.get()[1]))
    print('~Samples/Sec {:.4f}'.format(data.shape[0]*(i+1)/(time.time()-tick_0)))
    accuracy.reset()


In [None]:
# Wait 3 seconds (the same value as the writer's flush_secs), then flush
# explicitly, since the float16 loop above does not call sw.flush() itself.
time.sleep(3)
sw.flush()

### 7) Good datapoint: Testing theoretical maximum speed without I/O

In [None]:
# Synthetic benchmark: run forward/backward/update on one constant float16
# batch that is created directly on the device, so no data loading is involved.
batches = 10
tick_0 = time.time()
data = mx.nd.ones((BATCH_SIZE, 3, 224, 224), ctx=ctx, dtype='float16')

for i in range(batches):
    with autograd.record():
        out = net(data)
    out.backward()
    trainer.step(data.shape[0])

# Pull the last output to the host and wait for all outstanding work before
# stopping the clock.
out.asnumpy()
mx.nd.waitall()

elapsed = time.time() - tick_0
print('Max Sample Speed {:.4f}'.format(batches * BATCH_SIZE / elapsed))

# Env Variables

https://mxnet.incubator.apache.org/faq/env_var.html