# Multi-node training example

In this notebook we demonstrate how to perform multi-node training using Keras and Horovod.

# Set-Up Ipyparallel Cluster

In [1]:
#setup ipyparallel
import ipyparallel as ipp

#load slurm extensions
%load_ext slurm_magic

#get username
username = !whoami
username = username[0]

In [2]:
#choose desired concurrency: this value is passed to sbatch as the node count (-N)
#when the cluster setup script is submitted
n_ranks = 1

In [3]:
#submit the cluster setup script as a batch job, requesting n_ranks nodes (-N)
%sbatch -N $n_ranks ../scripts/start-cluster.sh

'Submitted batch job 20629394\n'

In [6]:
#check if job is ready: list the queue entries of this user and display them.
#The next cell only picks up jobs whose state (ST) is "R", so rerun this cell
#until the ipyparallel job shows up as running.
squeueout = %squeue -u $username
squeueout

  error_bad_lines=False)


Unnamed: 0,JOBID,PARTITION,NAME,USER,ST,TIME,NODES,NODELIST(REASON)
0,20629394,regular,ipyparal,tkurth,PD,0:00,1,(None)
1,20629351,interacti,sh,tkurth,R,0:09,16,nid0[2304-2319]


In [None]:
#check for running jobs and extract their ids:
#keep only rows that are in state "R" and whose name starts with "ipy"
is_running = squeueout["ST"] == "R"
is_ipp_job = squeueout["NAME"].str.startswith("ipy")
job_id = squeueout.loc[is_running & is_ipp_job, "JOBID"]

if not job_id.empty:
    #the cluster id is "cori_" followed by the id of the first matching job
    cluster_id = "cori_" + str(job_id.values[0])

    print("Creating cluster {}".format(cluster_id))

    #now create the cluster
    clust = ipp.Client(timeout=60, cluster_id=cluster_id)
else:
    #no client is created in this case, so tell the user instead of
    #silently doing nothing
    print("No running ipyparallel job found. Rerun the squeue cell and retry once the job is in state R.")

# Set up Keras

In [None]:
%%px

# Pick up the local code. Only add the parent directory when it is missing,
# so rerunning this cell does not pile up duplicate sys.path entries.
import sys
if '..' not in sys.path:
    sys.path.append('..')

In [None]:
%%px

# Externals
import keras
from keras.datasets import cifar10
import numpy as np
import matplotlib.pyplot as plt

# Locals
from data import get_datasets
from models import get_model
from utils.device import configure_session
from utils.optimizers import get_optimizer
from utils.callbacks import TimingCallback

#some training specific stuff
from train_horovod import init_workers, get_basic_callbacks

#import horovod
import horovod.keras as hvd

%matplotlib inline

## Introduction

We will be doing image classification on CIFAR10:
https://www.cs.toronto.edu/~kriz/cifar.html

We will be training a simple CNN model to classify small images into 10 classes.

Let's start by looking at some example images from the dataset.

In [None]:
%%px --targets 0

#fetch the first element returned by cifar10.load_data() and show its
#leading images in a grid, only on rank 0
x, y = cifar10.load_data()[0]

nrows = 8
ncols = 8

fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 12), sharex=True, sharey=True)

#pair every axis of the grid with the next image
for ax, image in zip(axs.flatten(), x):
    ax.imshow(image)

plt.tight_layout()

## Configuration

Here we specify the configuration of the worker processes.

In [None]:
%%px

# MPI stuff
# Reuse rank information that already exists in this namespace, so that
# rerunning the cell does not initialise the workers a second time.
my_rank = globals().get('my_rank')
num_ranks = globals().get('num_ranks')
cb = None

# init workers (do not init twice); compare against None explicitly because
# rank 0 is falsy and would otherwise look uninitialised
if my_rank is None or num_ranks is None:
    my_rank, num_ranks = init_workers(distributed = True)

print(my_rank, num_ranks)

In [None]:
%%px

# On-Node config (these three values are handed to configure_session)
gpu = None
# CPU performance knobs
# NOTE(review): 33 is an unusual thread count - confirm it is intentional
intra_threads, inter_threads = 33, 2

In [None]:
%%px

# apply the on-node settings chosen above to the session
configure_session(
    gpu=gpu,
    inter_threads=inter_threads,
    intra_threads=intra_threads,
)

## Build the model and load the data

Now we use our local get_model code to build our CNN model according to our configuration.

In [None]:
%%px

# --- Model ---
model_name = 'cnn'
input_shape = [32, 32, 3]
n_classes = 10
dropout = 0.1

# --- Optimizer ---
optimizer_name = 'Adam'
lr = 0.001

# --- Training ---
batch_size = 32
n_epochs = 50
loss_name = 'categorical_crossentropy'
metrics = ['accuracy']

# --- Additional tweaks (read by the callbacks cell) ---
warmup_epochs = 0
lr_schedule = []

# Data generators for training and validation
train_gen, valid_gen = get_datasets(name='cifar10', batch_size=batch_size)

# Steps per epoch: number of batches divided by the number of ranks,
# but never fewer than one step
train_steps = max(len(train_gen) // num_ranks, 1)
valid_steps = max(len(valid_gen) // num_ranks, 1)

# Model, optimizer, and compilation
model = get_model(
    name=model_name,
    input_shape=input_shape,
    n_classes=n_classes,
    dropout=dropout,
)
opt = get_optimizer(name=optimizer_name, lr=lr, n_ranks=num_ranks)
model.compile(optimizer=opt, loss=loss_name, metrics=metrics)

In [None]:
%%px --target 0

#print model summary only on rank 0
model.summary()

## Create Callbacks

These are relevant for initial variable broadcasting, LR warmup, etc.

In [None]:
%%px

#list of callbacks
cb = get_basic_callbacks(distributed = True)

#warmups:
if warmup_epochs > 0:
    cb.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=warmup_epochs))

#lr_schedule: every entry is unpacked as keyword arguments of the schedule callback.
#Use a separate loop variable so the lr_schedule list itself is not overwritten
#by its last entry, which would break a rerun of this cell.
for schedule_kwargs in lr_schedule:
    cb.append(hvd.callbacks.LearningRateScheduleCallback(**schedule_kwargs))

## Train the model

We use the fit_generator method to train our CNN model on our data generators.

Watch the progress as our model eats through the training data and regularly evaluates on the validation data.

In [None]:
%%px

# Run the training loop on every rank; only rank 0 shows the progress output
train_history = model.fit_generator(
    train_gen,
    steps_per_epoch=train_steps,
    epochs=n_epochs,
    validation_data=valid_gen,
    validation_steps=valid_steps,
    callbacks=cb,
    verbose=(1 if my_rank == 0 else 0),
)

## Visualize training history

Training has now completed. We can use the returned history object to make plots of the training and validation set losses and accuracies during training. This is very valuable for identifying issues like under/over fitting!

In [None]:
%%px --target 0

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 5))

# Plot the loss
ax0.plot(train_history.epoch, train_history.history['loss'], label='train')
ax0.plot(train_history.epoch, train_history.history['val_loss'], label='validation')
ax0.set_xlabel('Epoch')
ax0.set_ylabel('Loss')
ax0.legend(loc=0)

# Plot the accuracy
ax1.plot(train_history.epoch, train_history.history['acc'], label='train')
ax1.plot(train_history.epoch, train_history.history['val_acc'], label='validation')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Accuracy')
ax1.set_ylim(bottom=0, top=1)
ax1.legend(loc=0)

#prettify layout
plt.tight_layout()

## What to try next?

Now that you've gotten this far, familiarize yourself with what you can change in the configuration and the effect it has on model performance.

For example:
- What happens if you increase or decrease the learning rate by a factor of 10?
- What happens if you greatly increase or decrease the size of the model in number and size of layers?