In [1]:
import tensorflow as tf
import numpy as np
# Print the TensorFlow version so the recorded outputs can be tied to a specific release.
print(tf.__version__)

2.0.0


In [2]:
# Load Fashion-MNIST (note: not the handwritten-digit MNIST) via the Keras dataset helper.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# Append a trailing channel axis so every image matches the (28, 28, 1) input shape
# declared by the Conv2D models below, and rescale the pixel values into the [0, 1]
# range in the same step.
train_images = np.expand_dims(train_images, axis=-1) / np.float32(255)
test_images = np.expand_dims(test_images, axis=-1) / np.float32(255)

In [3]:
# Value passed as `batch_size` to every fit() call in this notebook.
BATCH_SIZE = 256
# Value passed as `epochs` to every fit() call in this notebook.
EPOCHS = 3

In [4]:
from tensorflow.keras.layers import Conv2D, Dense, MaxPooling2D, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import multi_gpu_model, to_categorical

### testing on a single GPU

In [5]:
# Baseline CNN: two (Conv2D, Conv2D, MaxPooling2D) stages, then a flattened dense
# head ending in a 10-way softmax. Built outside of any distribution strategy scope.
model1 = Sequential([
    Conv2D(64, 3, activation='relu', input_shape=(28, 28, 1)),
    Conv2D(128, 3, activation='relu'),
    MaxPooling2D(),
    Conv2D(256, 3, activation='relu'),
    Conv2D(512, 3, activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(10, activation='softmax'),
])

In [6]:
# Wrap model1 with the Keras multi-GPU helper, requesting 4 GPUs.
# Author's observation: this does not actually work — only 1 GPU ends up being used.
# NOTE(review): the message TensorFlow emits for this call recommends
# `tf.distribute.MirroredStrategy` instead.
parallel_model = multi_gpu_model(model1, gpus=4)
parallel_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Use `tf.distribute.MirroredStrategy` instead.


In [7]:
# Train the wrapped model on one-hot encoded labels. The call is left as the final
# expression of the cell so that the returned History object is displayed.
parallel_model.fit(
    x=train_images,
    y=to_categorical(train_labels),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Train on 60000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f020842fa90>

### testing on multiple GPUs

In [8]:
# Define the distribution strategy used by the multi-GPU cells below.
# No device list is passed, so device selection is left to TensorFlow's defaults
# (presumably every visible GPU — confirm against the TF documentation).
strategy = tf.distribute.MirroredStrategy()

In [9]:
# Same layer stack as model1, but constructed while the strategy scope is active.
with strategy.scope():
    model2 = Sequential([
        Conv2D(64, 3, activation='relu', input_shape=(28, 28, 1)),
        Conv2D(128, 3, activation='relu'),
        MaxPooling2D(),
        Conv2D(256, 3, activation='relu'),
        Conv2D(512, 3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(10, activation='softmax'),
    ])

In [10]:
# Compile and train model2 with the strategy scope active, using the same
# loss, optimizer, metric, batch size and epoch count as the model1 run.
with strategy.scope():
    model2.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model2.fit(
        x=train_images,
        y=to_categorical(train_labels),
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
    )

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Train on 60000 samples
Epoch 1/3
INFO:tensorflow:batch_all_reduce: 12 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0

**Notes** <br>
1) MirroredStrategy works only with the `fit` method; do not use `train_on_batch`.<br>
2) Depending on the model architecture, MirroredStrategy might not be the best choice. Refer to https://www.tensorflow.org/guide/distributed_training (note that the other strategies in the Keras API are experimental).<br>
3) With the current architecture, one epoch takes 22–24 seconds on 1 GPU. With 4 GPUs, apart from the first epoch (which takes 19 seconds), each epoch takes 8 seconds, so we get a roughly 3x speedup. This will change depending on the architecture.