This repository has been archived by the owner on Jan 3, 2023. It is now read-only.

Commit adab385: Add multi-GPU support
Stewart Hall authored and scttl committed Nov 18, 2015
1 parent c800319 commit adab385

Showing 8 changed files with 138 additions and 44 deletions.
44 changes: 33 additions & 11 deletions neon/backends/__init__.py
@@ -27,7 +27,8 @@
 
 
 def gen_backend(backend='cpu', rng_seed=None, datatype=np.float32,
-                batch_size=0, stochastic_round=False, device_id=0):
+                batch_size=0, stochastic_round=False, device_id=0,
+                max_devices=4):
     """
     Construct and return a backend instance of the appropriate type based on
     the arguments given. With no parameters, a single CPU core, float32
@@ -50,6 +51,9 @@ def gen_backend(backend='cpu', rng_seed=None, datatype=np.float32,
                                            Only affects the gpu backend.
         device_id (numeric, optional): Set this to a numeric value which can be used to select
                                        device on which to run the process
+        max_devices (int, optional): For use with multi-GPU backend only.
+                                     Controls the maximum number of GPUs to run
+                                     on.
 
     Returns:
         Backend: newly constructed backend instance of the specifed type.
@@ -71,20 +75,32 @@ def gen_backend(backend='cpu', rng_seed=None, datatype=np.float32,
     if backend == 'cpu' or backend is None:
         from neon.backends.nervanacpu import NervanaCPU
         be = NervanaCPU(rng_seed=rng_seed, default_dtype=datatype)
-    elif backend == 'gpu':
+    elif backend == 'gpu' or backend == 'mgpu':
         gpuflag = False
         # check nvcc
         from neon.backends.util import check_gpu
         gpuflag = (check_gpu.get_compute_capability(device_id) >= 5.0)
         if gpuflag is False:
             raise RuntimeError("Device " + str(device_id) + " does not have CUDA compute " +
                                "capability 5.0 or greater")
-        from neon.backends.nervanagpu import NervanaGPU
-        # init gpu
-        be = NervanaGPU(rng_seed=rng_seed, default_dtype=datatype,
-                        stochastic_round=stochastic_round, device_id=device_id)
-    elif backend == 'mgpu':
-        raise NotImplementedError("mgpu will be ready soon")
+        if backend == 'gpu':
+            from neon.backends.nervanagpu import NervanaGPU
+            # init gpu
+            be = NervanaGPU(rng_seed=rng_seed, default_dtype=datatype,
+                            stochastic_round=stochastic_round, device_id=device_id)
+        else:
+            try:
+                from neon.backends.nervanamgpu import NervanaMGPU
+                # init multiple GPU
+                be = NervanaMGPU(rng_seed=rng_seed,
+                                 default_dtype=datatype,
+                                 stochastic_round=stochastic_round,
+                                 max_devices=max_devices)
+            except ImportError:
+                logger.error("Multi-GPU support is a premium feature "
+                             "available exclusively through the Nervana cloud."
+                             " Please contact info@nervanasys.com for details.")
+                raise
     else:
         raise ValueError("backend must be one of ('cpu', 'gpu', 'mgpu')")

@@ -102,10 +118,16 @@ def cleanup_backend():
     from neon.backends.nervanacpu import NervanaCPU
     if type(be) is not NervanaCPU:
         from neon.backends.nervanagpu import NervanaGPU
-        assert type(be) is NervanaGPU
+        from neon.backends.nervanamgpu import NervanaMGPU
+        assert type(be) is NervanaGPU or type(be) is NervanaMGPU
         try:
-            be.ctx.pop()
-            be.ctx.detach()
+            if type(be) is NervanaGPU:
+                be.ctx.pop()
+                be.ctx.detach()
+            else:
+                for ctx in be.ctxs:
+                    ctx.pop()
+                    ctx.detach()
         except:
             pass
     del(be)
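Note: with this change, gen_backend routes both 'gpu' and 'mgpu' through the same compute-capability check before picking the backend class. A minimal usage sketch, assuming the premium nervanamgpu module may be absent (gen_backend logs and re-raises the ImportError in that case):

    from neon.backends import gen_backend

    # Cap the multi-GPU backend at two devices; fall back to a single
    # GPU when the premium nervanamgpu module is not installed.
    try:
        be = gen_backend(backend='mgpu', batch_size=128, max_devices=2)
    except ImportError:
        be = gen_backend(backend='gpu', batch_size=128, device_id=0)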
15 changes: 10 additions & 5 deletions neon/backends/backend.py
@@ -394,7 +394,8 @@ def __init__(self, rng_seed=None, default_dtype=np.float32):
         self.bsz = None
         self._min_dims = 2
 
-    def iobuf(self, dim0, x=None, dtype=None, name=None, persist_values=True, shared=None):
+    def iobuf(self, dim0, x=None, dtype=None, name=None, persist_values=True,
+              shared=None, parallelism=0):
         """
         Allocate input and output buffer for layer based on batch size. This
         is used because the layer does not know about the batch size.
@@ -489,7 +490,8 @@ def end(self, block, identifier):
         """
         pass
 
-    def empty(self, shape, dtype=None, name=None, persist_values=True):
+    def empty(self, shape, dtype=None, name=None, persist_values=True,
+              parallel=False, distributed=False):
         """
         Instantiate a new instance of this backend's Tensor class, without
         initializing element values. This is slightly faster than
@@ -523,7 +525,8 @@ def empty(self, shape, dtype=None, name=None, persist_values=True):
         """
         raise NotImplementedError()
 
-    def array(self, ary, dtype=None, name=None, persist_values=True):
+    def array(self, ary, dtype=None, name=None, persist_values=True,
+              parallel=False, distributed=False):
         """
         Instantiate a new instance of this backend's Tensor class, populating
         elements based on obj values.
@@ -555,7 +558,8 @@ def array(self, ary, dtype=None, name=None, persist_values=True):
         """
         raise NotImplementedError()
 
-    def zeros(self, shape, dtype=None, name=None, persist_values=True):
+    def zeros(self, shape, dtype=None, name=None, persist_values=True,
+              parallel=False, distributed=False):
         """
         Instantiate a new instance of this backend's Tensor class, populating
         Each element with a value of 0.
@@ -585,7 +589,8 @@ def zeros(self, shape, dtype=None, name=None, persist_values=True):
         """
         raise NotImplementedError()
 
-    def ones(self, shape, dtype=None, name=None, persist_values=True):
+    def ones(self, shape, dtype=None, name=None, persist_values=True,
+              parallel=False, distributed=False):
         """
         Instantiate a new instance of this backend's Tensor class, populating
         Each element with a value of 1.
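Note: all four abstract allocators (empty, array, zeros, ones) plus iobuf only gain keyword arguments with defaults (parallel=False, distributed=False, parallelism=0), so every pre-existing call site keeps working unchanged. A minimal sketch of the widened contract; the ToyBackend class is illustrative, not part of the commit:

    import numpy as np

    class ToyBackend(object):
        # Mirrors the widened zeros() signature; a single-device
        # backend can accept and simply ignore the two new flags.
        def zeros(self, shape, dtype=None, name=None, persist_values=True,
                  parallel=False, distributed=False):
            return np.zeros(shape, dtype=dtype or np.float32)

    be = ToyBackend()
    a = be.zeros((128, 10))                 # old call sites unchanged
    b = be.zeros((128, 10), parallel=True)  # new flag accepted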
12 changes: 8 additions & 4 deletions neon/backends/nervanacpu.py
@@ -589,7 +589,8 @@ def execute(self, optree):
         assert len(compute_stack) == 1
         return postfix_stack[0]
 
-    def empty(self, shape, dtype=None, name=None, persist_values=True):
+    def empty(self, shape, dtype=None, name=None, persist_values=True,
+              parallel=False, distributed=False):
         """
         Instantiate a new instance of the CPUTensor class without initializing
         individual element values.
@@ -619,7 +620,8 @@ def empty(self, shape, dtype=None, name=None, persist_values=True):
                          name=name,
                          persist_values=persist_values)
 
-    def array(self, ary, dtype=None, name=None, persist_values=True):
+    def array(self, ary, dtype=None, name=None, persist_values=True,
+              parallel=False, distributed=False):
         """
         Instantiate a new instance of the CPUTensor class setting each element
         value to what is specified in ary.
@@ -651,7 +653,8 @@ def array(self, ary, dtype=None, name=None, persist_values=True):
                          name=name,
                          persist_values=persist_values)
 
-    def zeros(self, shape, dtype=None, name=None, persist_values=True):
+    def zeros(self, shape, dtype=None, name=None, persist_values=True,
+              parallel=False, distributed=False):
         """
         Instantiate a new instance of the CPUTensor class setting each element
         value to 0.
@@ -680,7 +683,8 @@ def zeros(self, shape, dtype=None, name=None, persist_values=True):
                          name=name,
                          persist_values=persist_values)
 
-    def ones(self, shape, dtype=None, name=None, persist_values=True):
+    def ones(self, shape, dtype=None, name=None, persist_values=True,
+              parallel=False, distributed=False):
         """
         Instantiate a new instance of the CPUTensor class setting each element
         value to 1.
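Note: NervanaCPU accepts the new flags but never forwards them to CPUTensor, so they are no-ops on the CPU backend. A quick sanity check, assuming a standard neon install:

    from neon.backends import gen_backend

    be = gen_backend(backend='cpu')
    # parallel/distributed are accepted but have no effect here:
    x = be.zeros((128, 10), parallel=True, distributed=False)
    y = be.zeros((128, 10))
    assert x.shape == y.shape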
8 changes: 4 additions & 4 deletions neon/backends/nervanagpu.py
@@ -991,7 +991,7 @@ def execute(self, optree):
         return stacks[-1][0] # TODO: to be removed, used in partial
 
     def empty(self, shape, dtype=None, name=None, persist_values=True,
-              allocator=drv.mem_alloc):
+              parallel=False, distributed=False, allocator=drv.mem_alloc):
         """
         Allocate the space for a GPUTensor
         """
@@ -1001,7 +1001,7 @@ def empty(self, shape, dtype=None, name=None, persist_values=True,
                          rounding=self.round_mode)
 
     def array(self, ary, dtype=None, name=None, persist_values=True,
-              allocator=drv.mem_alloc):
+              parallel=False, distributed=False, allocator=drv.mem_alloc):
         """
         converts a numpy array to a GPUTensor
         """
@@ -1013,7 +1013,7 @@ def array(self, ary, dtype=None, name=None, persist_values=True,
                          rounding=self.round_mode).set(ary)
 
     def zeros(self, shape, dtype=None, name=None, persist_values=True,
-              allocator=drv.mem_alloc):
+              parallel=False, distributed=False, allocator=drv.mem_alloc):
         """
         Returns an array of the given shape and dtype filled with 0's.
         """
@@ -1023,7 +1023,7 @@ def zeros(self, shape, dtype=None, name=None, persist_values=True,
                          rounding=self.round_mode)._assign(0)
 
     def ones(self, shape, dtype=None, name=None, persist_values=True,
-              allocator=drv.mem_alloc):
+              parallel=False, distributed=False, allocator=drv.mem_alloc):
         """
         Returns an array of the given shape and dtype filled with 1's.
         """
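Note: on the GPU backend the two new flags are inserted ahead of allocator, which remains the trailing keyword argument, so callers that passed allocator by keyword are unaffected. A sketch, assuming a CUDA device with compute capability 5.0 or greater:

    import numpy as np
    import pycuda.driver as drv
    from neon.backends import gen_backend

    be = gen_backend(backend='gpu', device_id=0)
    # Pass allocator by keyword; it now sits after the new flags.
    t = be.empty((4, 4), dtype=np.float32, allocator=drv.mem_alloc)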
9 changes: 7 additions & 2 deletions neon/layers/container.py
@@ -1,5 +1,5 @@
 import numpy as np
-from neon.layers.layer import Layer, BranchNode, Dropout
+from neon.layers.layer import Layer, BranchNode, Dropout, LayerParallelism
 from neon import NervanaObject
 from operator import add
 
@@ -64,8 +64,12 @@ def configure(self, in_obj):
         config_layers = self.layers if in_obj else self._layers
         in_obj = in_obj if in_obj else self.layers[0]
         super(Sequential, self).configure(in_obj)
+        prev_layer = None
         for l in config_layers:
             in_obj = l.configure(in_obj)
+            if prev_layer is not None:
+                prev_layer.set_next(l)
+            prev_layer = l
         self.out_shape = in_obj.out_shape
         return self
 
@@ -82,7 +86,8 @@ def allocate_deltas(self, global_deltas=None):
             ndelta_bufs = 4 if [l for l in self.layers if type(l) is MergeBroadcast] else 2
             in_sizes = [np.prod(l.in_shape) for l in self.layers[1:]]
             if in_sizes:
-                self.global_deltas = [self.be.iobuf(max(in_sizes)) for _ in range(ndelta_bufs)]
+                self.global_deltas = [self.be.iobuf(
+                    max(in_sizes), parallelism=LayerParallelism.Data) for _ in range(ndelta_bufs)]
             else:
                 self.global_deltas = None
         else:
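Note: the set_next calls in Sequential.configure give each configured layer a reference to its successor. The commit does not show the consumer of that link, but the wiring itself is a simple forward-linked list, sketched here with a hypothetical stand-in layer class:

    class ToyLayer(object):
        def __init__(self, name):
            self.name = name
            self.next_layer = None

        def set_next(self, layer):
            self.next_layer = layer

        def configure(self, in_obj):
            return self

    # Mirrors the loop in Sequential.configure above.
    layers = [ToyLayer('conv1'), ToyLayer('pool1'), ToyLayer('fc1')]
    prev_layer, in_obj = None, layers[0]
    for l in layers:
        in_obj = l.configure(in_obj)
        if prev_layer is not None:
            prev_layer.set_next(l)
        prev_layer = l
    assert layers[0].next_layer is layers[1]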
