From 2da0b05cfb3c70a29029c429bc5d813df458ff25 Mon Sep 17 00:00:00 2001
From: Greg Heinrich
Date: Thu, 9 Jun 2016 15:16:39 +0200
Subject: [PATCH] Rework Torch multi-GPU training

---
 .../torch/ImageNet-Training/alexnet.lua    |  1 +
 .../torch/ImageNet-Training/googlenet.lua  | 26 ++++----------------------
 digits/standard-networks/torch/lenet.lua   | 21 +--------------------
 docs/GettingStartedTorch.md                |  1 +
 tools/torch/main.lua                       |  6 ++++++
 5 files changed, 13 insertions(+), 42 deletions(-)

diff --git a/digits/standard-networks/torch/ImageNet-Training/alexnet.lua b/digits/standard-networks/torch/ImageNet-Training/alexnet.lua
index e52880137..f741332b0 100644
--- a/digits/standard-networks/torch/ImageNet-Training/alexnet.lua
+++ b/digits/standard-networks/torch/ImageNet-Training/alexnet.lua
@@ -82,6 +82,7 @@ return function(params)
     end
     return {
         model = createModel(params.ngpus, channels, nclasses),
+        disableAutoDataParallelism = true,
         croplen = 224,
         trainBatchSize = 128,
         validationBatchSize = 32,
diff --git a/digits/standard-networks/torch/ImageNet-Training/googlenet.lua b/digits/standard-networks/torch/ImageNet-Training/googlenet.lua
index dca2a3eea..6d9d5e4fc 100644
--- a/digits/standard-networks/torch/ImageNet-Training/googlenet.lua
+++ b/digits/standard-networks/torch/ImageNet-Training/googlenet.lua
@@ -55,7 +55,7 @@ local function inception(input_size, config)
     return concat
 end
 
-function createModel(nGPU, nChannels, nClasses)
+function createModel(nChannels, nClasses)
     -- batch normalization added on top of convolutional layers in feature branch
     -- in order to help the network learn faster
     local features = nn.Sequential()
@@ -99,28 +99,10 @@ function createModel(nGPU, nChannels, nClasses)
     local splitter = nn.Concat(2)
     splitter:add(main_branch):add(aux_classifier)
 
     --local googlenet = nn.Sequential():add(features):add(splitter)
-    local googlenet = nn.Sequential():add(features):add(main_branch)
-    local model
-    if nGPU>1 then
-        local gpus = torch.range(1, nGPU):totable()
-        local fastest, benchmark
-        local use_cudnn = cudnn ~= nil
-        if use_cudnn then
-            fastest, benchmark = cudnn.fastest, cudnn.benchmark
-        end
-        model = nn.DataParallelTable(1, true, true):add(googlenet,gpus):threads(function()
-            if use_cudnn then
-                local cudnn = require 'cudnn'
-                cudnn.fastest, cudnn.benchmark = fastest, benchmark
-            end
-        end)
-        model.gradInput = nil
-    else
-        model = googlenet
-    end
+    local googlenet = nn.Sequential():add(features):add(main_branch)
 
-    return model
+    return googlenet
 end
 
 -- return function that returns network definition
@@ -135,7 +117,7 @@ return function(params)
         assert(params.inputShape[2]==256 and params.inputShape[3]==256, 'Network expects 256x256 images')
     end
     return {
-        model = createModel(params.ngpus, channels, nclasses),
+        model = createModel(channels, nclasses),
         croplen = 224,
         trainBatchSize = 32,
         validationBatchSize = 16,
diff --git a/digits/standard-networks/torch/lenet.lua b/digits/standard-networks/torch/lenet.lua
index c171a56bc..9eb077d55 100644
--- a/digits/standard-networks/torch/lenet.lua
+++ b/digits/standard-networks/torch/lenet.lua
@@ -44,27 +44,8 @@ return function(params)
     lenet:add(nn.Linear(500, nclasses))  -- 500 -> nclasses
     lenet:add(nn.LogSoftMax())
 
-    local model
-    if params.ngpus > 1 then
-        local gpus = torch.range(1, params.ngpus):totable()
-        local fastest, benchmark
-        local use_cudnn = cudnn ~= nil
-        if use_cudnn then
-            fastest, benchmark = cudnn.fastest, cudnn.benchmark
-        end
-        model = nn.DataParallelTable(1, true, true):add(lenet,gpus):threads(function()
-            if use_cudnn then
-                local cudnn = require 'cudnn'
-                cudnn.fastest, cudnn.benchmark = fastest, benchmark
-            end
-        end)
-        model.gradInput = nil
-    else
-        model = lenet
-    end
-
     return {
-        model = model,
+        model = lenet,
         loss = nn.ClassNLLCriterion(),
         trainBatchSize = 64,
         validationBatchSize = 32,
diff --git a/docs/GettingStartedTorch.md b/docs/GettingStartedTorch.md
index c38f1e436..40ac49d1f 100644
--- a/docs/GettingStartedTorch.md
+++ b/docs/GettingStartedTorch.md
@@ -106,6 +106,7 @@ labelHook | function | No | A function(input,dblabel) tha
 trainBatchSize | number | No | If specified, sets train batch size. May be overridden by user in DIGITS UI.
 validationBatchSize | number | No | If specified, sets validation batch size. May be overridden by user in DIGITS UI.
 fineTuneHook | function | No | A function(net) that returns the model to be used for fine-tuning. The untuned model is passed as a function parameter.
+disableAutoDataParallelism | boolean | No | By default, models are encapsulated in an nn.DataParallelTable container to enable multi-GPU training when more than one GPU is selected. Setting this flag to `true` disables this mechanism.
 
 ### Tensors
 
diff --git a/tools/torch/main.lua b/tools/torch/main.lua
index 760f95f20..afff966c1 100644
--- a/tools/torch/main.lua
+++ b/tools/torch/main.lua
@@ -280,6 +280,12 @@ local parameters = {
 network = network_func(parameters)
 local model = network.model
 
+-- embed the model in a DataParallelTable container unless this is explicitly
+-- disallowed in the user-defined network description
+if nGpus > 1 and not network.disableAutoDataParallelism then
+    local gpus = torch.range(1, nGpus):totable()
+    model = nn.DataParallelTable(1, true, true):add(model, gpus)
+end
 -- if the loss criterion was not defined in the network
 -- use nn.ClassNLLCriterion() by default
 local loss = network.loss or nn.ClassNLLCriterion()
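
Usage note: with this change, a network description that needs custom multi-GPU handling can opt out of the automatic wrapping by setting disableAutoDataParallelism and building its own container, as alexnet.lua continues to do inside createModel(). The sketch below is illustrative only and is not part of the patch; the tiny two-layer model, its assumed 1x28x28 input shape, and the class count of 10 are placeholders, while params.ngpus, the return-table fields, and the nn.DataParallelTable(1, true, true) construction match the code shown in the diffs above.

    -- Minimal sketch of a network description that manages its own data
    -- parallelism; the model below is a stand-in, not a real network.
    return function(params)
        local net = nn.Sequential()
        net:add(nn.View(-1):setNumInputDims(3))  -- flatten CHW input to a vector
        net:add(nn.Linear(28*28, 10))            -- assumes 1x28x28 inputs, 10 classes
        net:add(nn.LogSoftMax())

        local model = net
        if params.ngpus > 1 then
            -- replicate the model across all selected GPUs, in the same way
            -- as the per-network code removed by this patch
            local gpus = torch.range(1, params.ngpus):totable()
            model = nn.DataParallelTable(1, true, true):add(net, gpus)
        end

        return {
            model = model,
            -- prevent tools/torch/main.lua from wrapping the model a second time
            disableAutoDataParallelism = true,
            loss = nn.ClassNLLCriterion(),
            trainBatchSize = 64,
            validationBatchSize = 32,
        }
    end

The flag only suppresses the automatic wrapping in tools/torch/main.lua; single-GPU behavior is unchanged, since the model is returned as-is when params.ngpus is 1.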