From 36610b2109dd08d0599d1289734a8337be0b6f99 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 16:40:35 -0400 Subject: [PATCH 01/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index a10b8fc1eb49e..f840880d61beb 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -278,10 +278,15 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): if self.truncated_bptt_steps is not None: args.append(hiddens) - if self.use_ddp or self.use_ddp2: + if self.use_ddp: output = self.model(*args) - elif self.use_dp: + elif self.use_ddp2 or self.use_dp: + # in dp, allow model to use training_step and training_end output = self.model(*args) + if self.is_overriden('training_end'): + model_ref = self.get_model() + output = model_ref.training_end(output) + elif self.single_gpu: gpu_id = 0 if type(self.data_parallel_device_ids) is list: From 4a9dd77e14d2bcd179ae798b4e608e5cfce6f79d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 16:41:33 -0400 Subject: [PATCH 02/62] added training_end --- pytorch_lightning/root_module/root_module.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index ab5275dcb203d..e1ecf3cabc9c2 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -51,6 +51,15 @@ def training_step(self, *args, **kwargs): """ raise NotImplementedError + def training_end(self, *args, **kwargs): + """ + return loss, dict with metrics for tqdm + :param called with batch, batch_nb + additional: optimizer_i if multiple optimizers used + :return: + """ + pass + def validation_step(self, *args, **kwargs): """ return whatever outputs will need to be aggregated in validation_end From 57dfd81cbcc9a83763e2019d97d8db02d1fb7857 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 17:06:31 -0400 Subject: [PATCH 03/62] added training_end --- pytorch_lightning/trainer/logging_mixin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/logging_mixin.py b/pytorch_lightning/trainer/logging_mixin.py index 236e512d781ca..03057c208ec1e 100644 --- a/pytorch_lightning/trainer/logging_mixin.py +++ b/pytorch_lightning/trainer/logging_mixin.py @@ -68,6 +68,8 @@ def process_output(self, output, train=False): callback_metrics[k] = v if train and (self.use_dp or self.use_ddp2): + import pdb + pdb.set_trace() nb_gpus = self.num_gpus callback_metrics = self.reduce_distributed_output(callback_metrics, nb_gpus) From ed8f161a9e7eada85589931e1bd1af9b7e03126b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 17:15:28 -0400 Subject: [PATCH 04/62] added training_end --- pytorch_lightning/trainer/logging_mixin.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/logging_mixin.py b/pytorch_lightning/trainer/logging_mixin.py index 03057c208ec1e..13e5afb6f6f07 100644 --- a/pytorch_lightning/trainer/logging_mixin.py +++ b/pytorch_lightning/trainer/logging_mixin.py @@ -68,8 +68,6 @@ def process_output(self, output, train=False): callback_metrics[k] = v if train and (self.use_dp or self.use_ddp2): - import pdb - pdb.set_trace() nb_gpus = self.num_gpus callback_metrics = 
self.reduce_distributed_output(callback_metrics, nb_gpus) @@ -158,6 +156,10 @@ def reduce_distributed_output(self, output, nb_gpus): if isinstance(output[k], dict): output[k] = self.reduce_distributed_output(output[k], nb_gpus) + # do nothing when there's a scalar + elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0: + pass + # reduce only metrics that have the same nb of gpus elif output[k].size(0) == nb_gpus: reduced = torch.mean(output[k]) From afbdf6f7f637aba74a32a7939f4e210d3cc4a67b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 17:18:30 -0400 Subject: [PATCH 05/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index f840880d61beb..186a1df27da0e 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -300,4 +300,8 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): # format and reduce outputs accordingly output = self.process_output(output, train=True) + + import pdb + pdb.set_trace() + return output From 5b6387a00578dca10ee80f94205f3f0ff340d0b0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 17:23:42 -0400 Subject: [PATCH 06/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 186a1df27da0e..d5b27a5d6006a 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -301,7 +301,4 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): # format and reduce outputs accordingly output = self.process_output(output, train=True) - import pdb - pdb.set_trace() - return output From 3d7b4c750dd1b9d27f18ce38b8c72f313009d4b1 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:33:17 -0400 Subject: [PATCH 07/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index d5b27a5d6006a..a54f0ae806bd9 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -185,13 +185,6 @@ def optimizer_closure(): callback_metrics = output[3] self.hiddens = output[4] - # track metrics for callbacks - all_callback_metrics.append(callback_metrics) - - # track progress bar metrics - self.add_tqdm_metrics(progress_bar_metrics) - all_log_metrics.append(log_metrics) - # accumulate loss # (if accumulate_grad_batches = 1 no effect) closure_loss = closure_loss / self.accumulate_grad_batches @@ -200,6 +193,13 @@ def optimizer_closure(): model_ref = self.get_model() model_ref.backward(self.use_amp, closure_loss, optimizer) + # track metrics for callbacks + all_callback_metrics.append(callback_metrics) + + # track progress bar metrics + self.add_tqdm_metrics(progress_bar_metrics) + all_log_metrics.append(log_metrics) + # insert after step hook if self.is_function_implemented('on_after_backward'): model_ref = self.get_model() From 33592dea9859fdff28fa788ffa7694f69ccd3d8a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:34:35 -0400 Subject: [PATCH 08/62] added training_end --- 
pytorch_lightning/trainer/train_loop_mixin.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index a54f0ae806bd9..8589bb30afb3d 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -176,22 +176,24 @@ def run_training_batch(self, batch, batch_nb): # wrap the forward step in a closure so second order methods work def optimizer_closure(): # forward pass - output = self.training_forward( - split_batch, batch_nb, opt_idx, self.hiddens) - - closure_loss = output[0] - progress_bar_metrics = output[1] - log_metrics = output[2] - callback_metrics = output[3] - self.hiddens = output[4] - - # accumulate loss - # (if accumulate_grad_batches = 1 no effect) - closure_loss = closure_loss / self.accumulate_grad_batches - - # backward pass - model_ref = self.get_model() - model_ref.backward(self.use_amp, closure_loss, optimizer) + import torch + with torch.autograd.detect_anomaly(): + output = self.training_forward( + split_batch, batch_nb, opt_idx, self.hiddens) + + closure_loss = output[0] + progress_bar_metrics = output[1] + log_metrics = output[2] + callback_metrics = output[3] + self.hiddens = output[4] + + # accumulate loss + # (if accumulate_grad_batches = 1 no effect) + closure_loss = closure_loss / self.accumulate_grad_batches + + # backward pass + model_ref = self.get_model() + model_ref.backward(self.use_amp, closure_loss, optimizer) # track metrics for callbacks all_callback_metrics.append(callback_metrics) From 89ed38a9007d8829e84d736cd1b3872cc8cda7ac Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:39:35 -0400 Subject: [PATCH 09/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 8589bb30afb3d..33e2ae681edcc 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -192,8 +192,11 @@ def optimizer_closure(): closure_loss = closure_loss / self.accumulate_grad_batches # backward pass - model_ref = self.get_model() - model_ref.backward(self.use_amp, closure_loss, optimizer) + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + + # model_ref = self.get_model() + # model_ref.backward(self.use_amp, closure_loss, optimizer) # track metrics for callbacks all_callback_metrics.append(callback_metrics) From 0e91a0da285e39ef5e1894e6bdc907c40498dfda Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:42:16 -0400 Subject: [PATCH 10/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 33e2ae681edcc..56da2192a4fce 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -192,7 +192,7 @@ def optimizer_closure(): closure_loss = closure_loss / self.accumulate_grad_batches # backward pass - with amp.scale_loss(loss, optimizer) as scaled_loss: + with amp.scale_loss(closure_loss, optimizer) as scaled_loss: scaled_loss.backward() # model_ref = self.get_model() From 79a139a8bb2553af9b03c5f831bc674d3a33b1b7 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 
15:46:27 -0400 Subject: [PATCH 11/62] added training_end --- .../trainer/evaluation_loop_mixin.py | 6 +-- pytorch_lightning/trainer/train_loop_mixin.py | 37 ++++++++----------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/pytorch_lightning/trainer/evaluation_loop_mixin.py b/pytorch_lightning/trainer/evaluation_loop_mixin.py index ad4b6ad818f0f..3e5950a6d2a3f 100644 --- a/pytorch_lightning/trainer/evaluation_loop_mixin.py +++ b/pytorch_lightning/trainer/evaluation_loop_mixin.py @@ -69,12 +69,12 @@ def evaluate(self, model, dataloaders, max_batches, test=False): elif self.is_overriden('validation_end'): eval_results = model.validation_end(outputs) - # enable train mode again - model.train() - # enable gradients to save memory torch.set_grad_enabled(True) + # enable train mode again + model.train() + return eval_results def run_evaluation(self, test=False): diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 56da2192a4fce..a54f0ae806bd9 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -176,27 +176,22 @@ def run_training_batch(self, batch, batch_nb): # wrap the forward step in a closure so second order methods work def optimizer_closure(): # forward pass - import torch - with torch.autograd.detect_anomaly(): - output = self.training_forward( - split_batch, batch_nb, opt_idx, self.hiddens) - - closure_loss = output[0] - progress_bar_metrics = output[1] - log_metrics = output[2] - callback_metrics = output[3] - self.hiddens = output[4] - - # accumulate loss - # (if accumulate_grad_batches = 1 no effect) - closure_loss = closure_loss / self.accumulate_grad_batches - - # backward pass - with amp.scale_loss(closure_loss, optimizer) as scaled_loss: - scaled_loss.backward() - - # model_ref = self.get_model() - # model_ref.backward(self.use_amp, closure_loss, optimizer) + output = self.training_forward( + split_batch, batch_nb, opt_idx, self.hiddens) + + closure_loss = output[0] + progress_bar_metrics = output[1] + log_metrics = output[2] + callback_metrics = output[3] + self.hiddens = output[4] + + # accumulate loss + # (if accumulate_grad_batches = 1 no effect) + closure_loss = closure_loss / self.accumulate_grad_batches + + # backward pass + model_ref = self.get_model() + model_ref.backward(self.use_amp, closure_loss, optimizer) # track metrics for callbacks all_callback_metrics.append(callback_metrics) From 092a60694953b2c00472f8ce65304aec04e36ceb Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:48:10 -0400 Subject: [PATCH 12/62] added training_end --- pytorch_lightning/trainer/evaluation_loop_mixin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/evaluation_loop_mixin.py b/pytorch_lightning/trainer/evaluation_loop_mixin.py index 3e5950a6d2a3f..ad4b6ad818f0f 100644 --- a/pytorch_lightning/trainer/evaluation_loop_mixin.py +++ b/pytorch_lightning/trainer/evaluation_loop_mixin.py @@ -69,12 +69,12 @@ def evaluate(self, model, dataloaders, max_batches, test=False): elif self.is_overriden('validation_end'): eval_results = model.validation_end(outputs) - # enable gradients to save memory - torch.set_grad_enabled(True) - # enable train mode again model.train() + # enable gradients to save memory + torch.set_grad_enabled(True) + return eval_results def run_evaluation(self, test=False): From 80a1fd5783f8398305e8765e2ee5931e554d4793 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 2 
Nov 2019 13:46:25 -0400 Subject: [PATCH 13/62] allow ddp and apex to be configured --- pytorch_lightning/root_module/root_module.py | 32 ++++++++++++++++++++ pytorch_lightning/trainer/ddp_mixin.py | 7 ++--- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index e1ecf3cabc9c2..5d4038f8cf7c0 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -10,6 +10,7 @@ from pytorch_lightning.root_module.memory import ModelSummary from pytorch_lightning.root_module.model_saving import ModelIO from pytorch_lightning.trainer.trainer_io import load_hparams_from_tags_csv +from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel class LightningModule(GradInformation, ModelIO, ModelHooks): @@ -98,6 +99,37 @@ def test_end(self, outputs): """ pass + def configure_ddp(self, model, device_ids): + """ + Override to init DDP in a different way or use your own wrapper. + Must return model. + :param model: + :param device_ids: + :return: + """ + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + find_unused_parameters=True + ) + return model + + def configure_apex(self, amp, model, optimizers, amp_level): + """ + Override to init AMP your own way + Must return a model and list of optimizers + :param amp: + :param model: + :param optimizers: + :param amp_level: + :return: + """ + model, optimizers = amp.initialize( + model, optimizers, opt_level=amp_level, + ) + + return model, optimizers + def configure_optimizers(self): """ Return a list of optimizers and a list of schedulers (could be empty) diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index 36345b7459687..bcfc6f436b28f 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -177,11 +177,8 @@ def ddp_train(self, gpu_nb, model): elif self.use_ddp2: device_ids = None - model = LightningDistributedDataParallel( - model, - device_ids=device_ids, - find_unused_parameters=True - ) + # allow user to configure ddp + model = model.configure_ddp(model, device_ids) # continue training routine self.run_pretrain_routine(model) From 5417f67772fe0c6ea2b8bfbffaafb1e7981fa98d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 2 Nov 2019 13:48:39 -0400 Subject: [PATCH 14/62] allow ddp and apex to be configured --- pytorch_lightning/trainer/ddp_mixin.py | 4 +--- pytorch_lightning/trainer/dp_mixin.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index bcfc6f436b28f..02978203b5839 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -166,9 +166,7 @@ def ddp_train(self, gpu_nb, model): # run through amp wrapper before going to distributed DP if self.use_amp: # An example - model, optimizers = amp.initialize( - model, self.optimizers, opt_level=self.amp_level, - ) + model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) self.optimizers = optimizers # DDP2 uses all GPUs on the machine diff --git a/pytorch_lightning/trainer/dp_mixin.py b/pytorch_lightning/trainer/dp_mixin.py index 0bde8a7b1315a..684ff15c6989b 100644 --- a/pytorch_lightning/trainer/dp_mixin.py +++ b/pytorch_lightning/trainer/dp_mixin.py @@ -71,9 +71,7 @@ def single_gpu_train(self, model): if self.use_amp: # An example 
- model, optimizers = amp.initialize( - model, self.optimizers, opt_level=self.amp_level, - ) + model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) self.optimizers = optimizers self.run_pretrain_routine(model) From 0ba06401e26016056624d83b453bb3a328f78899 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 05:50:27 -0500 Subject: [PATCH 15/62] bananas --- pytorch_lightning/trainer/train_loop_mixin.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index a54f0ae806bd9..58ffe63d11d65 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -283,9 +283,6 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): elif self.use_ddp2 or self.use_dp: # in dp, allow model to use training_step and training_end output = self.model(*args) - if self.is_overriden('training_end'): - model_ref = self.get_model() - output = model_ref.training_end(output) elif self.single_gpu: gpu_id = 0 @@ -298,6 +295,12 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): else: output = self.model.training_step(*args) + # allow any mode to define training_end + # dp and ddp2 need it but optional for all others + if self.is_overriden('training_end'): + model_ref = self.get_model() + output = model_ref.training_end(output) + # format and reduce outputs accordingly output = self.process_output(output, train=True) From 8fc9da65c84b7feb8d1a635818b1ef765a7f4037 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:01:36 -0500 Subject: [PATCH 16/62] bananas --- .../RequiredTrainerInterface.md | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index e029961516540..c4bc074fab965 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -178,6 +178,90 @@ def training_step(self, batch, batch_nb, hiddens): You can also return a -1 instead of a dict to stop the current loop. This is useful if you want to break out of the current training epoch early. +--- +### training_end + +``` {.python} +def training_end(self, train_step_outputs) +``` +In certain cases (dp, ddp2), you might want to use all outputs of every process to do something. +For instance, you could run a batch via dp and use ALL the outputs for a single softmax across +the full batch. + +In this case you should define training_end to perform those calculations. + + +**Params** + +| Param | description | +|---|---| +| outputs | What you return in training_step. + +**Return** + +Dictionary or OrderedDict + +| key | value | is required | +|---|---|---| +| loss | tensor scalar | Y | +| progress_bar | Dict for progress bar display. Must have only tensors | N | +| log | Dict of metrics to add to logger. 
Must have only tensors (no images, etc) | N | + + +**Example** + +``` {.python} +# WITHOUT training_end +# if used in DP or DDP2, this batch is 1/nb_gpus large +def training_step(self, batch, batch_nb): + # batch is 1/nb_gpus big + x, y = batch + + out = self.forward(x) + loss = self.softmax(out) + loss = my_loss(loss, x) + return {'loss': loss} + +# -------------- +# with training_end to do softmax over the full batch +def training_step(self, batch, batch_nb): + # batch is 1/nb_gpus big + x, y = batch + + out = self.forward(x) + return {'out': out, 'y': y} + +def training_end(self, outputs): + # this out is now the full size of the batch + out = outputs['out'] + y = outputs['y'] + + # this softmax now uses the full batch size + loss = self.softmax(out) + loss = my_loss(loss, y) + return {'loss': loss} +``` + +If you define multiple optimizers, this step will also be called with an additional ```optimizer_idx``` param. +``` {.python} +# Multiple optimizers (ie: GANs) +def training_step(self, batch, batch_nb, optimizer_idx): + if optimizer_idx == 0: + # do training_step with encoder + if optimizer_idx == 1: + # do training_step with decoder +``` + +If you add truncated back propagation through time you will also get an additional argument with the hidden states of the previous step. +``` {.python} +# Truncated back-propagation through time +def training_step(self, batch, batch_nb, hiddens): + # hiddens are the hiddens from the previous truncated backprop step +``` + +You can also return a -1 instead of a dict to stop the current loop. This is useful if you want to +break out of the current training epoch early. + --- ### train_dataloader From 5277cc88cd9b64613a55c6526e6c15d3bf45d632 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:03:17 -0500 Subject: [PATCH 17/62] bananas --- docs/LightningModule/RequiredTrainerInterface.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index c4bc074fab965..efaccd3e34ff6 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -185,8 +185,8 @@ break out of the current training epoch early. def training_end(self, train_step_outputs) ``` In certain cases (dp, ddp2), you might want to use all outputs of every process to do something. -For instance, you could run a batch via dp and use ALL the outputs for a single softmax across -the full batch. +For instance, if using negative samples, you could run a batch via dp and use ALL the outputs +for a single softmax across the full batch (ie: the denominator would use the full batch). In this case you should define training_end to perform those calculations. 
@@ -219,7 +219,7 @@ def training_step(self, batch, batch_nb): out = self.forward(x) loss = self.softmax(out) - loss = my_loss(loss, x) + loss = nce_loss(loss) return {'loss': loss} # -------------- @@ -229,16 +229,15 @@ def training_step(self, batch, batch_nb): x, y = batch out = self.forward(x) - return {'out': out, 'y': y} + return {'out': out} def training_end(self, outputs): # this out is now the full size of the batch out = outputs['out'] - y = outputs['y'] # this softmax now uses the full batch size loss = self.softmax(out) - loss = my_loss(loss, y) + loss = nce_loss(loss) return {'loss': loss} ``` From 661e925e382fd3969bfb69fe979457890bfc53d3 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:03:43 -0500 Subject: [PATCH 18/62] bananas --- docs/LightningModule/RequiredTrainerInterface.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index efaccd3e34ff6..7d8d1be3195ad 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -15,6 +15,7 @@ Otherwise, to Define a Lightning Module, implement the following methods: **Optional**: +- [training_end](RequiredTrainerInterface.md#training_end) - [validation_step](RequiredTrainerInterface.md#validation_step) - [validation_end](RequiredTrainerInterface.md#validation_end) - [test_step](RequiredTrainerInterface.md#test_step) From 2914120567ad11d0903d5745ae0ccb7288a8379c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:06:23 -0500 Subject: [PATCH 19/62] bananas --- pytorch_lightning/root_module/root_module.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 5d4038f8cf7c0..be246a4a49adf 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -48,16 +48,16 @@ def training_step(self, *args, **kwargs): return loss, dict with metrics for tqdm :param called with batch, batch_nb additional: optimizer_i if multiple optimizers used - :return: + :return: dict with loss key and optional log, progress keys + if implementing training_step, return whatever you need in that step """ raise NotImplementedError def training_end(self, *args, **kwargs): """ return loss, dict with metrics for tqdm - :param called with batch, batch_nb - additional: optimizer_i if multiple optimizers used - :return: + :param called with outputs of training_step + :return: dict with loss key and optional log, progress keys """ pass From 6a04ac147781f344cd0ff0f3c58c7e4a8fdbe4c9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:07:32 -0500 Subject: [PATCH 20/62] bananas --- pytorch_lightning/root_module/root_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index be246a4a49adf..4d3027bdaa7c5 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -105,7 +105,7 @@ def configure_ddp(self, model, device_ids): Must return model. 
:param model: :param device_ids: - :return: + :return: DDP wrapped model """ model = LightningDistributedDataParallel( model, @@ -122,7 +122,7 @@ def configure_apex(self, amp, model, optimizers, amp_level): :param model: :param optimizers: :param amp_level: - :return: + :return: Apex wrapped model and optimizers """ model, optimizers = amp.initialize( model, optimizers, opt_level=amp_level, From fddcf9d417ddb56d70a385b209e6fe33a57b9161 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:16:26 -0500 Subject: [PATCH 21/62] bananas --- docs/Trainer/hooks.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index 96c41e2ba991b..54cf96c23a67e 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -175,3 +175,29 @@ def tbptt_split_batch(self, batch, split_size): return splits ``` + +--- +#### configure_ddp +Overwrite to define your own DDP implementation init. +The only requirement is that: +1. On a validation batch the call goes to model.validation_step. +2. On a training batch the call goes to model.training_step. +3. On a testing batch, the call goes to model.test_step + +```python +def configure_ddp(self, model, device_ids): + """ + Override to init DDP in a different way or use your own wrapper. + Must return model. + :param model: + :param device_ids: + :return: DDP wrapped model + """ + # Lightning DDP simply routes to test_step, val_step, etc... + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + find_unused_parameters=True + ) + return model +``` From 089c709d7a51b4877290687f9672ba0a068f5852 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:17:25 -0500 Subject: [PATCH 22/62] bananas --- docs/Trainer/hooks.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index 54cf96c23a67e..725bee2aedf4d 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -176,6 +176,28 @@ def tbptt_split_batch(self, batch, split_size): return splits ``` +--- +#### configure_apex +Overwrite to define your own Apex implementation init. + +```python +def configure_apex(self, amp, model, optimizers, amp_level): + """ + Override to init AMP your own way + Must return a model and list of optimizers + :param amp: + :param model: + :param optimizers: + :param amp_level: + :return: Apex wrapped model and optimizers + """ + model, optimizers = amp.initialize( + model, optimizers, opt_level=amp_level, + ) + + return model, optimizers +``` + --- #### configure_ddp Overwrite to define your own DDP implementation init. 
From f602c3a305c0f168ceaece91e7c92a6a3b12f7fd Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:23:37 -0500 Subject: [PATCH 23/62] bananas --- pytorch_lightning/root_module/root_module.py | 39 +++++++++++++++++++ pytorch_lightning/trainer/ddp_mixin.py | 40 +------------------- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 4d3027bdaa7c5..fbf62ab8910ab 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -1,3 +1,4 @@ +import os import warnings import collections from argparse import Namespace @@ -114,6 +115,44 @@ def configure_ddp(self, model, device_ids): ) return model + def init_ddp_connection(self, dist): + """ + Connect all procs in the world using the env:// init + Use the first node as the root address + :param port: + :param tries: + :return: + """ + + # use slurm job id for the port number + # guarantees unique ports across jobs from same grid search + try: + # use the last 4 numbers in the job id as the id + default_port = os.environ['SLURM_JOB_ID'] + default_port = default_port[-4:] + + # all ports should be in the 10k+ range + default_port = int(default_port) + 15000 + + except Exception as e: + default_port = 12910 + + # if user gave a port number, use that one instead + try: + default_port = os.environ['MASTER_PORT'] + except Exception: + os.environ['MASTER_PORT'] = str(default_port) + + # figure out the root node addr + try: + root_node = os.environ['SLURM_NODELIST'].split(' ')[0] + except Exception: + root_node = '127.0.0.2' + + root_node = self.trainer.resolve_root_node_address(root_node) + os.environ['MASTER_ADDR'] = root_node + dist.init_process_group('nccl', rank=self.proc_rank, world_size=self.world_size) + def configure_apex(self, amp, model, optimizers, amp_level): """ Override to init AMP your own way diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index 02978203b5839..807df1555cae3 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -144,7 +144,7 @@ def ddp_train(self, gpu_nb, model): # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table - self.__init_tcp_connection() + model.init_ddp_connection() # CHOOSE OPTIMIZER # allow for lr schedulers as well @@ -181,44 +181,6 @@ def ddp_train(self, gpu_nb, model): # continue training routine self.run_pretrain_routine(model) - def __init_tcp_connection(self): - """ - Connect all procs in the world using the env:// init - Use the first node as the root address - :param port: - :param tries: - :return: - """ - - # use slurm job id for the port number - # guarantees unique ports across jobs from same grid search - try: - # use the last 4 numbers in the job id as the id - default_port = os.environ['SLURM_JOB_ID'] - default_port = default_port[-4:] - - # all ports should be in the 10k+ range - default_port = int(default_port) + 15000 - - except Exception as e: - default_port = 12910 - - # if user gave a port number, use that one instead - try: - default_port = os.environ['MASTER_PORT'] - except Exception: - os.environ['MASTER_PORT'] = str(default_port) - - # figure out the root node addr - try: - root_node = os.environ['SLURM_NODELIST'].split(' ')[0] - except Exception: - root_node = '127.0.0.2' - - root_node = self.resolve_root_node_address(root_node) - 
os.environ['MASTER_ADDR'] = root_node - dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size) - def resolve_root_node_address(self, root_node): if '[' in root_node: name = root_node.split('[')[0] From 4a283c0fd47df8eebc48c801b0b88dd775dfd914 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:25:35 -0500 Subject: [PATCH 24/62] bananas --- docs/Trainer/hooks.md | 41 ++++++++++++++++++++ pytorch_lightning/root_module/root_module.py | 6 +-- pytorch_lightning/trainer/ddp_mixin.py | 2 - 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index 725bee2aedf4d..5f1baf84b6ebf 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -223,3 +223,44 @@ def configure_ddp(self, model, device_ids): ) return model ``` + +--- +#### init_ddp_connection +Override to init DDP in your own way. + +```python +def init_ddp_connection(self): + """ + Connect all procs in the world using the env:// init + Use the first node as the root address + """ + + # use slurm job id for the port number + # guarantees unique ports across jobs from same grid search + try: + # use the last 4 numbers in the job id as the id + default_port = os.environ['SLURM_JOB_ID'] + default_port = default_port[-4:] + + # all ports should be in the 10k+ range + default_port = int(default_port) + 15000 + + except Exception as e: + default_port = 12910 + + # if user gave a port number, use that one instead + try: + default_port = os.environ['MASTER_PORT'] + except Exception: + os.environ['MASTER_PORT'] = str(default_port) + + # figure out the root node addr + try: + root_node = os.environ['SLURM_NODELIST'].split(' ')[0] + except Exception: + root_node = '127.0.0.2' + + root_node = self.trainer.resolve_root_node_address(root_node) + os.environ['MASTER_ADDR'] = root_node + dist.init_process_group('nccl', rank=self.proc_rank, world_size=self.world_size) +``` diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index fbf62ab8910ab..44cda85b5369f 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -4,6 +4,7 @@ from argparse import Namespace import torch +import torch.distributed as dist from pytorch_lightning.root_module.decorators import data_loader from pytorch_lightning.root_module.grads import GradInformation @@ -115,13 +116,10 @@ def configure_ddp(self, model, device_ids): ) return model - def init_ddp_connection(self, dist): + def init_ddp_connection(self): """ Connect all procs in the world using the env:// init Use the first node as the root address - :param port: - :param tries: - :return: """ # use slurm job id for the port number diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index 807df1555cae3..12520cae9b019 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -3,9 +3,7 @@ import warnings import torch -import torch.distributed as dist -from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel from pytorch_lightning.utilities.debugging import MisconfigurationException try: From fe150ff5de3c8058cea56b69112651200e240f51 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:27:47 -0500 Subject: [PATCH 25/62] bananas --- README.md | 1 + docs/Trainer/index.md | 1 + docs/index.md | 1 + 3 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 
1797c2b217acc..2c08869933f64 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,7 @@ Lightning also adds a text column with all the hyperparameters for this experime #### Distributed training +- [Implement Your Own Distributed (DDP) training](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/#init_ddp_connection) - [16-bit mixed precision](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#16-bit-mixed-precision) - [Multi-GPU](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-GPU) - [Multi-node](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-node) diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index d71c07ed1e6e5..cf8d613d65eda 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -42,6 +42,7 @@ But of course the fun is in all the advanced things it can do: **Distributed training** +- [Implement Your Own Distributed (DDP) training](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/#init_ddp_connection) - [16-bit mixed precision](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#16-bit-mixed-precision) - [Multi-GPU](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-GPU) - [Multi-node](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-node) diff --git a/docs/index.md b/docs/index.md index 0a8e8cda1898c..3e41ede7bbb61 100644 --- a/docs/index.md +++ b/docs/index.md @@ -99,6 +99,7 @@ Notice a few things about this flow: ###### Distributed training +- [Implement Your Own Distributed (DDP) training](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/#init_ddp_connection) - [16-bit mixed precision](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#16-bit-mixed-precision) - [Multi-GPU](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-GPU) - [Multi-node](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-node) From c1c042fbef03a30689d6256198a75ccdfe1f7915 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:35:33 -0500 Subject: [PATCH 26/62] bananas --- pytorch_lightning/root_module/root_module.py | 1 + pytorch_lightning/trainer/ddp_mixin.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 44cda85b5369f..faf2458d61d49 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -13,6 +13,7 @@ from pytorch_lightning.root_module.model_saving import ModelIO from pytorch_lightning.trainer.trainer_io import load_hparams_from_tags_csv from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel +from pytorch_lightning.trainer.ddp_mixin import TrainerDDPMixin class LightningModule(GradInformation, ModelIO, ModelHooks): diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index 12520cae9b019..ab28101a5224a 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -139,6 +139,9 @@ def ddp_train(self, gpu_nb, model): if self.logger is not None: self.logger.rank = self.proc_rank + # set model properties before going into wrapper + self.copy_trainer_model_properties(model) + # set up server using proc 0's ip address # try to init for 20 
times at max in case ports are taken # where to store ip_table From bf214e4231e8aca38130d3baa8d7427ba50a67b5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:36:09 -0500 Subject: [PATCH 27/62] bananas --- pytorch_lightning/root_module/root_module.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index faf2458d61d49..44cda85b5369f 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -13,7 +13,6 @@ from pytorch_lightning.root_module.model_saving import ModelIO from pytorch_lightning.trainer.trainer_io import load_hparams_from_tags_csv from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel -from pytorch_lightning.trainer.ddp_mixin import TrainerDDPMixin class LightningModule(GradInformation, ModelIO, ModelHooks): From e552cd67e192a088f14ec1360c9354505b0d1511 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:37:22 -0500 Subject: [PATCH 28/62] bananas --- pytorch_lightning/trainer/ddp_mixin.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index ab28101a5224a..a1fbe1f71b1f2 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -139,12 +139,10 @@ def ddp_train(self, gpu_nb, model): if self.logger is not None: self.logger.rank = self.proc_rank - # set model properties before going into wrapper - self.copy_trainer_model_properties(model) - # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table + model.trainer = self model.init_ddp_connection() # CHOOSE OPTIMIZER From a895c45ec40837033f0efaba2b0bb18db5aef006 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:39:14 -0500 Subject: [PATCH 29/62] bananas --- pytorch_lightning/root_module/root_module.py | 4 ++-- pytorch_lightning/trainer/ddp_mixin.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 44cda85b5369f..f2023db9330e7 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -116,7 +116,7 @@ def configure_ddp(self, model, device_ids): ) return model - def init_ddp_connection(self): + def init_ddp_connection(self, proc_rank, world_size): """ Connect all procs in the world using the env:// init Use the first node as the root address @@ -149,7 +149,7 @@ def init_ddp_connection(self): root_node = self.trainer.resolve_root_node_address(root_node) os.environ['MASTER_ADDR'] = root_node - dist.init_process_group('nccl', rank=self.proc_rank, world_size=self.world_size) + dist.init_process_group('nccl', rank=proc_rank, world_size=world_size) def configure_apex(self, amp, model, optimizers, amp_level): """ diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index a1fbe1f71b1f2..cabb594c4a7bc 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -143,7 +143,7 @@ def ddp_train(self, gpu_nb, model): # try to init for 20 times at max in case ports are taken # where to store ip_table model.trainer = self - model.init_ddp_connection() + model.init_ddp_connection(self.proc_rank, self.world_size) # CHOOSE OPTIMIZER # allow for lr 
schedulers as well From b5571d50ad4a1c9c4b68e37fcb55cd6685152d64 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 5 Nov 2019 08:58:49 -0500 Subject: [PATCH 30/62] added eval and train for redundancy --- pytorch_lightning/trainer/train_loop_mixin.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 58ffe63d11d65..3c8a60118c6eb 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -275,15 +275,15 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): if len(self.optimizers) > 1: args.append(opt_idx) + # pass hiddens if using tbptt if self.truncated_bptt_steps is not None: args.append(hiddens) - if self.use_ddp: - output = self.model(*args) - elif self.use_ddp2 or self.use_dp: - # in dp, allow model to use training_step and training_end + # distributed forward + if self.use_ddp or self.use_ddp2 or self.use_dp: output = self.model(*args) + # single GPU forward elif self.single_gpu: gpu_id = 0 if type(self.data_parallel_device_ids) is list: @@ -292,6 +292,7 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): args[0] = batch output = self.model.training_step(*args) + # CPU forward else: output = self.model.training_step(*args) From 233a4fd07c41bd57d0239ff9b0598f53567a92f5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 5 Nov 2019 08:59:45 -0500 Subject: [PATCH 31/62] added eval and train for redundancy --- pytorch_lightning/trainer/train_loop_mixin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 3c8a60118c6eb..0ece499ca2895 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -297,7 +297,6 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): output = self.model.training_step(*args) # allow any mode to define training_end - # dp and ddp2 need it but optional for all others if self.is_overriden('training_end'): model_ref = self.get_model() output = model_ref.training_end(output) From 33f94b3099d3c71c32b2e7615a9ad916e50ececa Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 16:40:35 -0400 Subject: [PATCH 32/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index d41fc7c16fee6..c1e221a44f676 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -280,10 +280,15 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): if self.truncated_bptt_steps is not None: args.append(hiddens) - if self.use_ddp or self.use_ddp2: + if self.use_ddp: output = self.model(*args) - elif self.use_dp: + elif self.use_ddp2 or self.use_dp: + # in dp, allow model to use training_step and training_end output = self.model(*args) + if self.is_overriden('training_end'): + model_ref = self.get_model() + output = model_ref.training_end(output) + elif self.single_gpu: gpu_id = 0 if type(self.data_parallel_device_ids) is list: From 96f1670b4f6c1629dcb94a0e336a9125b8255b23 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 16:41:33 -0400 Subject: [PATCH 33/62] added training_end --- pytorch_lightning/root_module/root_module.py | 9 +++++++++ 1 
file changed, 9 insertions(+) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index b25ab9e49443f..f68257cbc5d8e 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -52,6 +52,15 @@ def training_step(self, *args, **kwargs): """ raise NotImplementedError + def training_end(self, *args, **kwargs): + """ + return loss, dict with metrics for tqdm + :param called with batch, batch_nb + additional: optimizer_i if multiple optimizers used + :return: + """ + pass + def validation_step(self, *args, **kwargs): """ return whatever outputs will need to be aggregated in validation_end From a1f9318c6395a0505a810c141a61356d30eee918 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 17:06:31 -0400 Subject: [PATCH 34/62] added training_end --- pytorch_lightning/trainer/logging_mixin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/logging_mixin.py b/pytorch_lightning/trainer/logging_mixin.py index 236e512d781ca..03057c208ec1e 100644 --- a/pytorch_lightning/trainer/logging_mixin.py +++ b/pytorch_lightning/trainer/logging_mixin.py @@ -68,6 +68,8 @@ def process_output(self, output, train=False): callback_metrics[k] = v if train and (self.use_dp or self.use_ddp2): + import pdb + pdb.set_trace() nb_gpus = self.num_gpus callback_metrics = self.reduce_distributed_output(callback_metrics, nb_gpus) From 5624fbcf67f0b630319c294c66cc6e9cb39548aa Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 17:15:28 -0400 Subject: [PATCH 35/62] added training_end --- pytorch_lightning/trainer/logging_mixin.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/logging_mixin.py b/pytorch_lightning/trainer/logging_mixin.py index 03057c208ec1e..13e5afb6f6f07 100644 --- a/pytorch_lightning/trainer/logging_mixin.py +++ b/pytorch_lightning/trainer/logging_mixin.py @@ -68,8 +68,6 @@ def process_output(self, output, train=False): callback_metrics[k] = v if train and (self.use_dp or self.use_ddp2): - import pdb - pdb.set_trace() nb_gpus = self.num_gpus callback_metrics = self.reduce_distributed_output(callback_metrics, nb_gpus) @@ -158,6 +156,10 @@ def reduce_distributed_output(self, output, nb_gpus): if isinstance(output[k], dict): output[k] = self.reduce_distributed_output(output[k], nb_gpus) + # do nothing when there's a scalar + elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0: + pass + # reduce only metrics that have the same nb of gpus elif output[k].size(0) == nb_gpus: reduced = torch.mean(output[k]) From c70d9c4b8173f757d15109b9d515fb032783f039 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 17:18:30 -0400 Subject: [PATCH 36/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index c1e221a44f676..62edf0b80583b 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -302,4 +302,8 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): # format and reduce outputs accordingly output = self.process_output(output, train=True) + + import pdb + pdb.set_trace() + return output From 2117fec3d150fb30799d0841b6ac4d30b66c0432 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 31 Oct 2019 17:23:42 -0400 Subject: [PATCH 37/62] added 
training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 62edf0b80583b..21219bd841c85 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -303,7 +303,4 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): # format and reduce outputs accordingly output = self.process_output(output, train=True) - import pdb - pdb.set_trace() - return output From 0d77e21a1068893d173d00354d508e8cfb59a767 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:33:17 -0400 Subject: [PATCH 38/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 21219bd841c85..ac0b59ca3676b 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -189,13 +189,6 @@ def optimizer_closure(): callback_metrics = output[3] self.hiddens = output[4] - # track metrics for callbacks - all_callback_metrics.append(callback_metrics) - - # track progress bar metrics - self.add_tqdm_metrics(progress_bar_metrics) - all_log_metrics.append(log_metrics) - # accumulate loss # (if accumulate_grad_batches = 1 no effect) closure_loss = closure_loss / self.accumulate_grad_batches @@ -204,6 +197,13 @@ def optimizer_closure(): model_ref = self.get_model() model_ref.backward(self.use_amp, closure_loss, optimizer) + # track metrics for callbacks + all_callback_metrics.append(callback_metrics) + + # track progress bar metrics + self.add_tqdm_metrics(progress_bar_metrics) + all_log_metrics.append(log_metrics) + # insert after step hook if self.is_function_implemented('on_after_backward'): model_ref = self.get_model() From cbd81894017e552b5b52504e5f0c5f99e192d898 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:34:35 -0400 Subject: [PATCH 39/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index ac0b59ca3676b..7eb5eab2b6cb6 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -180,22 +180,24 @@ def run_training_batch(self, batch, batch_nb): # wrap the forward step in a closure so second order methods work def optimizer_closure(): # forward pass - output = self.training_forward( - split_batch, batch_nb, opt_idx, self.hiddens) - - closure_loss = output[0] - progress_bar_metrics = output[1] - log_metrics = output[2] - callback_metrics = output[3] - self.hiddens = output[4] - - # accumulate loss - # (if accumulate_grad_batches = 1 no effect) - closure_loss = closure_loss / self.accumulate_grad_batches - - # backward pass - model_ref = self.get_model() - model_ref.backward(self.use_amp, closure_loss, optimizer) + import torch + with torch.autograd.detect_anomaly(): + output = self.training_forward( + split_batch, batch_nb, opt_idx, self.hiddens) + + closure_loss = output[0] + progress_bar_metrics = output[1] + log_metrics = output[2] + callback_metrics = output[3] + self.hiddens = output[4] + + # accumulate loss + # (if accumulate_grad_batches = 1 no effect) + closure_loss = 
closure_loss / self.accumulate_grad_batches + + # backward pass + model_ref = self.get_model() + model_ref.backward(self.use_amp, closure_loss, optimizer) # track metrics for callbacks all_callback_metrics.append(callback_metrics) From e214ca80dc7166b6643a9956a3e97eaa65d743a9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:39:35 -0400 Subject: [PATCH 40/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 7eb5eab2b6cb6..1de8e79e6e76c 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -196,8 +196,11 @@ def optimizer_closure(): closure_loss = closure_loss / self.accumulate_grad_batches # backward pass - model_ref = self.get_model() - model_ref.backward(self.use_amp, closure_loss, optimizer) + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + + # model_ref = self.get_model() + # model_ref.backward(self.use_amp, closure_loss, optimizer) # track metrics for callbacks all_callback_metrics.append(callback_metrics) From 8d5dca0da8ca597f0ccd2a776f173e08dd01ad37 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:42:16 -0400 Subject: [PATCH 41/62] added training_end --- pytorch_lightning/trainer/train_loop_mixin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 1de8e79e6e76c..91687ba2e2985 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -196,7 +196,7 @@ def optimizer_closure(): closure_loss = closure_loss / self.accumulate_grad_batches # backward pass - with amp.scale_loss(loss, optimizer) as scaled_loss: + with amp.scale_loss(closure_loss, optimizer) as scaled_loss: scaled_loss.backward() # model_ref = self.get_model() From ebd3c3b1bc73af9e96460a7e9e55148a96aa115a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:46:27 -0400 Subject: [PATCH 42/62] added training_end --- .../trainer/evaluation_loop_mixin.py | 6 +-- pytorch_lightning/trainer/train_loop_mixin.py | 37 ++++++++----------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/pytorch_lightning/trainer/evaluation_loop_mixin.py b/pytorch_lightning/trainer/evaluation_loop_mixin.py index c2ba5d16fa97a..9cb6fd9960e6a 100644 --- a/pytorch_lightning/trainer/evaluation_loop_mixin.py +++ b/pytorch_lightning/trainer/evaluation_loop_mixin.py @@ -73,12 +73,12 @@ def evaluate(self, model, dataloaders, max_batches, test=False): elif self.is_overriden('validation_end'): eval_results = model.validation_end(outputs) - # enable train mode again - model.train() - # enable gradients to save memory torch.set_grad_enabled(True) + # enable train mode again + model.train() + return eval_results def run_evaluation(self, test=False): diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 91687ba2e2985..ac0b59ca3676b 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -180,27 +180,22 @@ def run_training_batch(self, batch, batch_nb): # wrap the forward step in a closure so second order methods work def optimizer_closure(): # forward pass - import torch - with torch.autograd.detect_anomaly(): - output = 
self.training_forward( - split_batch, batch_nb, opt_idx, self.hiddens) - - closure_loss = output[0] - progress_bar_metrics = output[1] - log_metrics = output[2] - callback_metrics = output[3] - self.hiddens = output[4] - - # accumulate loss - # (if accumulate_grad_batches = 1 no effect) - closure_loss = closure_loss / self.accumulate_grad_batches - - # backward pass - with amp.scale_loss(closure_loss, optimizer) as scaled_loss: - scaled_loss.backward() - - # model_ref = self.get_model() - # model_ref.backward(self.use_amp, closure_loss, optimizer) + output = self.training_forward( + split_batch, batch_nb, opt_idx, self.hiddens) + + closure_loss = output[0] + progress_bar_metrics = output[1] + log_metrics = output[2] + callback_metrics = output[3] + self.hiddens = output[4] + + # accumulate loss + # (if accumulate_grad_batches = 1 no effect) + closure_loss = closure_loss / self.accumulate_grad_batches + + # backward pass + model_ref = self.get_model() + model_ref.backward(self.use_amp, closure_loss, optimizer) # track metrics for callbacks all_callback_metrics.append(callback_metrics) From 7330dece6880838b26d75f085a56acbc46e20f12 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 1 Nov 2019 15:48:10 -0400 Subject: [PATCH 43/62] added training_end --- pytorch_lightning/trainer/evaluation_loop_mixin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/evaluation_loop_mixin.py b/pytorch_lightning/trainer/evaluation_loop_mixin.py index 9cb6fd9960e6a..c2ba5d16fa97a 100644 --- a/pytorch_lightning/trainer/evaluation_loop_mixin.py +++ b/pytorch_lightning/trainer/evaluation_loop_mixin.py @@ -73,12 +73,12 @@ def evaluate(self, model, dataloaders, max_batches, test=False): elif self.is_overriden('validation_end'): eval_results = model.validation_end(outputs) - # enable gradients to save memory - torch.set_grad_enabled(True) - # enable train mode again model.train() + # enable gradients to save memory + torch.set_grad_enabled(True) + return eval_results def run_evaluation(self, test=False): From 0cfcc50e77ce9538a2a39d9767849f6d7a4c2b9b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 2 Nov 2019 13:46:25 -0400 Subject: [PATCH 44/62] allow ddp and apex to be configured --- pytorch_lightning/root_module/root_module.py | 32 ++++++++++++++++++++ pytorch_lightning/trainer/ddp_mixin.py | 7 ++--- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index f68257cbc5d8e..e82b70b9c045b 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -11,6 +11,7 @@ from pytorch_lightning.root_module.model_saving import ModelIO from pytorch_lightning.trainer.trainer_io import load_hparams_from_tags_csv import logging +from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel class LightningModule(GradInformation, ModelIO, ModelHooks): @@ -99,6 +100,37 @@ def test_end(self, outputs): """ pass + def configure_ddp(self, model, device_ids): + """ + Override to init DDP in a different way or use your own wrapper. + Must return model. 
+ :param model: + :param device_ids: + :return: + """ + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + find_unused_parameters=True + ) + return model + + def configure_apex(self, amp, model, optimizers, amp_level): + """ + Override to init AMP your own way + Must return a model and list of optimizers + :param amp: + :param model: + :param optimizers: + :param amp_level: + :return: + """ + model, optimizers = amp.initialize( + model, optimizers, opt_level=amp_level, + ) + + return model, optimizers + def configure_optimizers(self): """ Return a list of optimizers and a list of schedulers (could be empty) diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index 9f6c1ad4199b5..b75855a26256c 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -178,11 +178,8 @@ def ddp_train(self, gpu_nb, model): elif self.use_ddp2: device_ids = None - model = LightningDistributedDataParallel( - model, - device_ids=device_ids, - find_unused_parameters=True - ) + # allow user to configure ddp + model = model.configure_ddp(model, device_ids) # continue training routine self.run_pretrain_routine(model) From 5551586457fa534f4a18fb228e2ab5821fc02d79 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 2 Nov 2019 13:48:39 -0400 Subject: [PATCH 45/62] allow ddp and apex to be configured --- pytorch_lightning/trainer/ddp_mixin.py | 4 +--- pytorch_lightning/trainer/dp_mixin.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index b75855a26256c..ab0b84ed75bc6 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -167,9 +167,7 @@ def ddp_train(self, gpu_nb, model): # run through amp wrapper before going to distributed DP if self.use_amp: # An example - model, optimizers = amp.initialize( - model, self.optimizers, opt_level=self.amp_level, - ) + model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) self.optimizers = optimizers # DDP2 uses all GPUs on the machine diff --git a/pytorch_lightning/trainer/dp_mixin.py b/pytorch_lightning/trainer/dp_mixin.py index 0bde8a7b1315a..684ff15c6989b 100644 --- a/pytorch_lightning/trainer/dp_mixin.py +++ b/pytorch_lightning/trainer/dp_mixin.py @@ -71,9 +71,7 @@ def single_gpu_train(self, model): if self.use_amp: # An example - model, optimizers = amp.initialize( - model, self.optimizers, opt_level=self.amp_level, - ) + model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) self.optimizers = optimizers self.run_pretrain_routine(model) From 274a6be6656237fc696d2f8d1643cf610707b167 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 05:50:27 -0500 Subject: [PATCH 46/62] bananas --- pytorch_lightning/trainer/train_loop_mixin.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index ac0b59ca3676b..5d0e977be69a8 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -285,9 +285,6 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): elif self.use_ddp2 or self.use_dp: # in dp, allow model to use training_step and training_end output = self.model(*args) - if self.is_overriden('training_end'): - model_ref = self.get_model() - output = 
model_ref.training_end(output) elif self.single_gpu: gpu_id = 0 @@ -300,6 +297,12 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): else: output = self.model.training_step(*args) + # allow any mode to define training_end + # dp and ddp2 need it but optional for all others + if self.is_overriden('training_end'): + model_ref = self.get_model() + output = model_ref.training_end(output) + # format and reduce outputs accordingly output = self.process_output(output, train=True) From 2260facaa5c7396127e4e92c8bf7fb113267143a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:01:36 -0500 Subject: [PATCH 47/62] bananas --- .../RequiredTrainerInterface.md | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index 53bfc821910d4..a902a26092ed7 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -178,6 +178,90 @@ def training_step(self, batch, batch_nb, hiddens): You can also return a -1 instead of a dict to stop the current loop. This is useful if you want to break out of the current training epoch early. +--- +### training_end + +``` {.python} +def training_end(self, train_step_outputs) +``` +In certain cases (dp, ddp2), you might want to use all outputs of every process to do something. +For instance, you could run a batch via dp and use ALL the outputs for a single softmax across +the full batch. + +In this case you should define training_end to perform those calculations. + + +**Params** + +| Param | description | +|---|---| +| outputs | What you return in training_step. + +**Return** + +Dictionary or OrderedDict + +| key | value | is required | +|---|---|---| +| loss | tensor scalar | Y | +| progress_bar | Dict for progress bar display. Must have only tensors | N | +| log | Dict of metrics to add to logger. Must have only tensors (no images, etc) | N | + + +**Example** + +``` {.python} +# WITHOUT training_end +# if used in DP or DDP2, this batch is 1/nb_gpus large +def training_step(self, batch, batch_nb): + # batch is 1/nb_gpus big + x, y = batch + + out = self.forward(x) + loss = self.softmax(out) + loss = my_loss(loss, x) + return {'loss': loss} + +# -------------- +# with training_end to do softmax over the full batch +def training_step(self, batch, batch_nb): + # batch is 1/nb_gpus big + x, y = batch + + out = self.forward(x) + return {'out': out, 'y': y} + +def training_end(self, outputs): + # this out is now the full size of the batch + out = outputs['out'] + y = outputs['y'] + + # this softmax now uses the full batch size + loss = self.softmax(out) + loss = my_loss(loss, y) + return {'loss': loss} +``` + +If you define multiple optimizers, this step will also be called with an additional ```optimizer_idx``` param. +``` {.python} +# Multiple optimizers (ie: GANs) +def training_step(self, batch, batch_nb, optimizer_idx): + if optimizer_idx == 0: + # do training_step with encoder + if optimizer_idx == 1: + # do training_step with decoder +``` + +If you add truncated back propagation through time you will also get an additional argument with the hidden states of the previous step. +``` {.python} +# Truncated back-propagation through time +def training_step(self, batch, batch_nb, hiddens): + # hiddens are the hiddens from the previous truncated backprop step +``` + +You can also return a -1 instead of a dict to stop the current loop. 
This is useful if you want to +break out of the current training epoch early. + --- ### train_dataloader From d473daa2b22f03114f6a1e5524917ed8861bb03e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:03:17 -0500 Subject: [PATCH 48/62] bananas --- docs/LightningModule/RequiredTrainerInterface.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index a902a26092ed7..266050addf8ee 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -185,8 +185,8 @@ break out of the current training epoch early. def training_end(self, train_step_outputs) ``` In certain cases (dp, ddp2), you might want to use all outputs of every process to do something. -For instance, you could run a batch via dp and use ALL the outputs for a single softmax across -the full batch. +For instance, if using negative samples, you could run a batch via dp and use ALL the outputs +for a single softmax across the full batch (ie: the denominator would use the full batch). In this case you should define training_end to perform those calculations. @@ -219,7 +219,7 @@ def training_step(self, batch, batch_nb): out = self.forward(x) loss = self.softmax(out) - loss = my_loss(loss, x) + loss = nce_loss(loss) return {'loss': loss} # -------------- @@ -229,16 +229,15 @@ def training_step(self, batch, batch_nb): x, y = batch out = self.forward(x) - return {'out': out, 'y': y} + return {'out': out} def training_end(self, outputs): # this out is now the full size of the batch out = outputs['out'] - y = outputs['y'] # this softmax now uses the full batch size loss = self.softmax(out) - loss = my_loss(loss, y) + loss = nce_loss(loss) return {'loss': loss} ``` From 25d7351569d7042fd6a6ab4c6b7539a431b0d557 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:03:43 -0500 Subject: [PATCH 49/62] bananas --- docs/LightningModule/RequiredTrainerInterface.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index 266050addf8ee..7881dde4b057a 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -15,6 +15,7 @@ Otherwise, to Define a Lightning Module, implement the following methods: **Optional**: +- [training_end](RequiredTrainerInterface.md#training_end) - [validation_step](RequiredTrainerInterface.md#validation_step) - [validation_end](RequiredTrainerInterface.md#validation_end) - [test_step](RequiredTrainerInterface.md#test_step) From 91982f111d437bea13044a98986c41687fd4ba52 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:06:23 -0500 Subject: [PATCH 50/62] bananas --- pytorch_lightning/root_module/root_module.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index e82b70b9c045b..0633a8298e6b9 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -49,16 +49,16 @@ def training_step(self, *args, **kwargs): return loss, dict with metrics for tqdm :param called with batch, batch_nb additional: optimizer_i if multiple optimizers used - :return: + :return: dict with loss key and optional log, progress keys + if implementing training_step, return whatever you need in that 
step """ raise NotImplementedError def training_end(self, *args, **kwargs): """ return loss, dict with metrics for tqdm - :param called with batch, batch_nb - additional: optimizer_i if multiple optimizers used - :return: + :param called with outputs of training_step + :return: dict with loss key and optional log, progress keys """ pass From a916ab627fe6e376a257c4fea943ece34aa11a0d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:07:32 -0500 Subject: [PATCH 51/62] bananas --- pytorch_lightning/root_module/root_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 0633a8298e6b9..683bad5bc8d5b 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -106,7 +106,7 @@ def configure_ddp(self, model, device_ids): Must return model. :param model: :param device_ids: - :return: + :return: DDP wrapped model """ model = LightningDistributedDataParallel( model, @@ -123,7 +123,7 @@ def configure_apex(self, amp, model, optimizers, amp_level): :param model: :param optimizers: :param amp_level: - :return: + :return: Apex wrapped model and optimizers """ model, optimizers = amp.initialize( model, optimizers, opt_level=amp_level, From 9de558af6b77959cd4dd0ad0e2f1e2aca4168ff0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:16:26 -0500 Subject: [PATCH 52/62] bananas --- docs/Trainer/hooks.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index 96c41e2ba991b..54cf96c23a67e 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -175,3 +175,29 @@ def tbptt_split_batch(self, batch, split_size): return splits ``` + +--- +#### configure_ddp +Overwrite to define your own DDP implementation init. +The only requirement is that: +1. On a validation batch the call goes to model.validation_step. +2. On a training batch the call goes to model.training_step. +3. On a testing batch, the call goes to model.test_step + +```python +def configure_ddp(self, model, device_ids): + """ + Override to init DDP in a different way or use your own wrapper. + Must return model. + :param model: + :param device_ids: + :return: DDP wrapped model + """ + # Lightning DDP simply routes to test_step, val_step, etc... + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + find_unused_parameters=True + ) + return model +``` From 534d68f679d5b5586aff7b29b126f15e7e1ff2e3 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:17:25 -0500 Subject: [PATCH 53/62] bananas --- docs/Trainer/hooks.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index 54cf96c23a67e..725bee2aedf4d 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -176,6 +176,28 @@ def tbptt_split_batch(self, batch, split_size): return splits ``` +--- +#### configure_apex +Overwrite to define your own Apex implementation init. 
+ +```python +def configure_apex(self, amp, model, optimizers, amp_level): + """ + Override to init AMP your own way + Must return a model and list of optimizers + :param amp: + :param model: + :param optimizers: + :param amp_level: + :return: Apex wrapped model and optimizers + """ + model, optimizers = amp.initialize( + model, optimizers, opt_level=amp_level, + ) + + return model, optimizers +``` + --- #### configure_ddp Overwrite to define your own DDP implementation init. From c40e8cef4f42258f88480d765e3b6e64c7fcd459 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:23:37 -0500 Subject: [PATCH 54/62] bananas --- pytorch_lightning/root_module/root_module.py | 39 +++++++++++++++++++ pytorch_lightning/trainer/ddp_mixin.py | 40 +------------------- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 683bad5bc8d5b..f732313ec0398 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -1,3 +1,4 @@ +import os import warnings import collections from argparse import Namespace @@ -115,6 +116,44 @@ def configure_ddp(self, model, device_ids): ) return model + def init_ddp_connection(self, dist): + """ + Connect all procs in the world using the env:// init + Use the first node as the root address + :param port: + :param tries: + :return: + """ + + # use slurm job id for the port number + # guarantees unique ports across jobs from same grid search + try: + # use the last 4 numbers in the job id as the id + default_port = os.environ['SLURM_JOB_ID'] + default_port = default_port[-4:] + + # all ports should be in the 10k+ range + default_port = int(default_port) + 15000 + + except Exception as e: + default_port = 12910 + + # if user gave a port number, use that one instead + try: + default_port = os.environ['MASTER_PORT'] + except Exception: + os.environ['MASTER_PORT'] = str(default_port) + + # figure out the root node addr + try: + root_node = os.environ['SLURM_NODELIST'].split(' ')[0] + except Exception: + root_node = '127.0.0.2' + + root_node = self.trainer.resolve_root_node_address(root_node) + os.environ['MASTER_ADDR'] = root_node + dist.init_process_group('nccl', rank=self.proc_rank, world_size=self.world_size) + def configure_apex(self, amp, model, optimizers, amp_level): """ Override to init AMP your own way diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index ab0b84ed75bc6..fb775a8403822 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -145,7 +145,7 @@ def ddp_train(self, gpu_nb, model): # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table - self.__init_tcp_connection() + model.init_ddp_connection() # CHOOSE OPTIMIZER # allow for lr schedulers as well @@ -182,44 +182,6 @@ def ddp_train(self, gpu_nb, model): # continue training routine self.run_pretrain_routine(model) - def __init_tcp_connection(self): - """ - Connect all procs in the world using the env:// init - Use the first node as the root address - :param port: - :param tries: - :return: - """ - - # use slurm job id for the port number - # guarantees unique ports across jobs from same grid search - try: - # use the last 4 numbers in the job id as the id - default_port = os.environ['SLURM_JOB_ID'] - default_port = default_port[-4:] - - # all ports should be in the 10k+ range - default_port 
= int(default_port) + 15000 - - except Exception as e: - default_port = 12910 - - # if user gave a port number, use that one instead - try: - default_port = os.environ['MASTER_PORT'] - except Exception: - os.environ['MASTER_PORT'] = str(default_port) - - # figure out the root node addr - try: - root_node = os.environ['SLURM_NODELIST'].split(' ')[0] - except Exception: - root_node = '127.0.0.2' - - root_node = self.resolve_root_node_address(root_node) - os.environ['MASTER_ADDR'] = root_node - dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size) - def resolve_root_node_address(self, root_node): if '[' in root_node: name = root_node.split('[')[0] From caaca3b3c1f7c2e17e119b18c76707e4bf1bee48 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:25:35 -0500 Subject: [PATCH 55/62] bananas --- docs/Trainer/hooks.md | 41 ++++++++++++++++++++ pytorch_lightning/root_module/root_module.py | 6 +-- pytorch_lightning/trainer/ddp_mixin.py | 2 - 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index 725bee2aedf4d..5f1baf84b6ebf 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -223,3 +223,44 @@ def configure_ddp(self, model, device_ids): ) return model ``` + +--- +#### init_ddp_connection +Override to init DDP in your own way. + +```python +def init_ddp_connection(self): + """ + Connect all procs in the world using the env:// init + Use the first node as the root address + """ + + # use slurm job id for the port number + # guarantees unique ports across jobs from same grid search + try: + # use the last 4 numbers in the job id as the id + default_port = os.environ['SLURM_JOB_ID'] + default_port = default_port[-4:] + + # all ports should be in the 10k+ range + default_port = int(default_port) + 15000 + + except Exception as e: + default_port = 12910 + + # if user gave a port number, use that one instead + try: + default_port = os.environ['MASTER_PORT'] + except Exception: + os.environ['MASTER_PORT'] = str(default_port) + + # figure out the root node addr + try: + root_node = os.environ['SLURM_NODELIST'].split(' ')[0] + except Exception: + root_node = '127.0.0.2' + + root_node = self.trainer.resolve_root_node_address(root_node) + os.environ['MASTER_ADDR'] = root_node + dist.init_process_group('nccl', rank=self.proc_rank, world_size=self.world_size) +``` diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index f732313ec0398..70087d86acfc8 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -4,6 +4,7 @@ from argparse import Namespace import torch +import torch.distributed as dist from pytorch_lightning.root_module.decorators import data_loader from pytorch_lightning.root_module.grads import GradInformation @@ -116,13 +117,10 @@ def configure_ddp(self, model, device_ids): ) return model - def init_ddp_connection(self, dist): + def init_ddp_connection(self): """ Connect all procs in the world using the env:// init Use the first node as the root address - :param port: - :param tries: - :return: """ # use slurm job id for the port number diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index fb775a8403822..da0d17d0179ad 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -4,9 +4,7 @@ import logging import torch -import torch.distributed as dist -from 
pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel from pytorch_lightning.utilities.debugging import MisconfigurationException try: From b9556a3520676d1bfeb257280cd9bfb37b73a85d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:27:47 -0500 Subject: [PATCH 56/62] bananas --- README.md | 1 + docs/Trainer/index.md | 1 + docs/index.md | 1 + 3 files changed, 3 insertions(+) diff --git a/README.md b/README.md index f0cd30db5d40c..0db819f9178e9 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,7 @@ Lightning also adds a text column with all the hyperparameters for this experime #### Distributed training +- [Implement Your Own Distributed (DDP) training](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/#init_ddp_connection) - [16-bit mixed precision](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#16-bit-mixed-precision) - [Multi-GPU](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-GPU) - [Multi-node](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-node) diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index 13df0539549ee..1c60786cfe2c0 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -42,6 +42,7 @@ But of course the fun is in all the advanced things it can do: **Distributed training** +- [Implement Your Own Distributed (DDP) training](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/#init_ddp_connection) - [16-bit mixed precision](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#16-bit-mixed-precision) - [Multi-GPU](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-GPU) - [Multi-node](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-node) diff --git a/docs/index.md b/docs/index.md index 06a76fec231f8..b48b0d533f046 100644 --- a/docs/index.md +++ b/docs/index.md @@ -99,6 +99,7 @@ Notice a few things about this flow: ###### Distributed training +- [Implement Your Own Distributed (DDP) training](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/#init_ddp_connection) - [16-bit mixed precision](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#16-bit-mixed-precision) - [Multi-GPU](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-GPU) - [Multi-node](https://williamfalcon.github.io/pytorch-lightning/Trainer/Distributed%20training/#Multi-node) From 2f046cd5d11f700866a5a4307b9223f3395f31b7 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:35:33 -0500 Subject: [PATCH 57/62] bananas --- pytorch_lightning/root_module/root_module.py | 1 + pytorch_lightning/trainer/ddp_mixin.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 70087d86acfc8..50641c39d6e77 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -14,6 +14,7 @@ from pytorch_lightning.trainer.trainer_io import load_hparams_from_tags_csv import logging from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel +from pytorch_lightning.trainer.ddp_mixin import TrainerDDPMixin class LightningModule(GradInformation, ModelIO, ModelHooks): diff --git a/pytorch_lightning/trainer/ddp_mixin.py 
b/pytorch_lightning/trainer/ddp_mixin.py index da0d17d0179ad..4f340c24cca0a 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -140,6 +140,9 @@ def ddp_train(self, gpu_nb, model): if self.logger is not None: self.logger.rank = self.proc_rank + # set model properties before going into wrapper + self.copy_trainer_model_properties(model) + # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table From d0f55fcc8988eee54c2b3cd184e023fc2e715983 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:36:09 -0500 Subject: [PATCH 58/62] bananas --- pytorch_lightning/root_module/root_module.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 50641c39d6e77..70087d86acfc8 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -14,7 +14,6 @@ from pytorch_lightning.trainer.trainer_io import load_hparams_from_tags_csv import logging from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel -from pytorch_lightning.trainer.ddp_mixin import TrainerDDPMixin class LightningModule(GradInformation, ModelIO, ModelHooks): From 2e1a5347013e0afb89c979d1ffc1a20be3039235 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:37:22 -0500 Subject: [PATCH 59/62] bananas --- pytorch_lightning/trainer/ddp_mixin.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index 4f340c24cca0a..5bc12f348d0c7 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -140,12 +140,10 @@ def ddp_train(self, gpu_nb, model): if self.logger is not None: self.logger.rank = self.proc_rank - # set model properties before going into wrapper - self.copy_trainer_model_properties(model) - # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table + model.trainer = self model.init_ddp_connection() # CHOOSE OPTIMIZER From 6b5d86508332cfe77740ac98f5b829cdbdc3b786 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 3 Nov 2019 06:39:14 -0500 Subject: [PATCH 60/62] bananas --- pytorch_lightning/root_module/root_module.py | 4 ++-- pytorch_lightning/trainer/ddp_mixin.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 70087d86acfc8..3c2f1efd0a4c8 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -117,7 +117,7 @@ def configure_ddp(self, model, device_ids): ) return model - def init_ddp_connection(self): + def init_ddp_connection(self, proc_rank, world_size): """ Connect all procs in the world using the env:// init Use the first node as the root address @@ -150,7 +150,7 @@ def init_ddp_connection(self): root_node = self.trainer.resolve_root_node_address(root_node) os.environ['MASTER_ADDR'] = root_node - dist.init_process_group('nccl', rank=self.proc_rank, world_size=self.world_size) + dist.init_process_group('nccl', rank=proc_rank, world_size=world_size) def configure_apex(self, amp, model, optimizers, amp_level): """ diff --git a/pytorch_lightning/trainer/ddp_mixin.py b/pytorch_lightning/trainer/ddp_mixin.py index 
5bc12f348d0c7..0653fa58d2651 100644 --- a/pytorch_lightning/trainer/ddp_mixin.py +++ b/pytorch_lightning/trainer/ddp_mixin.py @@ -144,7 +144,7 @@ def ddp_train(self, gpu_nb, model): # try to init for 20 times at max in case ports are taken # where to store ip_table model.trainer = self - model.init_ddp_connection() + model.init_ddp_connection(self.proc_rank, self.world_size) # CHOOSE OPTIMIZER # allow for lr schedulers as well From f3e8ff7e08fa5426822e7abbf68704ec253225fc Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 5 Nov 2019 08:58:49 -0500 Subject: [PATCH 61/62] added eval and train for redundancy --- pytorch_lightning/trainer/train_loop_mixin.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 5d0e977be69a8..29de99cce300c 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -277,15 +277,15 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): if len(self.optimizers) > 1: args.append(opt_idx) + # pass hiddens if using tbptt if self.truncated_bptt_steps is not None: args.append(hiddens) - if self.use_ddp: - output = self.model(*args) - elif self.use_ddp2 or self.use_dp: - # in dp, allow model to use training_step and training_end + # distributed forward + if self.use_ddp or self.use_ddp2 or self.use_dp: output = self.model(*args) + # single GPU forward elif self.single_gpu: gpu_id = 0 if type(self.data_parallel_device_ids) is list: @@ -294,6 +294,7 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): args[0] = batch output = self.model.training_step(*args) + # CPU forward else: output = self.model.training_step(*args) From f1fcdc1c3eb85e361c961fd805a555da34ec0bb2 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 5 Nov 2019 08:59:45 -0500 Subject: [PATCH 62/62] added eval and train for redundancy --- pytorch_lightning/trainer/train_loop_mixin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/trainer/train_loop_mixin.py b/pytorch_lightning/trainer/train_loop_mixin.py index 29de99cce300c..306416db8d7ed 100644 --- a/pytorch_lightning/trainer/train_loop_mixin.py +++ b/pytorch_lightning/trainer/train_loop_mixin.py @@ -299,7 +299,6 @@ def training_forward(self, batch, batch_nb, opt_idx, hiddens): output = self.model.training_step(*args) # allow any mode to define training_end - # dp and ddp2 need it but optional for all others if self.is_overriden('training_end'): model_ref = self.get_model() output = model_ref.training_end(output)
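
Taken together, these patches add a `training_end` hook (called with the gathered outputs of `training_step`, so dp/ddp2 users can compute a single loss over the full batch) and make the distributed setup overridable on the `LightningModule` via `configure_ddp`, `configure_apex`, and `init_ddp_connection(proc_rank, world_size)`. The following is a minimal sketch of a user module exercising those hooks, assuming the APIs exactly as introduced above; the class name, layer sizes, learning rate, `'train_loss'` log key, and the `find_unused_parameters=False` tweak are illustrative assumptions, and required pieces such as dataloaders are omitted.

```python
import torch
import torch.nn.functional as F

import pytorch_lightning as pl
from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel


class CoolModel(pl.LightningModule):
    # hypothetical module: only the hooks touched by this patch series are shown

    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return self.l1(x.view(x.size(0), -1))

    def training_step(self, batch, batch_nb):
        # under dp/ddp2 this sees 1/nb_gpus of the batch
        x, y = batch
        return {'out': self.forward(x), 'y': y}

    def training_end(self, outputs):
        # called with the gathered outputs of training_step,
        # so this loss is computed over the full batch
        loss = F.cross_entropy(outputs['out'], outputs['y'])
        return {'loss': loss, 'log': {'train_loss': loss}}

    def configure_ddp(self, model, device_ids):
        # same wrapper the default implementation uses, with one flag changed
        return LightningDistributedDataParallel(
            model,
            device_ids=device_ids,
            find_unused_parameters=False
        )

    def configure_apex(self, amp, model, optimizers, amp_level):
        # keep the default behaviour; overridden only to make the hook explicit
        model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)
        return model, optimizers

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=1e-3)]
```

With these overrides in place, `ddp_train` no longer hard-codes the wrappers: it calls `model.init_ddp_connection(self.proc_rank, self.world_size)`, `model.configure_apex(...)`, and `model.configure_ddp(model, device_ids)`, as patches 44, 45, 54, and 60 show.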