From f04ee906e1b1962daf77b944742e3a585f199d2e Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 06:39:40 -0400
Subject: [PATCH 01/31] added load on CPU first

---
 pytorch_lightning/trainer/trainer.py    | 26 +++++++++-----
 pytorch_lightning/trainer/trainer_io.py | 48 +++++++++++++++++--------
 2 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 012e36ab31830..eab13f9374cd9 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -170,6 +170,7 @@ def __init__(self,
 
         # allow int, string and gpu list
         self.data_parallel_device_ids = self.__parse_gpu_ids(gpus)
+        self.root_gpu = self.__set_root_gpu(self.data_parallel_device_ids)
 
         # distributed backend choice
         self.use_ddp = False
@@ -268,8 +269,20 @@ def __parse_gpu_ids(self, gpus):
         else:
             raise Exception('gpus has to be a string, int or list of ints')
 
+        return gpus
+
+    def __set_root_gpu(self, gpus):
+        if gpus is None:
+            return None
+
+        # set root gpu
+        root_gpu = 0
+        if type(gpus) is list:
+            root_gpu = gpus[0]
+
+        return root_gpu
+
     @property
     def num_gpus(self):
         gpus = self.data_parallel_device_ids
@@ -701,10 +714,7 @@ def __single_gpu_train(self, model):
         # allow for lr schedulers as well
         self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())
 
-        root_gpu = 0
-        if type(self.data_parallel_device_ids) is list:
-            root_gpu = self.data_parallel_device_ids[0]
-        model.cuda(root_gpu)
+        model.cuda(self.root_gpu)
 
         if self.use_amp:
             # An example
@@ -721,10 +731,7 @@ def __dp_train(self, model):
         # allow for lr schedulers as well
         self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())
 
-        root_gpu = 0
-        if type(self.data_parallel_device_ids) is list:
-            root_gpu = self.data_parallel_device_ids[0]
-        model.cuda(root_gpu)
+        model.cuda(self.root_gpu)
 
         # check for this bug (amp + dp + !01 doesn't work)
         # https://github.com/NVIDIA/apex/issues/227
@@ -787,6 +794,9 @@ def ddp_train(self, gpu_nb, model):
         torch.cuda.set_device(gpu_nb)
         model.cuda(gpu_nb)
 
+        # override root GPU
+        self.root_gpu = gpu_nb
+
         # AMP
         # run through amp wrapper before going to distributed DP
         if self.use_amp:
diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index f5a869a883c38..05e1d8ce84b7e 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -118,19 +118,22 @@ def save_checkpoint(self, filepath):
 
     def restore(self, checkpoint_path, on_gpu):
 
-        if on_gpu:
-            checkpoint = torch.load(checkpoint_path)
-        else:
-            checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
-
-        # load training state (affects trainer only)
-        self.restore_training_state(checkpoint)
+        # if on_gpu:
+        #     checkpoint = torch.load(checkpoint_path)
+        # else:
+        # load on CPU first
+        checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
 
         # load model state
         model = self.__get_model()
 
         # load the state_dict on the model automatically
         model.load_state_dict(checkpoint['state_dict'])
 
+        if on_gpu:
+            model.cuda(self.root_gpu)
+
+        # load training state (affects trainer only)
+        self.restore_training_state(checkpoint)
+
     def dump_checkpoint(self):
@@ -210,11 +213,27 @@ def restore_training_state(self, checkpoint):
         for optimizer, opt_state in zip(self.optimizers, optimizer_states):
             optimizer.load_state_dict(opt_state)
 
+            # move optimizer to GPU 1 weight at a time
+            # avoids OOM
+            if self.root_gpu is not None:
+                for state in optimizer.state.values():
+                    for k, v in state.items():
+                        if isinstance(v, torch.Tensor):
+                            state[k] = v.cuda(self.root_gpu)
+
         # restore the lr schedulers
         lr_schedulers = checkpoint['lr_schedulers']
         for scheduler, lrs_state in zip(self.lr_schedulers, lr_schedulers):
             scheduler.load_state_dict(lrs_state)
 
+            # move lr scheduler to GPU 1 weight at a time
+            # avoids OOM
+            if self.root_gpu is not None:
+                for state in scheduler.state.values():
+                    for k, v in state.items():
+                        if isinstance(v, torch.Tensor):
+                            state[k] = v.cuda(self.root_gpu)
+
 # ----------------------------------
 # PRIVATE OPS
 # ----------------------------------
@@ -248,13 +267,8 @@ def hpc_save(self, folderpath, experiment):
 
     def hpc_load(self, folderpath, on_gpu):
         filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, self.max_ckpt_in_folder(folderpath))
 
-        if on_gpu:
-            checkpoint = torch.load(filepath)
-        else:
-            checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage)
-
-        # load training state (affects trainer only)
-        self.restore_training_state(checkpoint)
+        # load on GPU first
+        checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage)
 
         # load model state
         model = self.__get_model()
@@ -262,6 +276,12 @@ def hpc_load(self, folderpath, on_gpu):
         # load the state_dict on the model automatically
         model.load_state_dict(checkpoint['state_dict'])
 
+        if self.root_gpu is not None:
+            model.cuda(self.root_gpu)
+
+        # load training state (affects trainer only)
+        self.restore_training_state(checkpoint)
+
         # call model hook
         model.on_hpc_load(checkpoint)
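The core move in the patch above, reduced to plain PyTorch: load the checkpoint onto CPU storage, attach the weights, then migrate to the root GPU. The sketch below is illustrative rather than code from the patch; the helper name is invented, and the 'state_dict'/'optimizer_states' checkpoint keys are assumptions modeled on trainer_io.py.

    # Minimal sketch of the CPU-first restore pattern (assumed helper, not patch code)
    import torch

    def restore_cpu_first(path, model, optimizer, root_gpu=None):
        # map every storage to CPU, regardless of the device it was saved from
        checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_states'][0])
        if root_gpu is not None:
            model.cuda(root_gpu)
            # move optimizer state one tensor at a time rather than loading the
            # whole checkpoint onto the GPU up front; this is what avoids the OOM
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.cuda(root_gpu)
        return checkpoint

Moving the optimizer state tensor by tensor means at most one extra tensor lives on the GPU during the transfer, instead of a second full copy of the checkpoint.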
From 376598673f9b970143d19430074a1178700ac6ab Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 06:47:37 -0400
Subject: [PATCH 02/31] added load on CPU first

---
 pytorch_lightning/trainer/trainer_io.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index 05e1d8ce84b7e..99b1f4e450baf 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -226,14 +226,6 @@ def restore_training_state(self, checkpoint):
         for scheduler, lrs_state in zip(self.lr_schedulers, lr_schedulers):
             scheduler.load_state_dict(lrs_state)
 
-            # move lr scheduler to GPU 1 weight at a time
-            # avoids OOM
-            if self.root_gpu is not None:
-                for state in scheduler.state.values():
-                    for k, v in state.items():
-                        if isinstance(v, torch.Tensor):
-                            state[k] = v.cuda(self.root_gpu)
-
 # ----------------------------------
 # PRIVATE OPS
 # ----------------------------------

From 76a39f6b07be0d4333dcc800c0b5808be245ce7c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:10:47 -0400
Subject: [PATCH 03/31] added load on CPU first

---
 pytorch_lightning/trainer/trainer_io.py |  2 +-
 tests/test_models.py                    | 83 +++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index 99b1f4e450baf..7793406723f6e 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -259,7 +259,7 @@ def hpc_save(self, folderpath, experiment):
     def hpc_load(self, folderpath, on_gpu):
         filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, self.max_ckpt_in_folder(folderpath))
 
-        # load on GPU first
+        # load on CPU first
        checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage)
 
         # load model state
diff --git a/tests/test_models.py b/tests/test_models.py
index d0214ea321694..c35eba37dcbac 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -39,6 +39,89 @@
 # ------------------------------------------------------------------------
 # TESTS
 # ------------------------------------------------------------------------
+def test_amp_ddp_resume():
+    """
+    Make sure DDP + AMP continue training correctly
+    :return:
+    """
+    if not can_run_gpu_test():
+        return
+
+    # simulate setting slurm flags
+    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
+    os.environ['SLURM_LOCALID'] = str(0)
+
+    hparams = get_hparams()
+    model = LightningTestModel(hparams)
+
+    trainer_options = dict(
+        show_progress_bar=True,
+        max_nb_epochs=1,
+        gpus=2,
+        distributed_backend='ddp',
+        use_amp=True
+    )
+
+    save_dir = init_save_dir()
+
+    # exp file to get meta
+    exp = get_exp(False)
+    exp.argparse(hparams)
+    exp.save()
+
+    # exp file to get weights
+    checkpoint = ModelCheckpoint(save_dir)
+
+    # add these to the trainer options
+    trainer_options['experiment'] = exp
+
+    # fit model
+    trainer = Trainer(**trainer_options)
+    trainer.is_slurm_managing_tasks = True
+    result = trainer.fit(model)
+
+    # track epoch before saving
+    real_global_epoch = trainer.current_epoch
+
+    # correct result and ok accuracy
+    assert result == 1, 'amp + ddp model failed to complete'
+
+    # ---------------------------
+    # HPC LOAD/SAVE
+    # ---------------------------
+    # save
+    trainer.hpc_save(save_dir, exp)
+
+    # init new trainer
+    new_exp = get_exp(False, version=exp.version)
+    trainer_options['experiment'] = new_exp
+    trainer_options['train_percent_check'] = 0.2
+    trainer_options['val_percent_check'] = 0.2
+    new_trainer = Trainer(**trainer_options)
+
+    # set the epoch start hook so we can predict before the model does the full training
+    def assert_good_acc():
+        assert trainer.current_epoch == real_global_epoch and trainer.current_epoch > 0
+
+        # if model and state loaded correctly, predictions will be good even though we
+        # haven't trained with the new loaded model
+        new_trainer.model.eval()
+        _ = [run_prediction(dataloader, trainer.model) for dataloader in trainer.val_dataloader]
+
+    # new model
+    model = LightningTestModel(hparams)
+    model.on_sanity_check_start = assert_good_acc
+
+    # fit new model which should load hpc weights
+    new_trainer.fit(model)
+
+    # test freeze on gpu
+    model.freeze()
+    model.unfreeze()
+
+    clear_save_dir()
+
+
 def test_running_test_pretrained_model_ddp():
     """Verify test() on pretrained model"""
     if not can_run_gpu_test():
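A side note on the `map_location=lambda storage, loc: storage` idiom that the comment fix above clarifies: returning the storage unchanged leaves it on CPU, which is equivalent to the string shorthand PyTorch also accepts. A small illustration with a placeholder filename:

    import torch

    # both calls land every tensor in the checkpoint on CPU;
    # 'hpc_ckpt_1.ckpt' is a placeholder path for this example
    ckpt_a = torch.load('hpc_ckpt_1.ckpt', map_location=lambda storage, loc: storage)
    ckpt_b = torch.load('hpc_ckpt_1.ckpt', map_location='cpu')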
From 1dbc70033ea650e6d56358619328618d117cba89 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:14:07 -0400
Subject: [PATCH 04/31] added load on CPU first

---
 tests/test_models.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_models.py b/tests/test_models.py
index c35eba37dcbac..1a2acfdffdbba 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -92,6 +92,9 @@ def test_amp_ddp_resume():
     # save
     trainer.hpc_save(save_dir, exp)
 
+    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
+    os.environ['SLURM_LOCALID'] = str(0)
+
     # init new trainer
     new_exp = get_exp(False, version=exp.version)
     trainer_options['experiment'] = new_exp
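The patch above re-randomizes MASTER_PORT before the resumed trainer is built because each DDP init binds the port named in the environment; reusing the first run's port for a second process group in the same test can fail with an address-in-use error. The helper below is a hypothetical refactor of the two inline lines, not code from the patch:

    import os
    import numpy as np

    def reset_master_port():
        # pick a fresh rendezvous port so a second DDP init does not collide
        os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])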
From ab3c97e2f7479c3d17dcbc09a38a3e144f0003a7 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:26:47 -0400
Subject: [PATCH 05/31] added load on CPU first

---
 tests/test_models.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 1a2acfdffdbba..03cc004f5b6ec 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -39,7 +39,7 @@
 # ------------------------------------------------------------------------
 # TESTS
 # ------------------------------------------------------------------------
-def test_amp_ddp_resume():
+def test_amp_dp_resume():
     """
     Make sure DDP + AMP continue training correctly
     :return:
@@ -47,10 +47,6 @@ def test_amp_dp_resume():
     if not can_run_gpu_test():
         return
 
-    # simulate setting slurm flags
-    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
-    os.environ['SLURM_LOCALID'] = str(0)
-
     hparams = get_hparams()
     model = LightningTestModel(hparams)
 
@@ -58,7 +54,7 @@ def test_amp_dp_resume():
         show_progress_bar=True,
         max_nb_epochs=1,
         gpus=2,
-        distributed_backend='ddp',
+        distributed_backend='dp',
         use_amp=True
     )
 
@@ -84,7 +80,7 @@ def test_amp_dp_resume():
     real_global_epoch = trainer.current_epoch
 
     # correct result and ok accuracy
-    assert result == 1, 'amp + ddp model failed to complete'
+    assert result == 1, 'amp + dp model failed to complete'
 
     # ---------------------------
     # HPC LOAD/SAVE
@@ -92,9 +88,6 @@ def test_amp_dp_resume():
     # save
     trainer.hpc_save(save_dir, exp)
 
-    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
-    os.environ['SLURM_LOCALID'] = str(0)
-
     # init new trainer
     new_exp = get_exp(False, version=exp.version)
     trainer_options['experiment'] = new_exp

From f3bd4ef6207971659b9534820df0c0aeda8abd4a Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:29:24 -0400
Subject: [PATCH 06/31] added load on CPU first

---
 tests/test_models.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 03cc004f5b6ec..2b8c2e7943982 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -55,7 +55,6 @@ def test_amp_dp_resume():
         max_nb_epochs=1,
         gpus=2,
         distributed_backend='dp',
-        use_amp=True
     )
 
     save_dir = init_save_dir()

From deaf8336d79d61a7a3a52a5e2fa3602732322350 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:31:46 -0400
Subject: [PATCH 07/31] added load on CPU first

---
 pytorch_lightning/trainer/trainer.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index eab13f9374cd9..fba5a27667491 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -743,7 +743,12 @@ def __dp_train(self, model):
             """
             raise MisconfigurationException(m)
 
-        model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)
+        # create list of device ids
+        device_ids = self.data_parallel_device_ids
+        if type(device_ids) is int:
+            device_ids = list(range(device_ids))
+
+        model = LightningDataParallel(model, device_ids=device_ids)
 
         self.__run_pretrain_routine(model)
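Patch 07 fixes a mismatch between the two ways `gpus` can be specified: DataParallel wants an explicit list of device ids, while the test passes `gpus=2`. A standalone sketch of the normalization, with the assertion lines added purely as usage examples:

    def normalize_device_ids(gpus):
        # gpus may be None, an int count of devices, or an explicit id list
        if gpus is None:
            return None
        if type(gpus) is int:
            return list(range(gpus))  # gpus=2 -> [0, 1]
        return gpus

    assert normalize_device_ids(2) == [0, 1]
    assert normalize_device_ids([0, 1]) == [0, 1]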
From 6e28f49004f057a32c5bb9f7f8e39918954500bd Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:33:41 -0400
Subject: [PATCH 08/31] added load on CPU first

---
 tests/test_models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 2b8c2e7943982..49a9a5f39e505 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -52,7 +52,7 @@ def test_amp_dp_resume():
 
     trainer_options = dict(
         show_progress_bar=True,
-        max_nb_epochs=1,
+        max_nb_epochs=2,
         gpus=2,
         distributed_backend='dp',
     )
@@ -92,6 +92,7 @@ def test_amp_dp_resume():
     trainer_options['experiment'] = new_exp
     trainer_options['train_percent_check'] = 0.2
     trainer_options['val_percent_check'] = 0.2
+    trainer_options['max_nb_epochs'] = 1
     new_trainer = Trainer(**trainer_options)
 
     # set the epoch start hook so we can predict before the model does the full training

From 93a601d1e841d803cc4792b2894bfa79698dbf5c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:41:00 -0400
Subject: [PATCH 09/31] added load on CPU first

---
 tests/test_models.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 49a9a5f39e505..baac0e12ef85e 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -101,8 +101,9 @@ def assert_good_acc():
 
         # if model and state loaded correctly, predictions will be good even though we
         # haven't trained with the new loaded model
-        new_trainer.model.eval()
-        _ = [run_prediction(dataloader, trainer.model) for dataloader in trainer.val_dataloader]
+        dp_model = new_trainer.model.module
+        dp_model.eval()
+        _ = [run_prediction(dataloader, dp_model) for dataloader in trainer.val_dataloader]
 
     # new model
     model = LightningTestModel(hparams)
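Patches 09 and 10 flip between calling `new_trainer.model.module` and `new_trainer.model`; the distinction is the standard DataParallel layering, shown here with a toy module standing in for LightningTestModel (running it assumes two visible GPUs):

    import torch.nn as nn

    net = nn.Linear(4, 2)
    dp_net = nn.DataParallel(net, device_ids=[0, 1])  # assumes 2 visible GPUs

    # dp_net(x) scatters the batch across devices and gathers the outputs;
    # dp_net.module is the bare underlying model
    assert dp_net.module is net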
From d6b9ebedad38893c36b5c573f27e953c06eb40f8 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:45:50 -0400
Subject: [PATCH 10/31] added load on CPU first

---
 tests/test_models.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index baac0e12ef85e..db73048d9dee4 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -101,9 +101,10 @@ def assert_good_acc():
 
         # if model and state loaded correctly, predictions will be good even though we
        # haven't trained with the new loaded model
-        dp_model = new_trainer.model.module
+        dp_model = new_trainer.model
         dp_model.eval()
-        _ = [run_prediction(dataloader, dp_model) for dataloader in trainer.val_dataloader]
+
+        _ = [run_prediction(dataloader, dp_model, dp=True) for dataloader in trainer.val_dataloader]
 
     # new model
     model = LightningTestModel(hparams)
@@ -1422,7 +1423,7 @@ def load_model(exp, save_dir, on_gpu, map_location=None, module_class=LightningT
     return trained_model
 
 
-def run_prediction(dataloader, trained_model):
+def run_prediction(dataloader, trained_model, dp=False):
     # run prediction on 1 batch
     for batch in dataloader:
         break
@@ -1430,7 +1431,10 @@ def run_prediction(dataloader, trained_model):
     x, y = batch
     x = x.view(x.size(0), -1)
 
-    y_hat = trained_model(x)
+    if dp:
+        y_hat = trained_model(x, 0)
+    else:
+        y_hat = trained_model(x)
 
     # acc
     labels_hat = torch.argmax(y_hat, dim=1)

From 4ba419abfac0c778afb98826f1be3a2f13bdf20a Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:50:08 -0400
Subject: [PATCH 11/31] added load on CPU first

---
 tests/test_models.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index db73048d9dee4..2fc7890f4d103 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1432,15 +1432,17 @@ def run_prediction(dataloader, trained_model, dp=False):
     x = x.view(x.size(0), -1)
 
     if dp:
-        y_hat = trained_model(x, 0)
+        output = trained_model(batch, 0)
+        acc = output['val_acc']
+
     else:
         y_hat = trained_model(x)
 
-    # acc
-    labels_hat = torch.argmax(y_hat, dim=1)
-    acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
-    acc = torch.tensor(acc)
-    acc = acc.item()
+        # acc
+        labels_hat = torch.argmax(y_hat, dim=1)
+        acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
+        acc = torch.tensor(acc)
+        acc = acc.item()
 
     assert acc > 0.50, f'this model is expected to get > 0.50 in test set (it got {acc})'

From 587f4d70991d8a62ae58c1e710cabafdd496a593 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:53:33 -0400
Subject: [PATCH 12/31] added load on CPU first

---
 tests/test_models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_models.py b/tests/test_models.py
index 2fc7890f4d103..54165d238c66e 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1434,6 +1434,7 @@ def run_prediction(dataloader, trained_model, dp=False):
     if dp:
         output = trained_model(batch, 0)
         acc = output['val_acc']
+        acc = torch.mean(acc).item()
 
     else:
         y_hat = trained_model(x)

From 90971e92617511b1ee5514c3f97655b5c7c7a4a4 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 07:56:56 -0400
Subject: [PATCH 13/31] added load on CPU first

---
 tests/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 54165d238c66e..d8b4f977522db 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -52,7 +52,7 @@ def test_amp_dp_resume():
 
     trainer_options = dict(
         show_progress_bar=True,
-        max_nb_epochs=2,
+        max_nb_epochs=4,
         gpus=2,
         distributed_backend='dp',
     )
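Patches 11 and 12 converge on a detail of DataParallel validation: the gathered output holds one copy of a scalar metric per device, so the dp branch of run_prediction must reduce 'val_acc' with a mean before comparing it to a threshold. A toy illustration with made-up per-GPU values:

    import torch

    gathered = {'val_acc': torch.tensor([0.54, 0.58])}  # one entry per GPU (made up)
    acc = torch.mean(gathered['val_acc']).item()
    assert abs(acc - 0.56) < 1e-6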
From 04b2d3b038f6d3ad3dee58e2113bab2949d81f17 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:15:46 -0400
Subject: [PATCH 14/31] added load on CPU first

---
 tests/debug.py | 74 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/tests/debug.py b/tests/debug.py
index 5efa25da628f1..4a1a26a73701f 100644
--- a/tests/debug.py
+++ b/tests/debug.py
@@ -214,10 +214,23 @@ def get_hparams(continue_training=False, hpc_exp_number=0):
 
 
 def main():
-    """Verify test() on fitted model"""
+    """
+    Make sure DDP + AMP continue training correctly
+    :return:
+    """
+    if not can_run_gpu_test():
+        return
+
     hparams = get_hparams()
     model = LightningTestModel(hparams)
 
+    trainer_options = dict(
+        show_progress_bar=True,
+        max_nb_epochs=4,
+        gpus=2,
+        distributed_backend='dp',
+    )
+
     save_dir = init_save_dir()
 
     # exp file to get meta
@@ -228,31 +241,58 @@ def main():
     # exp file to get weights
     checkpoint = ModelCheckpoint(save_dir)
 
-    trainer_options = dict(
-        show_progress_bar=False,
-        max_nb_epochs=1,
-        train_percent_check=0.4,
-        val_percent_check=0.2,
-        checkpoint_callback=checkpoint,
-        experiment=exp,
-        gpus=[0, 1],
-        distributed_backend='ddp'
-    )
+    # add these to the trainer options
+    trainer_options['experiment'] = exp
 
     # fit model
     trainer = Trainer(**trainer_options)
+    trainer.is_slurm_managing_tasks = True
     result = trainer.fit(model)
 
+    # track epoch before saving
+    real_global_epoch = trainer.current_epoch
+
     # correct result and ok accuracy
-    assert result == 1, 'training failed to complete'
-    pretrained_model = load_model(exp, save_dir, on_gpu=True, module_class=LightningTestModel)
+    assert result == 1, 'amp + dp model failed to complete'
 
+    # ---------------------------
+    # HPC LOAD/SAVE
+    # ---------------------------
+    # save
+    trainer.hpc_save(save_dir, exp)
+
+    # init new trainer
+    new_exp = get_exp(False, version=exp.version)
+    trainer_options['experiment'] = new_exp
+    trainer_options['train_percent_check'] = 0.2
+    trainer_options['val_percent_check'] = 0.2
+    trainer_options['max_nb_epochs'] = 1
     new_trainer = Trainer(**trainer_options)
-    new_trainer.test(pretrained_model)
 
-    # test we have good test accuracy
-    assert_ok_test_acc(new_trainer)
-    # clear_save_dir()
+    # set the epoch start hook so we can predict before the model does the full training
+    def assert_good_acc():
+        assert trainer.current_epoch == real_global_epoch and trainer.current_epoch > 0
+
+        # if model and state loaded correctly, predictions will be good even though we
+        # haven't trained with the new loaded model
+        dp_model = new_trainer.model
+        dp_model.eval()
+
+        _ = [run_prediction(dataloader, dp_model, dp=True) for dataloader in trainer.val_dataloader]
+
+    # new model
+    model = LightningTestModel(hparams)
+    model.on_sanity_check_start = assert_good_acc
+
+    # fit new model which should load hpc weights
+    pdb.set_trace()
+    new_trainer.fit(model)
+
+    # test freeze on gpu
+    model.freeze()
+    model.unfreeze()
+
+    clear_save_dir()
 
 
 if __name__ == '__main__':

From 5f98e5873a6d6358ee799953302cd2abf893fe97 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:16:16 -0400
Subject: [PATCH 15/31] added load on CPU first

---
 tests/debug.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/debug.py b/tests/debug.py
index 4a1a26a73701f..ebb27c10da110 100644
--- a/tests/debug.py
+++ b/tests/debug.py
@@ -218,9 +218,6 @@ def main():
     Make sure DDP + AMP continue training correctly
     :return:
     """
-    if not can_run_gpu_test():
-        return
-
     hparams = get_hparams()
     model = LightningTestModel(hparams)

From f341014e1dca8b03705f15419fa6dfc851b3e34c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:21:40 -0400
Subject: [PATCH 16/31] added load on CPU first

---
 pytorch_lightning/trainer/trainer_io.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index 7793406723f6e..b36c2a2404b57 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -1,6 +1,7 @@
 import os
 import re
 import signal
+import pdb
 from subprocess import call
 
 import torch
@@ -185,8 +186,10 @@ def restore_hpc_weights_if_needed(self, model):
         folderpath = self.weights_save_path
         if os.path.exists(folderpath):
             files = os.listdir(folderpath)
+            pdb.set_trace()
             hpc_weight_paths = [x for x in files if 'hpc_ckpt' in x]
 
+
             # if hpc weights exist restore model
             if len(hpc_weight_paths) > 0:
                 self.hpc_load(folderpath, self.on_gpu)
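The code patch 16 instruments boils down to a filename scan: HPC checkpoints are written as hpc_ckpt_<n>.ckpt and the highest n wins. A standalone sketch of that discovery logic; the function name is invented, and the real methods are restore_hpc_weights_if_needed and max_ckpt_in_folder:

    import os
    import re

    def latest_hpc_ckpt_number(folderpath):
        names = [x for x in os.listdir(folderpath) if 'hpc_ckpt' in x]
        if not names:
            return None
        # hpc_ckpt_3.ckpt -> 3
        return max(int(re.search(r'ckpt_(\d+)', x).group(1)) for x in names)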
From b0ea8112222ed5fb1cd45f66edc0d062cc0e3df4 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:24:55 -0400
Subject: [PATCH 17/31] added load on CPU first

---
 tests/debug.py       | 4 +++-
 tests/test_models.py | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/debug.py b/tests/debug.py
index ebb27c10da110..52241aa096beb 100644
--- a/tests/debug.py
+++ b/tests/debug.py
@@ -240,6 +240,7 @@ def main():
 
     # add these to the trainer options
     trainer_options['experiment'] = exp
+    trainer_options['checkpoint_callback'] = checkpoint
 
     # fit model
     trainer = Trainer(**trainer_options)
@@ -258,9 +259,11 @@ def main():
     # save
     trainer.hpc_save(save_dir, exp)
 
+
     # init new trainer
     new_exp = get_exp(False, version=exp.version)
     trainer_options['experiment'] = new_exp
+    trainer_options['checkpoint_callback'] = ModelCheckpoint(save_dir)
     trainer_options['train_percent_check'] = 0.2
     trainer_options['val_percent_check'] = 0.2
     trainer_options['max_nb_epochs'] = 1
@@ -282,7 +285,6 @@ def assert_good_acc():
     model.on_sanity_check_start = assert_good_acc
 
     # fit new model which should load hpc weights
-    pdb.set_trace()
     new_trainer.fit(model)
 
     # test freeze on gpu
diff --git a/tests/test_models.py b/tests/test_models.py
index d8b4f977522db..31ac5fc9211b3 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -69,6 +69,7 @@ def test_amp_dp_resume():
 
     # add these to the trainer options
     trainer_options['experiment'] = exp
+    trainer_options['checkpoint_callback'] = checkpoint
 
     # fit model
     trainer = Trainer(**trainer_options)
@@ -87,9 +88,11 @@ def test_amp_dp_resume():
     # save
     trainer.hpc_save(save_dir, exp)
 
+
     # init new trainer
     new_exp = get_exp(False, version=exp.version)
     trainer_options['experiment'] = new_exp
+    trainer_options['checkpoint_callback'] = ModelCheckpoint(save_dir)
     trainer_options['train_percent_check'] = 0.2
     trainer_options['val_percent_check'] = 0.2
     trainer_options['max_nb_epochs'] = 1

From addef006b0cd116626e0f01c104f0d5172fc6d55 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:29:29 -0400
Subject: [PATCH 18/31] added load on CPU first

---
 pytorch_lightning/trainer/trainer_io.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index b36c2a2404b57..b697e6730acb4 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -186,7 +186,6 @@ def restore_hpc_weights_if_needed(self, model):
         folderpath = self.weights_save_path
         if os.path.exists(folderpath):
             files = os.listdir(folderpath)
-            pdb.set_trace()
             hpc_weight_paths = [x for x in files if 'hpc_ckpt' in x]
 
 

From 390295aa80d3717b030aa0ac90eccfdab17e7453 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:31:14 -0400
Subject: [PATCH 19/31] added load on CPU first

---
 tests/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 31ac5fc9211b3..6bcb0041690ef 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -52,7 +52,7 @@ def test_amp_dp_resume():
 
     trainer_options = dict(
         show_progress_bar=True,
-        max_nb_epochs=4,
+        max_nb_epochs=1,
         gpus=2,
         distributed_backend='dp',
     )

From 0f8f266fd5733b2d55ab50c477a26d88d4892f91 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:32:25 -0400
Subject: [PATCH 20/31] added load on CPU first

---
 tests/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 6bcb0041690ef..5fc5c1076a093 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -52,7 +52,7 @@ def test_amp_dp_resume():
 
     trainer_options = dict(
         show_progress_bar=True,
-        max_nb_epochs=1,
+        max_nb_epochs=2,
         gpus=2,
         distributed_backend='dp',
     )

From 3d5c6d7f11d63774fab9eb7f359ecde513317f85 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:34:23 -0400
Subject: [PATCH 21/31] added load on CPU first

---
 tests/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 5fc5c1076a093..2eecceb41fa0d 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -100,7 +100,7 @@ def test_amp_dp_resume():
 
     # set the epoch start hook so we can predict before the model does the full training
     def assert_good_acc():
-        assert trainer.current_epoch == real_global_epoch and trainer.current_epoch > 0
+        assert new_trainer.current_epoch == real_global_epoch and new_trainer.current_epoch > 0
 
         # if model and state loaded correctly, predictions will be good even though we
         # haven't trained with the new loaded model
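Patch 21 fixes a subtle test bug: the injected hook asserted on `trainer`, the object from the first fit, whose `current_epoch` says nothing about the resumed run. The pattern itself, overriding a no-op model hook so assertions execute inside fit() right after the restore, looks like this with toy stand-ins for the real classes:

    class ToyTrainer:
        def __init__(self, epoch):
            self.current_epoch = epoch

    class ToyModel:
        def on_sanity_check_start(self):
            pass  # no-op hook the trainer invokes before the sanity check

    real_global_epoch = 1              # recorded before hpc_save
    new_trainer = ToyTrainer(epoch=1)  # epoch restored from the checkpoint

    model = ToyModel()

    def assert_good_acc():
        # assert against the trainer driving the resumed run, not the old one
        assert new_trainer.current_epoch == real_global_epoch

    model.on_sanity_check_start = assert_good_acc
    model.on_sanity_check_start()  # in the real test, fit() makes this call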
From a5d842a309e71a6033ad1fa6e863d58d6f201323 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:40:16 -0400
Subject: [PATCH 22/31] added load on CPU first

---
 tests/test_models.py | 89 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 88 insertions(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 2eecceb41fa0d..e81f57b70c870 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -39,7 +39,7 @@
 # ------------------------------------------------------------------------
 # TESTS
 # ------------------------------------------------------------------------
-def test_amp_dp_resume():
+def test_dp_resume():
     """
     Make sure DDP + AMP continue training correctly
     :return:
@@ -88,6 +88,93 @@ def test_dp_resume():
     # save
     trainer.hpc_save(save_dir, exp)
 
+
+    # init new trainer
+    new_exp = get_exp(False, version=exp.version)
+    trainer_options['experiment'] = new_exp
+    trainer_options['checkpoint_callback'] = ModelCheckpoint(save_dir)
+    trainer_options['train_percent_check'] = 0.2
+    trainer_options['val_percent_check'] = 0.2
+    trainer_options['max_nb_epochs'] = 1
+    new_trainer = Trainer(**trainer_options)
+
+    # set the epoch start hook so we can predict before the model does the full training
+    def assert_good_acc():
+        assert new_trainer.current_epoch == real_global_epoch and new_trainer.current_epoch > 0
+
+        # if model and state loaded correctly, predictions will be good even though we
+        # haven't trained with the new loaded model
+        dp_model = new_trainer.model
+        dp_model.eval()
+
+        _ = [run_prediction(dataloader, dp_model, dp=True) for dataloader in trainer.val_dataloader]
+
+    # new model
+    model = LightningTestModel(hparams)
+    model.on_sanity_check_start = assert_good_acc
+
+    # fit new model which should load hpc weights
+    new_trainer.fit(model)
+
+    # test freeze on gpu
+    model.freeze()
+    model.unfreeze()
+
+    clear_save_dir()
+
+
+def test_amp_ddp_resume():
+    """
+    Make sure DDP + AMP continue training correctly
+    :return:
+    """
+    if not can_run_gpu_test():
+        return
+
+    hparams = get_hparams()
+    model = LightningTestModel(hparams)
+
+    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
+
+    trainer_options = dict(
+        show_progress_bar=True,
+        max_nb_epochs=2,
+        gpus=2,
+        use_amp=True,
+        distributed_backend='ddp',
+    )
+
+    save_dir = init_save_dir()
+
+    # exp file to get meta
+    exp = get_exp(False)
+    exp.argparse(hparams)
+    exp.save()
+
+    # exp file to get weights
+    checkpoint = ModelCheckpoint(save_dir)
+
+    # add these to the trainer options
+    trainer_options['experiment'] = exp
+    trainer_options['checkpoint_callback'] = checkpoint
+
+    # fit model
+    trainer = Trainer(**trainer_options)
+    trainer.is_slurm_managing_tasks = True
+    result = trainer.fit(model)
+
+    # track epoch before saving
+    real_global_epoch = trainer.current_epoch
+
+    # correct result and ok accuracy
+    assert result == 1, 'amp + dp model failed to complete'
+
+    # ---------------------------
+    # HPC LOAD/SAVE
+    # ---------------------------
+    # save
+    trainer.hpc_save(save_dir, exp)
+
+    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
 
     # init new trainer
     new_exp = get_exp(False, version=exp.version)
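For reference, the environment the re-added DDP test manipulates is the standard torch.distributed rendezvous contract: each process reads MASTER_ADDR/MASTER_PORT and joins the group with its rank. A rough sketch with placeholder values; a real run needs one such call per process, so executing this alone would block waiting for peers:

    import os
    import torch.distributed as dist

    os.environ['MASTER_ADDR'] = '127.0.0.1'  # placeholder
    os.environ['MASTER_PORT'] = '12910'      # placeholder

    dist.init_process_group('nccl', rank=0, world_size=2)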
From afa42d26eb850b3dcf81afd10ca52706b74567f3 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:41:02 -0400
Subject: [PATCH 23/31] added load on CPU first

---
 tests/test_models.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index e81f57b70c870..208614d882c62 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -39,7 +39,7 @@
 # ------------------------------------------------------------------------
 # TESTS
 # ------------------------------------------------------------------------
-def test_dp_resume():
+def test_amp_ddp_resume():
     """
     Make sure DDP + AMP continue training correctly
     :return:
@@ -50,11 +50,14 @@ def test_amp_ddp_resume():
     hparams = get_hparams()
     model = LightningTestModel(hparams)
 
+    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
+
     trainer_options = dict(
         show_progress_bar=True,
         max_nb_epochs=2,
         gpus=2,
-        distributed_backend='dp',
+        use_amp=True,
+        distributed_backend='ddp',
     )
 
     save_dir = init_save_dir()
@@ -88,6 +91,8 @@ def test_amp_ddp_resume():
     # save
     trainer.hpc_save(save_dir, exp)
 
+    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
+
     # init new trainer
     new_exp = get_exp(False, version=exp.version)
     trainer_options['experiment'] = new_exp
@@ -122,7 +127,7 @@ def assert_good_acc():
     clear_save_dir()
 
 
-def test_amp_ddp_resume():
+def test_dp_resume():
     """
     Make sure DDP + AMP continue training correctly
     :return:
@@ -133,14 +138,11 @@ def test_dp_resume():
     hparams = get_hparams()
     model = LightningTestModel(hparams)
 
-    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
-
     trainer_options = dict(
         show_progress_bar=True,
         max_nb_epochs=2,
         gpus=2,
-        use_amp=True,
-        distributed_backend='ddp',
+        distributed_backend='dp',
     )
 
     save_dir = init_save_dir()
@@ -174,8 +176,6 @@ def test_dp_resume():
     # save
     trainer.hpc_save(save_dir, exp)
 
-    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
-
     # init new trainer
     new_exp = get_exp(False, version=exp.version)
     trainer_options['experiment'] = new_exp

From 3b2a39b4a425ee11da423f35fbec42e974ece6b6 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 08:46:28 -0400
Subject: [PATCH 24/31] added load on CPU first

---
 tests/test_models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_models.py b/tests/test_models.py
index 208614d882c62..84cbc87bb2f99 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -101,6 +101,7 @@ def test_amp_ddp_resume():
     trainer_options['val_percent_check'] = 0.2
     trainer_options['max_nb_epochs'] = 1
     new_trainer = Trainer(**trainer_options)
+    new_trainer.is_slurm_managing_tasks = True
 
     # set the epoch start hook so we can predict before the model does the full training
     def assert_good_acc():
From 059247d5c908a00cef0e8077836d9f3434f8a376 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 09:16:26 -0400
Subject: [PATCH 25/31] added load on CPU first

---
 tests/test_models.py | 89 --------------------------------------------
 1 file changed, 89 deletions(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 84cbc87bb2f99..9822e83e7baf9 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -39,95 +39,6 @@
 # ------------------------------------------------------------------------
 # TESTS
 # ------------------------------------------------------------------------
-def test_amp_ddp_resume():
-    """
-    Make sure DDP + AMP continue training correctly
-    :return:
-    """
-    if not can_run_gpu_test():
-        return
-
-    hparams = get_hparams()
-    model = LightningTestModel(hparams)
-
-    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
-
-    trainer_options = dict(
-        show_progress_bar=True,
-        max_nb_epochs=2,
-        gpus=2,
-        use_amp=True,
-        distributed_backend='ddp',
-    )
-
-    save_dir = init_save_dir()
-
-    # exp file to get meta
-    exp = get_exp(False)
-    exp.argparse(hparams)
-    exp.save()
-
-    # exp file to get weights
-    checkpoint = ModelCheckpoint(save_dir)
-
-    # add these to the trainer options
-    trainer_options['experiment'] = exp
-    trainer_options['checkpoint_callback'] = checkpoint
-
-    # fit model
-    trainer = Trainer(**trainer_options)
-    trainer.is_slurm_managing_tasks = True
-    result = trainer.fit(model)
-
-    # track epoch before saving
-    real_global_epoch = trainer.current_epoch
-
-    # correct result and ok accuracy
-    assert result == 1, 'amp + dp model failed to complete'
-
-    # ---------------------------
-    # HPC LOAD/SAVE
-    # ---------------------------
-    # save
-    trainer.hpc_save(save_dir, exp)
-
-    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
-
-    # init new trainer
-    new_exp = get_exp(False, version=exp.version)
-    trainer_options['experiment'] = new_exp
-    trainer_options['checkpoint_callback'] = ModelCheckpoint(save_dir)
-    trainer_options['train_percent_check'] = 0.2
-    trainer_options['val_percent_check'] = 0.2
-    trainer_options['max_nb_epochs'] = 1
-    new_trainer = Trainer(**trainer_options)
-    new_trainer.is_slurm_managing_tasks = True
-
-    # set the epoch start hook so we can predict before the model does the full training
-    def assert_good_acc():
-        assert new_trainer.current_epoch == real_global_epoch and new_trainer.current_epoch > 0
-
-        # if model and state loaded correctly, predictions will be good even though we
-        # haven't trained with the new loaded model
-        dp_model = new_trainer.model
-        dp_model.eval()
-
-        _ = [run_prediction(dataloader, dp_model, dp=True) for dataloader in trainer.val_dataloader]
-
-    # new model
-    model = LightningTestModel(hparams)
-    model.on_sanity_check_start = assert_good_acc
-
-    # fit new model which should load hpc weights
-    new_trainer.fit(model)
-
-    # test freeze on gpu
-    model.freeze()
-    model.unfreeze()
-
-    clear_save_dir()
-
-
 def test_dp_resume():
     """
     Make sure DDP + AMP continue training correctly

From b73940b64e8d7a23b5a14a98f6047a0b4e3b168d Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 09:37:19 -0400
Subject: [PATCH 26/31] added load on CPU first

---
 pytorch_lightning/trainer/trainer.py    | 1 -
 pytorch_lightning/trainer/trainer_io.py | 1 -
 tests/debug.py                          | 1 -
 3 files changed, 3 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index fba5a27667491..7d207502f9112 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -269,7 +269,6 @@ def __parse_gpu_ids(self, gpus):
         else:
             raise Exception('gpus has to be a string, int or list of ints')
 
-
         return gpus
 
     def __set_root_gpu(self, gpus):
diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index b697e6730acb4..dc5753d7cf7e9 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -188,7 +188,6 @@ def restore_hpc_weights_if_needed(self, model):
             files = os.listdir(folderpath)
             hpc_weight_paths = [x for x in files if 'hpc_ckpt' in x]
 
-
             # if hpc weights exist restore model
             if len(hpc_weight_paths) > 0:
                 self.hpc_load(folderpath, self.on_gpu)
diff --git a/tests/debug.py b/tests/debug.py
index 52241aa096beb..6016dee3ba323 100644
--- a/tests/debug.py
+++ b/tests/debug.py
@@ -259,7 +259,6 @@ def main():
     # save
     trainer.hpc_save(save_dir, exp)
 
-
     # init new trainer
     new_exp = get_exp(False, version=exp.version)
     trainer_options['experiment'] = new_exp
From 5c41ed44e9878044bb8bd982bfae76290ba73e9c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 09:37:52 -0400
Subject: [PATCH 27/31] added load on CPU first

---
 tests/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 9822e83e7baf9..732aa5b95c186 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -41,7 +41,7 @@
 # ------------------------------------------------------------------------
 def test_dp_resume():
     """
-    Make sure DDP + AMP continue training correctly
+    Make sure DP continues training correctly
     :return:
     """
     if not can_run_gpu_test():

From 2d31ac1343d4cd8e2c8ae0189428864c1765dc4d Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 14:47:53 -0400
Subject: [PATCH 28/31] added print logs

---
 pytorch_lightning/trainer/trainer_io.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index dc5753d7cf7e9..475cc6ac1b5cb 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -262,6 +262,7 @@ def hpc_load(self, folderpath, on_gpu):
 
         # load on CPU first
         checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage)
+        print(f'restored hpc model from: {filepath}')
 
         # load model state
         model = self.__get_model()

From 653a88bafa27a045f037cbb3347adce4428cbbb3 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 14:48:06 -0400
Subject: [PATCH 29/31] added print logs

---
 pytorch_lightning/trainer/trainer_io.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index 475cc6ac1b5cb..149634743b77b 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -262,7 +262,6 @@ def hpc_load(self, folderpath, on_gpu):
 
         # load on CPU first
         checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage)
-        print(f'restored hpc model from: {filepath}')
 
         # load model state
         model = self.__get_model()
@@ -279,6 +278,8 @@ def hpc_load(self, folderpath, on_gpu):
         # call model hook
         model.on_hpc_load(checkpoint)
 
+        print(f'restored hpc model from: {filepath}')
+
     def max_ckpt_in_folder(self, path, name_key='ckpt_'):
         files = os.listdir(path)
         files = [x for x in files if name_key in x]

From 7bbea9c9805c3ee818dcef9d38f0c63ed9ed1a3d Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 14:51:30 -0400
Subject: [PATCH 30/31] changed close order

---
 pytorch_lightning/trainer/trainer_io.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index 149634743b77b..51131d32ed096 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -104,6 +104,9 @@ def sig_handler(self, signum, frame):
             else:
                 print('requeue failed...')
 
+            # close experiment to avoid issues
+            self.experiment.close()
+
     def term_handler(self, signum, frame):
         # save
         print("bypassing sigterm")
@@ -237,9 +240,6 @@ def hpc_save(self, folderpath, experiment):
         # save exp to make sure we get all the metrics
         experiment.save()
 
-        # close experiment to avoid issues
-        experiment.close()
-
         ckpt_number = self.max_ckpt_in_folder(folderpath) + 1
 
         if not os.path.exists(folderpath):
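Patch 30's reordering matters because hpc_save also runs mid-training, from the signal handler just before a requeue: closing the experiment inside hpc_save would kill logging for the remainder of the run. After the move, only the SIGUSR1 path closes it, once no more metrics will be written. A runnable toy version of that flow, with invented stand-ins for the experiment and checkpoint objects:

    import os
    import signal

    class ToyExperiment:  # stand-in for the test-tube experiment in the patch
        def save(self):
            print('saved metrics')

        def close(self):
            print('closed experiment')

    experiment = ToyExperiment()

    def hpc_save():
        experiment.save()  # after patch 30, saving no longer closes the logger
        print('wrote hpc_ckpt')

    def sig_handler(signum, frame):
        hpc_save()
        # on SLURM the requeue (`scontrol requeue <job id>`) would happen here
        experiment.close()  # closing belongs to the shutdown path only

    signal.signal(signal.SIGUSR1, sig_handler)
    os.kill(os.getpid(), signal.SIGUSR1)  # simulate SLURM's warning signal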
From 08da62e89334bbd52545456c97495ee1139b2140 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 10 Sep 2019 16:26:22 -0400
Subject: [PATCH 31/31] changed close order

---
 pytorch_lightning/trainer/trainer_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer_io.py b/pytorch_lightning/trainer/trainer_io.py
index 51131d32ed096..71b677cc417e4 100644
--- a/pytorch_lightning/trainer/trainer_io.py
+++ b/pytorch_lightning/trainer/trainer_io.py
@@ -79,7 +79,7 @@ def register_slurm_signal_handlers(self):
         except Exception as e:
             pass
 
-        if on_slurm and self.proc_rank == 0:
+        if on_slurm:
             print('set slurm handle signals')
             signal.signal(signal.SIGUSR1, self.sig_handler)
             signal.signal(signal.SIGTERM, self.term_handler)
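The final patch drops the `proc_rank == 0` guard, presumably because the requeue signals arrive at every task in the job rather than only at rank 0; a rank that never registered the handlers would die with the default signal disposition instead of saving and requeueing cleanly. Registration itself is one call per signal:

    import signal

    def sig_handler(signum, frame):
        print(f'got signal {signum}')  # save + requeue in the real handler

    # every process registers, regardless of its rank
    signal.signal(signal.SIGUSR1, sig_handler)
    signal.signal(signal.SIGTERM, sig_handler)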