From 9ae1eb28e1baf036bd95d70d270391693c7b865a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 5 Oct 2019 13:22:23 -0400 Subject: [PATCH 1/3] Fixes #234 --- pytorch_lightning/trainer/trainer.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index cb0e9b858cd3f..1518f241e4c0b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -882,12 +882,25 @@ def __init_tcp_connection(self): :param tries: :return: """ - # sets the appropriate port + + # use slurm job id for the port number + # guarantees unique ports across jobs from same grid search + try: + # use the last 4 numbers in the job id as the id + default_port = os.environ['SLURM_JOB_ID'] + default_port = default_port[-4:] + + # all ports should be in the 10k+ range + default_port = int(default_port) + 10000 + + except Exception as e: + default_port = 12910 + + # if user gave a port number, use that one instead try: port = os.environ['MASTER_PORT'] except Exception: - port = 12910 - os.environ['MASTER_PORT'] = str(port) + os.environ['MASTER_PORT'] = str(default_port) # figure out the root node addr try: From 71353874dcee217a315bc873ee87b3db5ef81771 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 5 Oct 2019 13:28:49 -0400 Subject: [PATCH 2/3] default logger version is now slurm job id --- pytorch_lightning/trainer/trainer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 1518f241e4c0b..3faf8c3f00045 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -178,8 +178,16 @@ def __init__(self, # configure logger self.logger = logger if self.logger is None: + # use SLURM job id as the version number + try: + job_id = os.environ['SLURM_JOB_ID'] + job_id = int(job_id) + except Exception as e: + job_id = None + self.logger = TestTubeLogger( save_dir=self.default_save_path, + version=job_id, name='lightning_logs' ) From 61475b5c972698fc53b39558d54219a21fa4aca7 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 5 Oct 2019 13:30:38 -0400 Subject: [PATCH 3/3] default logger version is now slurm job id --- pytorch_lightning/trainer/trainer.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 3faf8c3f00045..093e9d14cd985 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -178,16 +178,9 @@ def __init__(self, # configure logger self.logger = logger if self.logger is None: - # use SLURM job id as the version number - try: - job_id = os.environ['SLURM_JOB_ID'] - job_id = int(job_id) - except Exception as e: - job_id = None - self.logger = TestTubeLogger( save_dir=self.default_save_path, - version=job_id, + version=self.slurm_job_id, name='lightning_logs' ) @@ -248,6 +241,15 @@ def __init__(self, self.amp_level = amp_level self.__init_amp(use_amp) + @property + def slurm_job_id(self): + try: + job_id = os.environ['SLURM_JOB_ID'] + job_id = int(job_id) + except Exception as e: + job_id = None + return job_id + def __configure_weights_path(self, checkpoint_callback, weights_save_path): """ Weight path set in this priority: