From fbff8b6beec0cb01d35fee3c0d79070267e96e33 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Wed, 25 Nov 2020 22:48:48 +0100
Subject: [PATCH 1/8] refactor imports of optional dependencies

---
 pytorch_lightning/__init__.py                         |  7 -------
 .../accelerators/accelerator_connector.py             |  8 ++------
 pytorch_lightning/accelerators/horovod_accelerator.py |  8 ++------
 pytorch_lightning/core/lightning.py                   |  2 +-
 pytorch_lightning/trainer/data_loading.py             | 10 +++-------
 pytorch_lightning/utilities/__init__.py               |  1 +
 pytorch_lightning/utilities/debugging.py              |  2 +-
 7 files changed, 10 insertions(+), 28 deletions(-)

diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py
index 06cb7a60f0473..f2e9206dd9d13 100644
--- a/pytorch_lightning/__init__.py
+++ b/pytorch_lightning/__init__.py
@@ -68,12 +68,5 @@
         'metrics',
     ]
 
-    # necessary for regular bolts imports. Skip exception since bolts is not always installed
-    try:
-        from pytorch_lightning import bolts
-    except ImportError:
-        pass
-    # __call__ = __all__
-
 # for compatibility with namespace packages
 __import__('pkg_resources').declare_namespace(__name__)
diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py
index c40600785a558..c6c3be1a06109 100644
--- a/pytorch_lightning/accelerators/accelerator_connector.py
+++ b/pytorch_lightning/accelerators/accelerator_connector.py
@@ -15,7 +15,7 @@
 import os
 
 import torch
-from pytorch_lightning.utilities import device_parser, XLA_AVAILABLE
+from pytorch_lightning.utilities import device_parser, XLA_AVAILABLE, HOROVOD_AVAILABLE
 from pytorch_lightning.utilities import rank_zero_only
 from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -24,12 +24,8 @@
 from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment
 from pytorch_lightning.accelerators.accelerator import Accelerator
 
-try:
+if HOROVOD_AVAILABLE:
     import horovod.torch as hvd
-except (ModuleNotFoundError, ImportError):
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
 
 
 class AcceleratorConnector:
diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py
index 3d9191914566d..8c3c4adf4e0df 100644
--- a/pytorch_lightning/accelerators/horovod_accelerator.py
+++ b/pytorch_lightning/accelerators/horovod_accelerator.py
@@ -18,15 +18,11 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
-from pytorch_lightning.utilities import AMPType
+from pytorch_lightning.utilities import AMPType, HOROVOD_AVAILABLE
 from pytorch_lightning.utilities.distributed import rank_zero_only
 
-try:
+if HOROVOD_AVAILABLE:
     import horovod.torch as hvd
-except (ModuleNotFoundError, ImportError):
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
 
 
 class HorovodAccelerator(Accelerator):
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index e888c6f4a87df..96bdf9a7be188 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -1825,7 +1825,7 @@ def __get_hparams_assignment_variable(self):
                 line = re.sub(r"\s+", "", line, flags=re.UNICODE)
                 if ".hparams=" in line:
                     return line.split("=")[1]
-        except Exception as e:
+        except Exception:
             return "hparams"
 
         return None
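The hunks in this patch all make the same change: each module's private try/except probe for Horovod is replaced by a single flag computed in pytorch_lightning.utilities. A minimal before/after sketch of the pattern, with the names taken from the hunks themselves:

    # Before: every module derived its own HOROVOD_AVAILABLE flag at import time.
    try:
        import horovod.torch as hvd
    except (ModuleNotFoundError, ImportError):
        HOROVOD_AVAILABLE = False
    else:
        HOROVOD_AVAILABLE = True

    # After: the flag is computed once in pytorch_lightning/utilities/__init__.py
    # and consuming modules only guard the import.
    from pytorch_lightning.utilities import HOROVOD_AVAILABLE

    if HOROVOD_AVAILABLE:
        import horovod.torch as hvd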
diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index a15e9bba2af63..4a7b14d0b1fe9 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -16,14 +16,14 @@
 import platform
 from abc import ABC
 from copy import deepcopy
-from typing import Callable, Iterable, List, Optional, Tuple, Union
+from typing import Union, List, Tuple, Callable, Optional, Iterable
 
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.core import LightningModule
-from pytorch_lightning.utilities import TPU_AVAILABLE, rank_zero_warn
+from pytorch_lightning.utilities import rank_zero_warn, TPU_AVAILABLE, HOROVOD_AVAILABLE
 from pytorch_lightning.utilities.data import has_iterable_dataset, has_len
 from pytorch_lightning.utilities.debugging import InternalDebugger
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -32,12 +32,8 @@
 if TPU_AVAILABLE:
     import torch_xla.core.xla_model as xm
 
-try:
+if HOROVOD_AVAILABLE:
     import horovod.torch as hvd
-except (ModuleNotFoundError, ImportError):
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
 
 
 class TrainerDataLoadingMixin(ABC):
diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py
index 916e434e5ff06..1e2eeea9f456c 100644
--- a/pytorch_lightning/utilities/__init__.py
+++ b/pytorch_lightning/utilities/__init__.py
@@ -47,6 +47,7 @@ def _module_available(module_path: str) -> bool:
 NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast")
 OMEGACONF_AVAILABLE = _module_available("omegaconf")
 HYDRA_AVAILABLE = _module_available("hydra")
+HOROVOD_AVAILABLE = _module_available("horovod.torch")
 
 TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists()
 FAIRSCALE_AVAILABLE = platform.system() != 'Windows' and _module_available('fairscale.nn.data_parallel')
diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py
index f7b9e79b7f932..9264e2a49810d 100644
--- a/pytorch_lightning/utilities/debugging.py
+++ b/pytorch_lightning/utilities/debugging.py
@@ -87,7 +87,7 @@ def track_load_dataloader_call(self, name, dataloaders):
         for dl in dataloaders:
             try:
                 length = len(dl)
-            except Exception as e:
+            except Exception:
                 length = -1
             lengths.append(length)
 
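The one-line addition to pytorch_lightning/utilities/__init__.py above is the heart of the series: HOROVOD_AVAILABLE joins the other capability flags and is computed exactly once. The body of _module_available is outside the hunk, so the following is only a sketch of what such a probe typically looks like, assuming an importlib-based implementation:

    from importlib.util import find_spec

    def _module_available(module_path: str) -> bool:
        # Check whether a module can be imported, without importing it.
        # find_spec raises ModuleNotFoundError when a parent package
        # (e.g. "horovod" for "horovod.torch") is absent.
        try:
            return find_spec(module_path) is not None
        except ModuleNotFoundError:
            return False

    HOROVOD_AVAILABLE = _module_available("horovod.torch")  # False when horovod is not installed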
 import tests.base.develop_utils as tutils
 from pytorch_lightning import Trainer
 from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator
-from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult
 from pytorch_lightning.metrics.classification.accuracy import Accuracy
-from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE
+from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, HOROVOD_AVAILABLE
 from tests.base import EvalModelTemplate
 from tests.base.models import BasicGAN
 
-try:
+if HOROVOD_AVAILABLE:
     import horovod
     from horovod.common.util import nccl_built
-except ImportError:
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
 
 
 # This script will run the actual test model training in parallel

From a64e57492adc84c3cebb32f2fc01293a22d4f83f Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Thu, 26 Nov 2020 15:32:19 +0100
Subject: [PATCH 3/8] fix

---
 tests/models/test_horovod.py | 33 ++++++++-------------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 9af3002fb2571..8c77c2161c12d 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -29,31 +29,20 @@
 from pytorch_lightning import Trainer
 from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator
 from pytorch_lightning.metrics.classification.accuracy import Accuracy
-from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, HOROVOD_AVAILABLE
+from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, HOROVOD_AVAILABLE, _module_available
 from tests.base import EvalModelTemplate
+from tests.base.boring_model import BoringModel
 from tests.base.models import BasicGAN
 
 if HOROVOD_AVAILABLE:
     import horovod
-    from horovod.common.util import nccl_built
-
+    import horovod.torch as hvd
 
+HOROVOD_NCCL_AVAILABLE = _module_available("horovod.common.util.nccl_built")
 # This script will run the actual test model training in parallel
 TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py')
 
 
-def _nccl_available():
-    if not HOROVOD_AVAILABLE:
-        return False
-
-    try:
-        return nccl_built()
-    except AttributeError:
-        # Horovod 0.19.1 nccl_built() does not yet work with Python 3.8:
-        # See: https://github.com/horovod/horovod/issues/1891
-        return False
-
-
 def _run_horovod(trainer_options, on_gpu=False):
     """Execute the training script across multiple workers in parallel."""
     num_processes = trainer_options.get('gpus', 2)
@@ -106,7 +95,7 @@ def test_horovod_cpu_implicit(tmpdir):
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_horovod_multi_gpu(tmpdir):
     """Test Horovod with multi-GPU support."""
@@ -126,7 +115,7 @@
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex")
 def test_horovod_apex(tmpdir):
@@ -150,7 +139,7 @@ def test_horovod_apex(tmpdir):
 
 @pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp")
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="test requires torch.cuda.amp")
 def test_horovod_amp(tmpdir):
@@ -173,7 +162,7 @@ def test_horovod_amp(tmpdir):
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_horovod_transfer_batch_to_gpu(tmpdir):
     class TestTrainingStepModel(EvalModelTemplate):
@@ -252,10 +241,6 @@ def hvd_test_fn():
     path_root = os.path.abspath(os.path.join(path_here, '..', '..'))
     sys.path.insert(0, os.path.abspath(path_root))
 
-    from tests.base.boring_model import BoringModel
-
-    import horovod.torch as hvd
-
     class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):
             self.training_step_called = True
@@ -306,8 +291,6 @@ def sk_metric(preds, target):
     target = torch.randint(high=2, size=(num_batches, batch_size))
 
     def _compute_batch():
-        import horovod.torch as hvd
-
         trainer = Trainer(
             fast_dev_run=True,
             distributed_backend='horovod',
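Patch 3 trades the call-time helper _nccl_available() for a module-level constant, so the @pytest.mark.skipif conditions are evaluated once, when the module is collected. The probe string is suspect, though: nccl_built is a function inside the module horovod.common.util, not a module itself, so a module-existence check on the dotted path cannot succeed. With an importlib-style _module_available (see the sketch after patch 1):

    _module_available("horovod.common.util")             # True whenever horovod is installed
    _module_available("horovod.common.util.nccl_built")  # False: a function, not a module

HOROVOD_NCCL_AVAILABLE therefore comes out False even on NCCL-enabled builds, which is what the next patch corrects.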
multi-GPU machine") @pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex") def test_horovod_apex(tmpdir): @@ -150,7 +139,7 @@ def test_horovod_apex(tmpdir): @pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp") @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support") +@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="test requires torch.cuda.amp") def test_horovod_amp(tmpdir): @@ -173,7 +162,7 @@ def test_horovod_amp(tmpdir): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support") +@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support") @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_horovod_transfer_batch_to_gpu(tmpdir): class TestTrainingStepModel(EvalModelTemplate): @@ -252,10 +241,6 @@ def hvd_test_fn(): path_root = os.path.abspath(os.path.join(path_here, '..', '..')) sys.path.insert(0, os.path.abspath(path_root)) - from tests.base.boring_model import BoringModel - - import horovod.torch as hvd - class TestModel(BoringModel): def training_step(self, batch, batch_idx): self.training_step_called = True @@ -306,8 +291,6 @@ def sk_metric(preds, target): target = torch.randint(high=2, size=(num_batches, batch_size)) def _compute_batch(): - import horovod.torch as hvd - trainer = Trainer( fast_dev_run=True, distributed_backend='horovod', From df159d632a6e4dfca04075e58714703b5d2b408a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 26 Nov 2020 16:25:30 +0100 Subject: [PATCH 4/8] fix --- tests/models/test_horovod.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 8c77c2161c12d..3569683485192 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -38,7 +38,7 @@ import horovod import horovod.torch as hvd -HOROVOD_NCCL_AVAILABLE = _module_available("horovod.common.util.nccl_built") +HOROVOD_NCCL_AVAILABLE = _module_available("horovod.common.util") and hasattr(horovod.common.util, "nccl_built") # This script will run the actual test model training in parallel TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py') From fbb540ba79a9b25caee995207d0ef5b7700cb6fd Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 27 Nov 2020 18:11:50 +0100 Subject: [PATCH 5/8] fix --- tests/models/data/horovod/train_default_model.py | 14 +++++++------- tests/models/test_horovod.py | 10 +++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index a40fa9db545ad..94daaedb4fa63 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -21,19 +21,19 @@ import os import sys -from pytorch_lightning.utilities import HOROVOD_AVAILABLE - -if HOROVOD_AVAILABLE: - import horovod.torch as hvd -else: - print('You requested to import Horovod which is missing or not supported for your OS.') - 
From fbb540ba79a9b25caee995207d0ef5b7700cb6fd Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Fri, 27 Nov 2020 18:11:50 +0100
Subject: [PATCH 5/8] fix

---
 tests/models/data/horovod/train_default_model.py | 14 +++++++-------
 tests/models/test_horovod.py                     | 10 +++++++++-
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index a40fa9db545ad..94daaedb4fa63 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -21,19 +21,19 @@
 import os
 import sys
 
-from pytorch_lightning.utilities import HOROVOD_AVAILABLE
-
-if HOROVOD_AVAILABLE:
-    import horovod.torch as hvd
-else:
-    print('You requested to import Horovod which is missing or not supported for your OS.')
-
 PATH_HERE = os.path.abspath(os.path.dirname(__file__))
 PATH_ROOT = os.path.abspath(os.path.join(PATH_HERE, '..', '..', '..', '..'))
 sys.path.insert(0, os.path.abspath(PATH_ROOT))
 
 from pytorch_lightning import Trainer  # noqa: E402
 from pytorch_lightning.callbacks import ModelCheckpoint  # noqa: E402
+from pytorch_lightning.utilities import HOROVOD_AVAILABLE  # noqa: E402
+
+if HOROVOD_AVAILABLE:
+    import horovod.torch as hvd  # noqa: E402
+else:
+    print('You requested to import Horovod which is missing or not supported for your OS.')
+
 
 # Move project root to the front of the search path, as some imports may have reordered things
 idx = sys.path.index(PATH_ROOT)
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 3569683485192..b4c229d12c4c6 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -38,10 +38,18 @@
     import horovod
     import horovod.torch as hvd
 
-HOROVOD_NCCL_AVAILABLE = _module_available("horovod.common.util") and hasattr(horovod.common.util, "nccl_built")
 # This script will run the actual test model training in parallel
 TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py')
 
+if HOROVOD_AVAILABLE and _module_available("horovod.common.util"):
+    try:
+        from horovod.common.util import nccl_built
+        nccl_built()
+    except AttributeError:
+        HOROVOD_NCCL_AVAILABLE = False
+    finally:
+        HOROVOD_NCCL_AVAILABLE = True
+
 
 def _run_horovod(trainer_options, on_gpu=False):
     """Execute the training script across multiple workers in parallel."""

From 3f00512f20a69892523ecbfe28c1a1416796bab6 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Fri, 27 Nov 2020 18:24:44 +0100
Subject: [PATCH 6/8] fix

---
 tests/models/test_horovod.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index b4c229d12c4c6..3abbd02702fe5 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -41,14 +41,13 @@
 # This script will run the actual test model training in parallel
 TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py')
 
-if HOROVOD_AVAILABLE and _module_available("horovod.common.util"):
-    try:
-        from horovod.common.util import nccl_built
-        nccl_built()
-    except AttributeError:
-        HOROVOD_NCCL_AVAILABLE = False
-    finally:
-        HOROVOD_NCCL_AVAILABLE = True
+try:
+    from horovod.common.util import nccl_built
+    nccl_built()
+except (ImportError, ModuleNotFoundError, AttributeError):
+    HOROVOD_NCCL_AVAILABLE = False
+finally:
+    HOROVOD_NCCL_AVAILABLE = True
 
 
 def _run_horovod(trainer_options, on_gpu=False):
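Patches 5 and 6 restore the call to nccl_built(), but both versions share a pitfall: a finally clause runs on the success path and the failure path alike, so the False assigned in except is immediately overwritten and HOROVOD_NCCL_AVAILABLE always ends up True. (Patch 5's variant has a second problem: when its guarding if is false, the name is never bound at all and the skipif decorators raise NameError. Both variants also discard the return value of nccl_built().) An else clause, which runs only when no exception was raised, expresses the intent:

    try:
        from horovod.common.util import nccl_built
        nccl_built()
    except (ImportError, ModuleNotFoundError, AttributeError):
        HOROVOD_NCCL_AVAILABLE = False
    else:
        # Reached only when the import and the call both succeeded.
        HOROVOD_NCCL_AVAILABLE = True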
From 337078cdcd7cedbb519c73fce2a5aeb5846ae9ce Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Sat, 28 Nov 2020 00:25:11 +0100
Subject: [PATCH 7/8] flake8

---
 tests/trainer/logging_tests/test_train_loop_logging_1_0.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
index be81315562718..18c7b483b8bb5 100644
--- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
+++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
@@ -601,7 +601,6 @@ def on_train_epoch_end(self, trainer, pl_module, outputs):
             self.make_logging(pl_module, 'on_train_epoch_end', 9, on_steps=[False],
                               on_epochs=self.choices, prob_bars=self.choices)
 
-
     class TestModel(BoringModel):
         manual_loss = []
 

From bf8ad48bc59f53b3c2dd98ac0e4c7bae9a2ddc46 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Thu, 3 Dec 2020 21:25:11 +0100
Subject: [PATCH 8/8] flake8

---
 pytorch_lightning/accelerators/accelerator_connector.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py
index a0efd5196db78..9d36f76876a08 100644
--- a/pytorch_lightning/accelerators/accelerator_connector.py
+++ b/pytorch_lightning/accelerators/accelerator_connector.py
@@ -15,16 +15,13 @@
 import os
 
 import torch
 
-from pytorch_lightning.utilities import device_parser, XLA_AVAILABLE, HOROVOD_AVAILABLE
-from pytorch_lightning.utilities import rank_zero_only
-from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities import HOROVOD_AVAILABLE
 from pytorch_lightning import _logger as log
 from pytorch_lightning import accelerators
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment
 from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment
-from pytorch_lightning.utilities import XLA_AVAILABLE, device_parser, rank_zero_only, TPU_AVAILABLE
+from pytorch_lightning.utilities import device_parser, rank_zero_only, TPU_AVAILABLE
 from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
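Taken together, the series converges on a single recipe for optional dependencies. As a generalized sketch, with FOO_AVAILABLE and foo as placeholders rather than names from the patches:

    import pytest

    from pytorch_lightning.utilities import _module_available

    # 1. Compute one flag at import time, next to the other capability flags:
    FOO_AVAILABLE = _module_available("foo")

    # 2. Consumers guard the import instead of wrapping it in try/except:
    if FOO_AVAILABLE:
        import foo

    # 3. Tests skip declaratively on the same flag:
    @pytest.mark.skipif(not FOO_AVAILABLE, reason="test requires foo")
    def test_with_foo(tmpdir):
        ...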