9 changes: 3 additions & 6 deletions pytorch_lightning/accelerators/accelerator_connector.py
@@ -15,21 +15,18 @@
 
 import torch
 
+from pytorch_lightning.utilities import HOROVOD_AVAILABLE
 from pytorch_lightning import _logger as log
 from pytorch_lightning import accelerators
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment
 from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment
-from pytorch_lightning.utilities import XLA_AVAILABLE, device_parser, rank_zero_only, TPU_AVAILABLE
+from pytorch_lightning.utilities import device_parser, rank_zero_only, TPU_AVAILABLE
 from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
-try:
+if HOROVOD_AVAILABLE:
     import horovod.torch as hvd
-except (ModuleNotFoundError, ImportError):
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
 
 
 class AcceleratorConnector:
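The same refactor repeats in each file below: the local `try`/`except` that both imported Horovod and derived a per-module `HOROVOD_AVAILABLE` flag gives way to a single flag computed once in `pytorch_lightning.utilities`, plus a plain guarded import. A minimal sketch of the two idioms, using only names that appear in the diff above:

```python
# Old idiom: every consumer probes for Horovod itself and infers the flag
# from whether the import raises.
try:
    import horovod.torch as hvd
except (ModuleNotFoundError, ImportError):
    HOROVOD_AVAILABLE = False
else:
    HOROVOD_AVAILABLE = True

# New idiom: one shared flag, evaluated once when pytorch_lightning.utilities
# is imported; consumers just guard the import with a plain conditional.
from pytorch_lightning.utilities import HOROVOD_AVAILABLE

if HOROVOD_AVAILABLE:
    import horovod.torch as hvd
```

Centralising the check keeps the flag consistent across modules and drops the repeated exception-handling boilerplate.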
8 changes: 2 additions & 6 deletions pytorch_lightning/accelerators/horovod_accelerator.py
@@ -18,15 +18,11 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
-from pytorch_lightning.utilities import AMPType
+from pytorch_lightning.utilities import AMPType, HOROVOD_AVAILABLE
 from pytorch_lightning.utilities.distributed import rank_zero_only
 
-try:
+if HOROVOD_AVAILABLE:
     import horovod.torch as hvd
-except (ModuleNotFoundError, ImportError):
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
 
 
 class HorovodAccelerator(Accelerator):
2 changes: 1 addition & 1 deletion pytorch_lightning/core/lightning.py
@@ -1670,7 +1670,7 @@ def __get_hparams_assignment_variable(self):
                 line = re.sub(r"\s+", "", line, flags=re.UNICODE)
                 if ".hparams=" in line:
                     return line.split("=")[1]
-        except Exception as e:
+        except Exception:
             return "hparams"
 
         return None
10 changes: 3 additions & 7 deletions pytorch_lightning/trainer/data_loading.py
@@ -16,14 +16,14 @@
 import platform
 from abc import ABC
 from copy import deepcopy
-from typing import Callable, Iterable, List, Optional, Tuple, Union
+from typing import Union, List, Tuple, Callable, Optional, Iterable
 
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.core import LightningModule
-from pytorch_lightning.utilities import TPU_AVAILABLE, rank_zero_warn
+from pytorch_lightning.utilities import rank_zero_warn, TPU_AVAILABLE, HOROVOD_AVAILABLE
 from pytorch_lightning.utilities.data import has_iterable_dataset, has_len
 from pytorch_lightning.utilities.debugging import InternalDebugger
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -32,12 +32,8 @@
 if TPU_AVAILABLE:
     import torch_xla.core.xla_model as xm
 
-try:
+if HOROVOD_AVAILABLE:
     import horovod.torch as hvd
-except (ModuleNotFoundError, ImportError):
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
 
 
 class TrainerDataLoadingMixin(ABC):
1 change: 1 addition & 0 deletions pytorch_lightning/utilities/__init__.py
@@ -47,6 +47,7 @@ def _module_available(module_path: str) -> bool:
 NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast")
 OMEGACONF_AVAILABLE = _module_available("omegaconf")
 HYDRA_AVAILABLE = _module_available("hydra")
+HOROVOD_AVAILABLE = _module_available("horovod.torch")
 
 TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists()
 FAIRSCALE_AVAILABLE = platform.system() != 'Windows' and _module_available('fairscale.nn.data_parallel')
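The new `HOROVOD_AVAILABLE` constant reuses the `_module_available` helper named in the hunk header; the helper's body sits outside this diff. A minimal sketch of what a function with that signature could look like, assuming `importlib.util.find_spec` semantics (sketch only, not the code from this PR):

```python
from importlib.util import find_spec


def _module_available(module_path: str) -> bool:
    """Check whether a dotted module path (e.g. "horovod.torch") is
    importable, without actually importing it. Illustrative sketch."""
    try:
        # find_spec returns None for a missing top-level module and raises
        # ModuleNotFoundError when a parent package is absent.
        return find_spec(module_path) is not None
    except ModuleNotFoundError:
        return False
```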
2 changes: 1 addition & 1 deletion pytorch_lightning/utilities/debugging.py
@@ -87,7 +87,7 @@ def track_load_dataloader_call(self, name, dataloaders):
        for dl in dataloaders:
            try:
                length = len(dl)
-            except Exception as e:
+            except Exception:
                length = -1
            lengths.append(length)
 
13 changes: 7 additions & 6 deletions tests/models/data/horovod/train_default_model.py
@@ -21,18 +21,19 @@
 import os
 import sys
 
-
-try:
-    import horovod.torch as hvd
-except ImportError:
-    print('You requested to import Horovod which is missing or not supported for your OS.')
-
 PATH_HERE = os.path.abspath(os.path.dirname(__file__))
 PATH_ROOT = os.path.abspath(os.path.join(PATH_HERE, '..', '..', '..', '..'))
 sys.path.insert(0, os.path.abspath(PATH_ROOT))
 
 from pytorch_lightning import Trainer  # noqa: E402
 from pytorch_lightning.callbacks import ModelCheckpoint  # noqa: E402
+from pytorch_lightning.utilities import HOROVOD_AVAILABLE  # noqa: E402
+
+if HOROVOD_AVAILABLE:
+    import horovod.torch as hvd  # noqa: E402
+else:
+    print('You requested to import Horovod which is missing or not supported for your OS.')
+
 
 # Move project root to the front of the search path, as some imports may have reordered things
 idx = sys.path.index(PATH_ROOT)
44 changes: 15 additions & 29 deletions tests/models/test_horovod.py
@@ -29,33 +29,25 @@
 from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator
 from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult
 from pytorch_lightning.metrics.classification.accuracy import Accuracy
-from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE
+from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, HOROVOD_AVAILABLE, _module_available
 from tests.base import EvalModelTemplate
 from tests.base.boring_model import BoringModel
 from tests.base.models import BasicGAN
 
-try:
+if HOROVOD_AVAILABLE:
     import horovod
-    from horovod.common.util import nccl_built
-except ImportError:
-    HOROVOD_AVAILABLE = False
-else:
-    HOROVOD_AVAILABLE = True
-
+    import horovod.torch as hvd
 
 # This script will run the actual test model training in parallel
 TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py')
 
 
-def _nccl_available():
-    if not HOROVOD_AVAILABLE:
-        return False
-
-    try:
-        return nccl_built()
-    except AttributeError:
-        # Horovod 0.19.1 nccl_built() does not yet work with Python 3.8:
-        # See: https://github.com/horovod/horovod/issues/1891
-        return False
+try:
+    from horovod.common.util import nccl_built
+    nccl_built()
+except (ImportError, ModuleNotFoundError, AttributeError):
+    HOROVOD_NCCL_AVAILABLE = False
+else:
+    HOROVOD_NCCL_AVAILABLE = True
 
 
 def _run_horovod(trainer_options, on_gpu=False):
@@ -114,7 +106,7 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir):
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_horovod_multi_gpu(tmpdir):
     """Test Horovod with multi-GPU support."""
@@ -134,7 +126,7 @@ def test_horovod_multi_gpu(tmpdir):
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex")
 def test_horovod_apex(tmpdir):
@@ -158,7 +150,7 @@ def test_horovod_apex(tmpdir):
 
 @pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp")
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 @pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="test requires torch.cuda.amp")
 def test_horovod_amp(tmpdir):
@@ -181,7 +173,7 @@ def test_horovod_amp(tmpdir):
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
-@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(not HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_horovod_transfer_batch_to_gpu(tmpdir):
     class TestTrainingStepModel(EvalModelTemplate):
@@ -263,10 +255,6 @@ def hvd_test_fn():
     path_root = os.path.abspath(os.path.join(path_here, '..', '..'))
     sys.path.insert(0, os.path.abspath(path_root))
 
-    import horovod.torch as hvd
-
-    from tests.base.boring_model import BoringModel
-
     class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):
             self.training_step_called = True
@@ -318,8 +306,6 @@ def sk_metric(preds, target):
     target = torch.randint(high=2, size=(num_batches, batch_size))
 
     def _compute_batch():
-        import horovod.torch as hvd
-
         trainer = Trainer(
             fast_dev_run=True,
             distributed_backend='horovod',
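The `HOROVOD_NCCL_AVAILABLE` block above illustrates a second idiom: when mere importability is not enough, call the cheapest function that exercises the capability once at import time and record whether it raised, so `pytest.mark.skipif` can consume a plain boolean at collection time. A generic sketch of that probe pattern (the helper name is illustrative, not part of this PR):

```python
def _capability_available(probe) -> bool:
    """Return True when ``probe()`` runs without raising.

    Mirrors the module-level NCCL check above; illustrative only."""
    try:
        probe()
    except Exception:
        return False
    return True
```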