From 1ae14ca7542e83db47888fd0d68324449014dfc0 Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Thu, 25 Aug 2022 19:30:06 +0200 Subject: [PATCH 1/3] [CI] fix horovod tests (#14382) --- .azure/gpu-tests.yml | 11 +++++++---- dockers/base-conda/Dockerfile | 14 ++++++++------ dockers/base-cuda/Dockerfile | 23 +++++++---------------- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f19c5bafc7814..2da30c0dd66ab 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES @@ -72,12 +72,15 @@ jobs: set -e python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" + TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0" - pip install -e .[strategies] - pip install -U deepspeed # TODO: remove when docker images are upgraded - pip install --requirement requirements/pytorch/devel.txt + pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html pip list env: PACKAGE_NAME: pytorch diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index d6bfeee90d561..9bb75e34b8ff6 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -34,6 +34,10 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ + NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ + CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ + MAX_ALLOWED_NCCL=2.11.4 && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ apt-get install -y --no-install-recommends \ build-essential \ cmake \ @@ -42,17 +46,15 @@ RUN \ curl \ unzip \ ca-certificates \ - libopenmpi-dev - -RUN \ + libopenmpi-dev \ + libnccl2=$TO_INSTALL_NCCL \ + libnccl-dev=$TO_INSTALL_NCCL && \ # Install conda and python. # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ - rm ~/miniconda.sh - -RUN \ + rm ~/miniconda.sh && \ # Cleaning apt-get autoremove -y && \ apt-get clean && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index be613f3b6415f..08692ff00ab78 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -37,7 +37,11 @@ RUN \ # https://github.com/NVIDIA/nvidia-docker/issues/1631 apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ apt-get update -qq --fix-missing && \ - apt-get install -y --no-install-recommends \ + NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ + CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ + MAX_ALLOWED_NCCL=2.11.4 && \ + TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ + apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ cmake \ @@ -50,8 +54,8 @@ RUN \ libopenmpi-dev \ openmpi-bin \ ssh \ - && \ - + libnccl2=$TO_INSTALL_NCCL \ + libnccl-dev=$TO_INSTALL_NCCL && \ # Install python add-apt-repository ppa:deadsnakes/ppa && \ apt-get install -y \ @@ -59,10 +63,8 @@ RUN \ python${PYTHON_VERSION}-distutils \ python${PYTHON_VERSION}-dev \ && \ - update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \ - # Cleaning apt-get autoremove -y && \ apt-get clean && \ @@ -78,7 +80,6 @@ RUN \ wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ python${PYTHON_VERSION} get-pip.py && \ rm get-pip.py && \ - pip install -q fire && \ # Disable cache \ CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ @@ -91,16 +92,6 @@ RUN \ pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \ rm assistant.py -RUN \ - apt-get purge -y cmake && \ - wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \ - tar -zxvf cmake-3.20.2.tar.gz && \ - cd cmake-3.20.2 && \ - ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \ - make && \ - make install && \ - cmake --version - ENV \ HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ HOROVOD_GPU_OPERATIONS=NCCL \ From 807435885ea265580fee9f4e69c063eace46def2 Mon Sep 17 00:00:00 2001 From: Tanmoy Date: Fri, 26 Aug 2022 00:27:48 +0530 Subject: [PATCH 2/3] Fix `LightningDataModule` hparams parsing (#12806) Co-authored-by: Akihiro Nitta Co-authored-by: Jirka Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/utilities/parsing.py | 13 ++-- .../tuner/test_scale_batch_size.py | 69 ++++++++++++------- tests/tests_pytorch/utilities/test_parsing.py | 22 ++++-- 4 files changed, 71 insertions(+), 36 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 07c34bbc0e579..642cb28d4db4c 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) +- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) + + ## [1.7.2] - 2022-08-17 ### Added diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py index 073423ab60773..22dfb538828ab 100644 --- a/src/pytorch_lightning/utilities/parsing.py +++ b/src/pytorch_lightning/utilities/parsing.py @@ -321,14 +321,17 @@ def _lightning_get_all_attr_holders(model: "pl.LightningModule", attribute: str) holders.append(model) # Check if attribute in model.hparams, either namespace or dict - if hasattr(model, "hparams"): - if attribute in model.hparams: - holders.append(model.hparams) + if hasattr(model, "hparams") and attribute in model.hparams: + holders.append(model.hparams) trainer = model._trainer # Check if the attribute in datamodule (datamodule gets registered in Trainer) - if trainer is not None and trainer.datamodule is not None and hasattr(trainer.datamodule, attribute): - holders.append(trainer.datamodule) + if trainer is not None and trainer.datamodule is not None: + if hasattr(trainer.datamodule, attribute): + holders.append(trainer.datamodule) + + if hasattr(trainer.datamodule, "hparams") and attribute in trainer.datamodule.hparams: + holders.append(trainer.datamodule.hparams) return holders diff --git a/tests/tests_pytorch/tuner/test_scale_batch_size.py b/tests/tests_pytorch/tuner/test_scale_batch_size.py index d2fc8a61e0107..ce7c3613f5012 100644 --- a/tests/tests_pytorch/tuner/test_scale_batch_size.py +++ b/tests/tests_pytorch/tuner/test_scale_batch_size.py @@ -29,8 +29,8 @@ class BatchSizeDataModule(BoringDataModule): - def __init__(self, batch_size): - super().__init__() + def __init__(self, data_dir, batch_size): + super().__init__(data_dir) if batch_size is not None: self.batch_size = batch_size @@ -58,7 +58,7 @@ def test_scale_batch_size_method_with_model_or_datamodule(tmpdir, model_bs, dm_b tuner = Tuner(trainer) model = BatchSizeModel(model_bs) - datamodule = BatchSizeDataModule(dm_bs) if dm_bs != -1 else None + datamodule = BatchSizeDataModule(tmpdir, dm_bs) if dm_bs != -1 else None new_batch_size = tuner.scale_batch_size(model, mode="binsearch", init_val=4, max_trials=2, datamodule=datamodule) assert new_batch_size == 16 @@ -140,47 +140,64 @@ def test_auto_scale_batch_size_trainer_arg(tmpdir, scale_arg): assert not os.path.exists(tmpdir / "scale_batch_size_temp_model.ckpt") -@RunIf(min_cuda_gpus=1) @pytest.mark.parametrize("use_hparams", [True, False]) def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams): - """Test that new batch size gets written to the correct hyperparameter attribute.""" + """Test that new batch size gets written to the correct hyperparameter attribute for model.""" tutils.reset_seed() hparams = {"batch_size": 2} - before_batch_size = hparams.get("batch_size") + before_batch_size = hparams["batch_size"] - class HparamsBatchSizeModel(BatchSizeModel): + class HparamsBatchSizeModel(BoringModel): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__() self.save_hyperparameters() - def dataloader(self, *args, **kwargs): - # artificially set batch_size so we can get a dataloader - # remove it immediately after, because we want only self.hparams.batch_size - setattr(self, "batch_size", before_batch_size) - dataloader = super().dataloader(*args, **kwargs) - del self.batch_size - return dataloader + def train_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) + + def val_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) + + model_class = HparamsBatchSizeModel if use_hparams else BatchSizeModel + model = model_class(**hparams) + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True) + trainer.tune(model, scale_batch_size_kwargs={"steps_per_trial": 2, "max_trials": 4}) + after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size + assert before_batch_size != after_batch_size + assert after_batch_size <= len(trainer.train_dataloader.dataset) + + +@pytest.mark.parametrize("use_hparams", [True, False]) +def test_auto_scale_batch_size_set_datamodule_attribute(tmpdir, use_hparams): + """Test that new batch size gets written to the correct hyperparameter attribute for datamodule.""" + tutils.reset_seed() + + hparams = {"batch_size": 2} + before_batch_size = hparams["batch_size"] class HparamsBatchSizeDataModule(BoringDataModule): def __init__(self, data_dir, batch_size): super().__init__(data_dir) - self.batch_size = batch_size + self.save_hyperparameters() def train_dataloader(self): - return DataLoader(self.random_train, batch_size=self.batch_size) + return DataLoader(self.random_train, batch_size=self.hparams.batch_size) - datamodule_fit = HparamsBatchSizeDataModule(data_dir=tmpdir, batch_size=before_batch_size) - model_class = HparamsBatchSizeModel if use_hparams else BatchSizeModel - model = model_class(**hparams) + def val_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size) - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True, accelerator="gpu", devices=1) - trainer.tune(model, datamodule_fit) - after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size - assert trainer.datamodule == datamodule_fit - assert before_batch_size != after_batch_size + datamodule_class = HparamsBatchSizeDataModule if use_hparams else BatchSizeDataModule + datamodule = datamodule_class(data_dir=tmpdir, batch_size=before_batch_size) + model = BatchSizeModel(**hparams) + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_scale_batch_size=True) + trainer.tune(model, datamodule=datamodule, scale_batch_size_kwargs={"steps_per_trial": 2, "max_trials": 4}) + after_batch_size = datamodule.hparams.batch_size if use_hparams else datamodule.batch_size + assert trainer.datamodule == datamodule + assert before_batch_size < after_batch_size assert after_batch_size <= len(trainer.train_dataloader.dataset) - assert datamodule_fit.batch_size == after_batch_size def test_auto_scale_batch_size_duplicate_attribute_warning(tmpdir): diff --git a/tests/tests_pytorch/utilities/test_parsing.py b/tests/tests_pytorch/utilities/test_parsing.py index e918c9df2ac32..98b00a374d778 100644 --- a/tests/tests_pytorch/utilities/test_parsing.py +++ b/tests/tests_pytorch/utilities/test_parsing.py @@ -64,8 +64,8 @@ class TestModel4(LightningModule): # fail case batch_size = 1 model4 = TestModel4() - trainer = Trainer() + model4.trainer = trainer datamodule = LightningDataModule() datamodule.batch_size = 8 trainer.datamodule = datamodule @@ -87,12 +87,21 @@ class TestModel7(LightningModule): # test for datamodule w/ hparams w/ attribut model7 = TestModel7() model7.trainer = trainer - return model1, model2, model3, model4, model5, model6, model7 + class TestDataModule8(LightningDataModule): # test for hparams dict + hparams = TestHparamsDict2 + + model8 = TestModel1() + trainer = Trainer() + model8.trainer = trainer + datamodule = TestDataModule8() + trainer.datamodule = datamodule + + return model1, model2, model3, model4, model5, model6, model7, model8 def test_lightning_hasattr(): """Test that the lightning_hasattr works in all cases.""" - model1, model2, model3, model4, model5, model6, model7 = models = model_cases() + model1, model2, model3, model4, model5, model6, model7, model8 = models = model_cases() assert lightning_hasattr(model1, "learning_rate"), "lightning_hasattr failed to find namespace variable" assert lightning_hasattr(model2, "learning_rate"), "lightning_hasattr failed to find hparams namespace variable" assert lightning_hasattr(model3, "learning_rate"), "lightning_hasattr failed to find hparams dict variable" @@ -104,6 +113,7 @@ def test_lightning_hasattr(): assert lightning_hasattr( model7, "batch_size" ), "lightning_hasattr failed to find batch_size in hparams w/ datamodule present" + assert lightning_hasattr(model8, "batch_size") for m in models: assert not lightning_hasattr(m, "this_attr_not_exist") @@ -116,10 +126,11 @@ def test_lightning_getattr(): value = lightning_getattr(m, "learning_rate") assert value == i, "attribute not correctly extracted" - model5, model6, model7 = models[4:] + model5, model6, model7, model8 = models[4:] assert lightning_getattr(model5, "batch_size") == 8, "batch_size not correctly extracted" assert lightning_getattr(model6, "batch_size") == 8, "batch_size not correctly extracted" assert lightning_getattr(model7, "batch_size") == 8, "batch_size not correctly extracted" + assert lightning_getattr(model8, "batch_size") == 2, "batch_size not correctly extracted" for m in models: with pytest.raises( @@ -136,13 +147,14 @@ def test_lightning_setattr(tmpdir): lightning_setattr(m, "learning_rate", 10) assert lightning_getattr(m, "learning_rate") == 10, "attribute not correctly set" - model5, model6, model7 = models[4:] + model5, model6, model7, model8 = models[4:] lightning_setattr(model5, "batch_size", 128) lightning_setattr(model6, "batch_size", 128) lightning_setattr(model7, "batch_size", 128) assert lightning_getattr(model5, "batch_size") == 128, "batch_size not correctly set" assert lightning_getattr(model6, "batch_size") == 128, "batch_size not correctly set" assert lightning_getattr(model7, "batch_size") == 128, "batch_size not correctly set" + assert lightning_getattr(model8, "batch_size") == 128, "batch_size not correctly set" for m in models: with pytest.raises( From 33a5ed98794943b7eb6c7fcfa078b184c9d4d736 Mon Sep 17 00:00:00 2001 From: Anner Date: Fri, 26 Aug 2022 06:26:00 +0100 Subject: [PATCH 3/3] Add torch.cuda rng state to seed save/load (#14384) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 4 +++- src/pytorch_lightning/utilities/seed.py | 15 ++++++++++++--- tests/tests_pytorch/utilities/test_seed.py | 20 +++++++++++++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 642cb28d4db4c..ac7e68d177fbe 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -27,7 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Replaced the unwrapping logic in strategies with direct access to unwrapped `LightningModule` ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) -- Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([14023](https://github.com/Lightning-AI/lightning/pull/14023)) +- Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([#14023](https://github.com/Lightning-AI/lightning/pull/14023)) + +- Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384)) diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py index 8fce6a1debfcf..925337c7845ae 100644 --- a/src/pytorch_lightning/utilities/seed.py +++ b/src/pytorch_lightning/utilities/seed.py @@ -121,13 +121,22 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: def _collect_rng_states() -> Dict[str, Any]: - """Collect the global random state of :mod:`torch`, :mod:`numpy` and Python.""" - return {"torch": torch.get_rng_state(), "numpy": np.random.get_state(), "python": python_get_rng_state()} + """Collect the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python.""" + return { + "torch": torch.get_rng_state(), + "torch.cuda": torch.cuda.get_rng_state_all(), + "numpy": np.random.get_state(), + "python": python_get_rng_state(), + } def _set_rng_states(rng_state_dict: Dict[str, Any]) -> None: - """Set the global random state of :mod:`torch`, :mod:`numpy` and Python in the current process.""" + """Set the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python in the current + process.""" torch.set_rng_state(rng_state_dict["torch"]) + # torch.cuda rng_state is only included since v1.8. + if "torch.cuda" in rng_state_dict: + torch.cuda.set_rng_state_all(rng_state_dict["torch.cuda"]) np.random.set_state(rng_state_dict["numpy"]) version, state, gauss = rng_state_dict["python"] python_set_rng_state((version, tuple(state), gauss)) diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py index 6908badf1a037..c8df824e93b41 100644 --- a/tests/tests_pytorch/utilities/test_seed.py +++ b/tests/tests_pytorch/utilities/test_seed.py @@ -9,7 +9,7 @@ import torch import pytorch_lightning.utilities.seed as seed_utils -from pytorch_lightning.utilities.seed import isolate_rng +from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states, isolate_rng @mock.patch.dict(os.environ, {}, clear=True) @@ -87,6 +87,13 @@ def test_isolate_rng(): generated = [torch.rand(2) for _ in range(3)] assert torch.equal(torch.rand(2), generated[0]) + # torch.cuda + if torch.cuda.is_available(): + torch.cuda.FloatTensor(1).normal_() + with isolate_rng(): + generated = [torch.cuda.FloatTensor(2).normal_() for _ in range(3)] + assert torch.equal(torch.cuda.FloatTensor(2).normal_(), generated[0]) + # numpy np.random.rand(1) with isolate_rng(): @@ -100,6 +107,17 @@ def test_isolate_rng(): assert random.random() == generated[0] +def test_backward_compatibility_rng_states_dict(): + """Test that an older rng_states_dict without the "torch.cuda" key does not crash. + + This test is only relevant when torch.cuda is available. + """ + states = _collect_rng_states() + assert "torch.cuda" in states + states.pop("torch.cuda") + _set_rng_states(states) + + @mock.patch("pytorch_lightning.utilities.seed.log.info") @pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}]) def test_seed_everything_log_info(log_mock: MagicMock, env_vars: Mapping[str, str]):