Merge branch 'master' into feature/oss-state-outside-ddp

awaelchli committed Aug 26, 2022
2 parents 391700f + a01e016 commit fd4cab8
Showing 17 changed files with 213 additions and 113 deletions.
14 changes: 9 additions & 5 deletions .azure/gpu-tests.yml
@@ -44,7 +44,7 @@ jobs:

- bash: |
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}')
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
echo $CHANGED_FILES > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES
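Note: this hunk only adds the Azure GPU pipeline files themselves (`.azure/gpu_*`) to the path filter, so editing the pipeline also triggers it. For readers unfamiliar with the pattern, here is a rough Python sketch of the skip logic; the changed-file list below is hypothetical:

```python
import re

# Same extended-regex filter as the updated Azure step above.
FILTER = r".azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*"

# Hypothetical stand-in for `git diff --name-status origin/master | awk '{print $2}'`.
changed_files = [
    "docs/source-app/quickstart.rst",
    "src/pytorch_lightning/trainer/trainer.py",
]

# Mirrors `grep -E $FILTER`: the GPU job only proceeds when at least one path matches.
matches = [path for path in changed_files if re.search(FILTER, path)]
print("continue" if matches else "skip GPU tests")
```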
@@ -72,11 +72,15 @@ jobs:
set -e
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)"
TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
pip install -e .[strategies]
pip install -U deepspeed # TODO: remove when docker images are upgraded
pip install --requirement requirements/pytorch/devel.txt
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0"
pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip list
env:
PACKAGE_NAME: pytorch
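The notable change in this step is that the Bagua wheel is no longer installed for the exact local CUDA version; instead the nearest supported build that does not exceed it is selected. A standalone sketch of that selection, assuming the same supported-version list as in the step above:

```python
# Newest-first list of CUDA versions that bagua ships wheels for (from the step above).
SUPPORTED_BAGUA_CUDA = [115, 113, 111, 102]

def pick_bagua_cuda(cuda_version_mm: int) -> int:
    # Same list comprehension as the one-liner in the pipeline: take the first
    # supported version that the local CUDA version is greater than or equal to.
    return [ver for ver in SUPPORTED_BAGUA_CUDA if cuda_version_mm >= ver][0]

assert pick_bagua_cuda(116) == 115  # e.g. CUDA 11.6 falls back to the cu115 wheel
assert pick_bagua_cuda(113) == 113
print(f'pip install "bagua-cuda{pick_bagua_cuda(116)}>=0.9.0"')
```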
41 changes: 28 additions & 13 deletions .github/checkgroup.yml
@@ -1,20 +1,14 @@
custom_service_name: "Lightning CI required checker"
# For security reasons, configuration is only loaded from the repository's default branch,
# changes made in pull requests from different branches or forks are ignored. This means that changes to this file
# will only be used after they are merged.
subprojects:
- id: "CI: CircleCI"
paths:
- ".circleci/**"
checks:
- "test-on-tpus"

- id: "CI: Azure"
paths:
- ".azure/**"
checks:
- "pytorch-lightning (GPUs)"
- "pytorch-lightning (GPUs) (testing PyTorch - stable)"
- "pytorch-lightning (HPUs)"
- "pytorch-lightning (IPUs)"

- id: "pytorch_lightning"
paths:
# not all examples need to be added because they aren't used in CI, but these ones are
@@ -52,14 +46,32 @@ subprojects:
- "mypy"
- "PR Gatekeeper (pytorch)"
- "pytorch-lightning (GPUs)"
- "pytorch-lightning (GPUs) (testing PyTorch - stable)"
- "pytorch-lightning (HPUs)"
- "pytorch-lightning (IPUs)"
- "slow (macOS-11, 3.7, 1.11)"
- "slow (ubuntu-20.04, 3.7, 1.11)"
- "slow (windows-2022, 3.7, 1.11)"
- "test-on-tpus"

- id: "pytorch_lightning: Azure GPU"
paths:
- ".azure/gpu-tests.yml"
- "tests/tests_pytorch/run_standalone_*.sh"
checks:
- "pytorch-lightning (GPUs)"

- id: "pytorch_lightning: Azure HPU"
paths:
- ".azure/hpu-tests.yml"
checks:
- "pytorch-lightning (HPUs)"

- id: "pytorch_lightning: Azure IPU"
paths:
- ".azure/ipu-tests.yml"
checks:
- "pytorch-lightning (IPUs)"

- id: "pytorch_lightning: Docs"
paths:
- "docs/source-pytorch/**"
@@ -73,7 +85,6 @@ subprojects:
- id: "pytorch_lightning: Docker"
paths:
- "dockers/**"
- "!dockers/README.md"
- "requirements.txt"
- "requirements/*.txt"
- "requirements/pytorch/*"
@@ -108,12 +119,10 @@ subprojects:

- id: "lightning_app"
paths:
- ".azure/app-cloud-e2e.yml"
- "requirements/app/**"
- "src/lightning_app/**"
- "tests/tests_app/**"
- "tests/tests_app_examples/**"
- "tests/tests_clusters/**"
# the examples are used in the app CI
- "examples/app_*"
checks:
@@ -127,6 +136,12 @@ subprojects:
- "pytest (windows-2022, 3.8, latest)"
- "pytest (windows-2022, 3.8, oldest)"

- id: "lightning_app: Azure"
paths:
- ".azure/app-cloud-e2e.yml"
checks:
- "App.cloud-e2e"

- id: "lightning_app: Docs"
paths:
- "docs/source-app/**"
15 changes: 15 additions & 0 deletions .github/workflows/probot-check-group.yml
@@ -0,0 +1,15 @@
name: Probot

on:
check_run: {}
pull_request: {}
issue_comment: {types: [created]}

jobs:
required-jobs:
runs-on: ubuntu-latest
if: github.event_name != 'issue_comment' || contains(github.event.comment.body, '@probot pls')
steps:
- uses: carmocca/probot@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
19 changes: 11 additions & 8 deletions dockers/base-conda/Dockerfile
@@ -34,6 +34,10 @@ RUN \
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update -qq --fix-missing && \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
MAX_ALLOWED_NCCL=2.11.4 && \
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
apt-get install -y --no-install-recommends \
build-essential \
cmake \
@@ -42,17 +46,15 @@ RUN \
curl \
unzip \
ca-certificates \
libopenmpi-dev

RUN \
libopenmpi-dev \
libnccl2=$TO_INSTALL_NCCL \
libnccl-dev=$TO_INSTALL_NCCL && \
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh

RUN \
rm ~/miniconda.sh && \
# Cleaning
apt-get autoremove -y && \
apt-get clean && \
@@ -141,8 +143,9 @@ RUN \
RUN \
# install Bagua
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \
python -c "import bagua_core; bagua_core.install_deps()" && \
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \
pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \
if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \
python -c "import bagua; print(bagua.__version__)"

RUN \
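The NCCL pin added above installs whichever is lower of the image's pre-installed NCCL and the 2.11.4 ceiling (that is what `sort -V | head -n1` computes), then appends the Debian revision and CUDA suffix; the same change appears in dockers/base-cuda/Dockerfile below. A rough Python equivalent, assuming the `packaging` library and hypothetical version inputs:

```python
from packaging.version import Version

def nccl_pin(installed_nccl: str, cuda_version: str, max_allowed: str = "2.11.4") -> str:
    # `sort -V | head -n1` keeps the lower of the two version strings.
    pinned = min(Version(installed_nccl), Version(max_allowed))
    cuda_mm = ".".join(cuda_version.split(".")[:2])  # "${CUDA_VERSION%.*}", e.g. "11.3.1" -> "11.3"
    return f"{pinned}-1+cuda{cuda_mm}"

# Hypothetical: the base image ships NCCL 2.12.10, so it gets pinned down to the 2.11.4 build.
print(nccl_pin("2.12.10", "11.3.1"))  # -> 2.11.4-1+cuda11.3
```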
23 changes: 7 additions & 16 deletions dockers/base-cuda/Dockerfile
@@ -37,7 +37,11 @@ RUN \
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update -qq --fix-missing && \
apt-get install -y --no-install-recommends \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
MAX_ALLOWED_NCCL=2.11.4 && \
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
build-essential \
pkg-config \
cmake \
@@ -50,19 +54,17 @@ RUN \
libopenmpi-dev \
openmpi-bin \
ssh \
&& \

libnccl2=$TO_INSTALL_NCCL \
libnccl-dev=$TO_INSTALL_NCCL && \
# Install python
add-apt-repository ppa:deadsnakes/ppa && \
apt-get install -y \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-distutils \
python${PYTHON_VERSION}-dev \
&& \

update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \

# Cleaning
apt-get autoremove -y && \
apt-get clean && \
@@ -78,7 +80,6 @@ RUN \
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
python${PYTHON_VERSION} get-pip.py && \
rm get-pip.py && \

pip install -q fire && \
# Disable cache \
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
@@ -91,16 +92,6 @@ RUN \
pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
rm assistant.py

RUN \
apt-get purge -y cmake && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
tar -zxvf cmake-3.20.2.tar.gz && \
cd cmake-3.20.2 && \
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
make && \
make install && \
cmake --version

ENV \
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
HOROVOD_GPU_OPERATIONS=NCCL \
1 change: 1 addition & 0 deletions dockers/tpu-tests/tpu_test_cases.jsonnet
@@ -37,6 +37,7 @@ local tputests = base.BaseTest {
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
export PL_RUN_TPU_TESTS=1
cd tests/tests_pytorch
set -e
coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
echo "\n||| Running standalone tests |||\n"
bash run_standalone_tests.sh
34 changes: 34 additions & 0 deletions docs/source-pytorch/accelerators/hpu_basic.rst
@@ -47,6 +47,40 @@ It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy`
----

Scale-out on Gaudis
-------------------

To train a Lightning model across multiple HPU nodes, set the ``num_nodes`` parameter of the ``Trainer`` class to the number of available nodes.

.. code-block:: python

    trainer = Trainer(accelerator="hpu", devices=8, strategy="hpu_parallel", num_nodes=2)

In addition to this, the following environment variables need to be set to establish communication across nodes. Check out the documentation on :doc:`Cluster Environment <../clouds/cluster>` for more details.

- *MASTER_PORT* - required; has to be a free port on the machine with NODE_RANK 0
- *MASTER_ADDR* - required (except for NODE_RANK 0); address of the NODE_RANK 0 node
- *WORLD_SIZE* - required; how many workers are in the cluster
- *NODE_RANK* - required; id of the node in the cluster

The trainer needs to be instantiated on every node participating in the training.

On Node 1:

.. code-block:: bash

    MASTER_ADDR=<MASTER_ADDR> MASTER_PORT=<MASTER_PORT> NODE_RANK=0 WORLD_SIZE=16
        python -m some_model_trainer.py (--arg1 ... train script args...)

On Node 2:

.. code-block:: bash

    MASTER_ADDR=<MASTER_ADDR> MASTER_PORT=<MASTER_PORT> NODE_RANK=1 WORLD_SIZE=16
        python -m some_model_trainer.py (--arg1 ... train script args...)

----

Select Gaudis automatically
---------------------------

1 change: 1 addition & 0 deletions requirements.txt
@@ -1,2 +1,3 @@
# the default package dependencies
-r ./requirements/app/base.txt
-r ./requirements/pytorch/base.txt
26 changes: 0 additions & 26 deletions src/lightning_app/CHANGELOG.md
@@ -9,49 +9,23 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added

- Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602))


- Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835))


- Add support to see Lightning AI BYOC cluster logs ([#14334](https://github.com/Lightning-AI/lightning/pull/14334))


- Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894))


- Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987))


- Adds `LightningTrainingComponent`. `LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830))


- Add support for printing application logs using CLI `lightning show logs <app_name> [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634))


- Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945))


- Added a warning when `configure_layout` returns URLs configured with http instead of https ([#14233](https://github.com/Lightning-AI/lightning/pull/14233))


### Changed

- Default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132))


### Changed

-


- Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076))

### Deprecated

-


### Fixed

- Unification of app template: moved `app.py` to root dir for `lightning init app <app_name>` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853))
2 changes: 1 addition & 1 deletion src/lightning_app/__version__.py
@@ -1 +1 @@
version = "0.6.0dev"
version = "0.6.0rc0"
7 changes: 6 additions & 1 deletion src/pytorch_lightning/CHANGELOG.md
@@ -30,7 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Replaced the unwrapping logic in strategies with direct access to unwrapped `LightningModule` ([#13738](https://github.com/Lightning-AI/lightning/pull/13738))


- Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([14023](https://github.com/Lightning-AI/lightning/pull/14023))
- Enabled `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` ([#14023](https://github.com/Lightning-AI/lightning/pull/14023))

- Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384))



@@ -85,6 +87,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296))


- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806))


## [1.7.2] - 2022-08-17

### Added
13 changes: 8 additions & 5 deletions src/pytorch_lightning/utilities/parsing.py
@@ -321,14 +321,17 @@ def _lightning_get_all_attr_holders(model: "pl.LightningModule", attribute: str)
holders.append(model)

# Check if attribute in model.hparams, either namespace or dict
if hasattr(model, "hparams"):
if attribute in model.hparams:
holders.append(model.hparams)
if hasattr(model, "hparams") and attribute in model.hparams:
holders.append(model.hparams)

trainer = model._trainer
# Check if the attribute in datamodule (datamodule gets registered in Trainer)
if trainer is not None and trainer.datamodule is not None and hasattr(trainer.datamodule, attribute):
holders.append(trainer.datamodule)
if trainer is not None and trainer.datamodule is not None:
if hasattr(trainer.datamodule, attribute):
holders.append(trainer.datamodule)

if hasattr(trainer.datamodule, "hparams") and attribute in trainer.datamodule.hparams:
holders.append(trainer.datamodule.hparams)

return holders

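With the parsing change above, attribute lookups now also search the datamodule's hparams, in addition to the module, the module's hparams, and the datamodule itself. A minimal self-contained sketch of the resulting lookup order; the objects below are hypothetical stand-ins, not Lightning classes:

```python
from types import SimpleNamespace

# Hypothetical stand-ins mirroring the structure the helper walks.
datamodule = SimpleNamespace(hparams={"batch_size": 8})
trainer = SimpleNamespace(datamodule=datamodule)
model = SimpleNamespace(hparams={}, _trainer=trainer)

def find_holders(model, attribute):
    """Same lookup order as _lightning_get_all_attr_holders after this change."""
    holders = []
    if hasattr(model, attribute):
        holders.append(model)
    if hasattr(model, "hparams") and attribute in model.hparams:
        holders.append(model.hparams)
    dm = model._trainer.datamodule if model._trainer is not None else None
    if dm is not None:
        if hasattr(dm, attribute):
            holders.append(dm)
        # New in this diff: the datamodule's hparams are searched as well.
        if hasattr(dm, "hparams") and attribute in dm.hparams:
            holders.append(dm.hparams)
    return holders

print(find_holders(model, "batch_size"))  # -> [{'batch_size': 8}]
```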
