From 8aa920cb7bc11d8157d385d167341f019fd47550 Mon Sep 17 00:00:00 2001 From: otaj Date: Mon, 22 Aug 2022 16:29:54 +0200 Subject: [PATCH 01/12] bump cuda in docker images to 11.6.1 --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 2 +- .github/checkgroup.yml | 12 ++++++------ .github/workflows/ci-pytorch-dockers.yml | 12 ++++++------ .github/workflows/release-docker.yml | 6 +++--- dockers/README.md | 4 ++-- dockers/base-conda/Dockerfile | 2 +- dockers/base-cuda/Dockerfile | 2 +- dockers/release/Dockerfile | 2 +- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index 0de590f2c54a6..968186fbd275d 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 683212cd55d4b..d3fb42d33d278 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index c2654eddd7ca1..e855e661b2b66 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -86,16 +86,16 @@ subprojects: - "build-conda (3.9, 1.11)" - "build-conda (3.9, 1.12)" - "build-cuda (3.8, 1.9, 11.1.1)" - - "build-cuda (3.9, 1.10, 11.3.1)" - - "build-cuda (3.9, 1.11, 11.3.1)" - - "build-cuda (3.9, 1.12, 11.3.1)" + - "build-cuda (3.9, 1.10, 11.6.1)" + - "build-cuda (3.9, 1.11, 11.6.1)" + - "build-cuda (3.9, 1.12, 11.6.1)" - "build-cuda (3.9, 1.9, 11.1.1)" - "build-hpu (1.5.0, 1.11.0)" - "build-ipu (3.9, 1.9)" - "build-NGC" - - "build-pl (3.9, 1.10, 11.3.1)" - - "build-pl (3.9, 1.11, 11.3.1)" - - "build-pl (3.9, 1.12, 11.3.1)" + - "build-pl (3.9, 1.10, 11.6.1)" + - "build-pl (3.9, 1.11, 11.6.1)" + - "build-pl (3.9, 1.12, 11.6.1)" - "build-pl (3.9, 1.9, 11.1.1)" - "build-xla (3.7, 1.12)" diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index a05dbbb5bc8ef..e2b88157fa0b3 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -34,9 +34,9 @@ jobs: # We only release one docker image per PyTorch version. # The matrix here is the same as the one in release-docker.yml. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -94,9 +94,9 @@ jobs: # These are the base images for PL release docker images, # so include at least all of the combinations in release-dockers.yml. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} # Used in Lightning-AI/tutorials - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 6901a24204683..68cfe735d00ae 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -17,9 +17,9 @@ jobs: include: # We only release one docker image per PyTorch version. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockers/README.md b/dockers/README.md index b1ff9826b6c1f..551320ffae9f0 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -11,7 +11,7 @@ git clone https://github.com/Lightning-AI/lightning.git docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . # build with specific arguments -docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 . +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.6.1 . ``` To run your docker use @@ -45,7 +45,7 @@ sudo systemctl restart docker and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.6.1 ``` ## Run Jupyter server diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 0a7c8884974c0..914fc341694c1 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION=11.6.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 01372574e4618..adb2fdca7501f 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. ARG UBUNTU_VERSION=20.04 -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION=11.6.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index c39e66509188c..d0db53236b729 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -14,7 +14,7 @@ ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.11 -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION=11.6.1 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} From 602bfce224cf22e24421448887844937e0aff9f0 Mon Sep 17 00:00:00 2001 From: otaj Date: Mon, 22 Aug 2022 16:40:29 +0200 Subject: [PATCH 02/12] PUSH TO HUB. REVERT THIS! --- .github/workflows/ci-pytorch-dockers.yml | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index e2b88157fa0b3..7d2596d310868 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -1,28 +1,14 @@ name: Docker on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - paths: - - "dockers/**" - - "!dockers/README.md" - - "requirements.txt" - - "requirements/*.txt" - - "requirements/pytorch/*" - - "environment.yml" - - ".github/workflows/*docker*.yml" - - "setup.py" - schedule: - - cron: "0 0 * * *" # at the end of every day + pull_request: {} concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }} cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} env: - PUSH_TO_HUB: ${{ github.event_name == 'schedule' }} + PUSH_TO_HUB: true jobs: build-pl: From 25f148b50ada7ab5ded18400e1b4a26c1b96a517 Mon Sep 17 00:00:00 2001 From: otaj Date: Mon, 22 Aug 2022 16:51:13 +0200 Subject: [PATCH 03/12] conda forge for 11.6 --- dockers/base-conda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 914fc341694c1..1935222ccfb07 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -75,7 +75,7 @@ RUN \ conda update -n base -c defaults conda && \ conda create -y --name $CONDA_ENV \ python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \ - -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ + -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly -c conda-forge && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ From feac6d781ad06b923c35475f3307f8ef4473ad19 Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 23 Aug 2022 10:00:29 +0200 Subject: [PATCH 04/12] cuda 11.5 --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 2 +- .github/checkgroup.yml | 12 ++++++------ .github/workflows/ci-pytorch-dockers.yml | 12 ++++++------ .github/workflows/release-docker.yml | 6 +++--- dockers/README.md | 4 ++-- dockers/base-conda/Dockerfile | 8 +++++--- dockers/base-cuda/Dockerfile | 5 +++-- dockers/release/Dockerfile | 2 +- 9 files changed, 28 insertions(+), 25 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index 968186fbd275d..a46c9840f19e2 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.5.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index d3fb42d33d278..6271f2497b826 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.5.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index e855e661b2b66..a5d4d9b11ef8a 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -86,16 +86,16 @@ subprojects: - "build-conda (3.9, 1.11)" - "build-conda (3.9, 1.12)" - "build-cuda (3.8, 1.9, 11.1.1)" - - "build-cuda (3.9, 1.10, 11.6.1)" - - "build-cuda (3.9, 1.11, 11.6.1)" - - "build-cuda (3.9, 1.12, 11.6.1)" + - "build-cuda (3.9, 1.10, 11.5.1)" + - "build-cuda (3.9, 1.11, 11.5.1)" + - "build-cuda (3.9, 1.12, 11.5.1)" - "build-cuda (3.9, 1.9, 11.1.1)" - "build-hpu (1.5.0, 1.11.0)" - "build-ipu (3.9, 1.9)" - "build-NGC" - - "build-pl (3.9, 1.10, 11.6.1)" - - "build-pl (3.9, 1.11, 11.6.1)" - - "build-pl (3.9, 1.12, 11.6.1)" + - "build-pl (3.9, 1.10, 11.5.1)" + - "build-pl (3.9, 1.11, 11.5.1)" + - "build-pl (3.9, 1.12, 11.5.1)" - "build-pl (3.9, 1.9, 11.1.1)" - "build-xla (3.7, 1.12)" diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 7d2596d310868..420a6a0594dcc 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -20,9 +20,9 @@ jobs: # We only release one docker image per PyTorch version. # The matrix here is the same as the one in release-docker.yml. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.6.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.5.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -80,9 +80,9 @@ jobs: # These are the base images for PL release docker images, # so include at least all of the combinations in release-dockers.yml. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.6.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.5.1"} # Used in Lightning-AI/tutorials - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 68cfe735d00ae..128a5e44825df 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -17,9 +17,9 @@ jobs: include: # We only release one docker image per PyTorch version. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.6.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.5.1"} steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockers/README.md b/dockers/README.md index 551320ffae9f0..4fd362fb14432 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -11,7 +11,7 @@ git clone https://github.com/Lightning-AI/lightning.git docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . # build with specific arguments -docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.6.1 . +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.5.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.5.1 . ``` To run your docker use @@ -45,7 +45,7 @@ sudo systemctl restart docker and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.6.1 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.5.1 ``` ## Run Jupyter server diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 1935222ccfb07..cfa1c4056dfd9 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=11.6.1 +ARG CUDA_VERSION=11.5.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 @@ -68,14 +68,16 @@ ENV \ TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \ CONDA_ENV=lightning + COPY environment.yml environment.yml # conda init RUN \ conda update -n base -c defaults conda && \ + CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ conda create -y --name $CONDA_ENV \ - python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \ - -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly -c conda-forge && \ + python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \ + -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index adb2fdca7501f..999295cf6c823 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. ARG UBUNTU_VERSION=20.04 -ARG CUDA_VERSION=11.6.1 +ARG CUDA_VERSION=11.5.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} @@ -140,8 +140,9 @@ RUN \ RUN \ # install Bagua CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ + CUDA_VERSION_MM=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \ - python -c "import bagua_core; bagua_core.install_deps()" && \ + # python -c "import bagua_core; bagua_core.install_deps()" && \ python -c "import bagua; print(bagua.__version__)" COPY requirements/pytorch/check-avail-extras.py check-avail-extras.py diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index d0db53236b729..15025d7ab8bf0 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -14,7 +14,7 @@ ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.11 -ARG CUDA_VERSION=11.6.1 +ARG CUDA_VERSION=11.5.1 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} From e71f12c764e7d7846caf2cdbce414d21aa2480dd Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 23 Aug 2022 10:23:35 +0200 Subject: [PATCH 05/12] revert conda changes --- dockers/base-conda/Dockerfile | 6 ++---- dockers/base-cuda/Dockerfile | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index cfa1c4056dfd9..0a7c8884974c0 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=11.5.1 +ARG CUDA_VERSION=11.3.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 @@ -68,15 +68,13 @@ ENV \ TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \ CONDA_ENV=lightning - COPY environment.yml environment.yml # conda init RUN \ conda update -n base -c defaults conda && \ - CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ conda create -y --name $CONDA_ENV \ - python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \ + python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \ -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 999295cf6c823..3f902b7e3628f 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -142,7 +142,7 @@ RUN \ CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ CUDA_VERSION_MM=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \ - # python -c "import bagua_core; bagua_core.install_deps()" && \ + python -c "import bagua_core; bagua_core.install_deps()" && \ python -c "import bagua; print(bagua.__version__)" COPY requirements/pytorch/check-avail-extras.py check-avail-extras.py From 6e729dea797d6329e463ff06e8fe67a49ebd3973 Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 23 Aug 2022 11:22:15 +0200 Subject: [PATCH 06/12] 11.6 back again --- .github/workflows/ci-pytorch-dockers.yml | 12 ++++++------ .github/workflows/release-docker.yml | 6 +++--- dockers/README.md | 2 +- dockers/base-conda/Dockerfile | 2 +- dockers/base-cuda/Dockerfile | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 420a6a0594dcc..1d0b2f71e5f2e 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -20,9 +20,9 @@ jobs: # We only release one docker image per PyTorch version. # The matrix here is the same as the one in release-docker.yml. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.5.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.5.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -80,9 +80,9 @@ jobs: # These are the base images for PL release docker images, # so include at least all of the combinations in release-dockers.yml. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.5.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.5.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} # Used in Lightning-AI/tutorials - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 128a5e44825df..2d853dc3aeebc 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -17,9 +17,9 @@ jobs: include: # We only release one docker image per PyTorch version. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.5.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.5.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.5.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockers/README.md b/dockers/README.md index 4fd362fb14432..3facd99642944 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -11,7 +11,7 @@ git clone https://github.com/Lightning-AI/lightning.git docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . # build with specific arguments -docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.5.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.5.1 . +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.6.1 . ``` To run your docker use diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 0a7c8884974c0..55f63957f461f 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -75,7 +75,7 @@ RUN \ conda update -n base -c defaults conda && \ conda create -y --name $CONDA_ENV \ python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \ - -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ + -c pytorch -c pytorch-test -c pytorch-nightly && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 3f902b7e3628f..daf1b21e97c60 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -140,9 +140,9 @@ RUN \ RUN \ # install Bagua CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - CUDA_VERSION_MM=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ - pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \ - python -c "import bagua_core; bagua_core.install_deps()" && \ + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ + pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \ + if [[ "$CUDA_VERSION_MM" == "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ python -c "import bagua; print(bagua.__version__)" COPY requirements/pytorch/check-avail-extras.py check-avail-extras.py From de224421a3f8815c126f502f75428ca325fa87d5 Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 23 Aug 2022 11:26:48 +0200 Subject: [PATCH 07/12] 11.6 back again, all of them --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 2 +- .github/checkgroup.yml | 12 ++++++------ dockers/README.md | 2 +- dockers/base-cuda/Dockerfile | 2 +- dockers/release/Dockerfile | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index a46c9840f19e2..968186fbd275d 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.5.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 6271f2497b826..d3fb42d33d278 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.5.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index a5d4d9b11ef8a..2d9f7b247d38e 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -86,16 +86,16 @@ subprojects: - "build-conda (3.9, 1.11)" - "build-conda (3.9, 1.12)" - "build-cuda (3.8, 1.9, 11.1.1)" - - "build-cuda (3.9, 1.10, 11.5.1)" - - "build-cuda (3.9, 1.11, 11.5.1)" - - "build-cuda (3.9, 1.12, 11.5.1)" + - "build-cuda (3.9, 1.10, 11.3.1)" + - "build-cuda (3.9, 1.11, 11.6.1)" + - "build-cuda (3.9, 1.12, 11.6.1)" - "build-cuda (3.9, 1.9, 11.1.1)" - "build-hpu (1.5.0, 1.11.0)" - "build-ipu (3.9, 1.9)" - "build-NGC" - - "build-pl (3.9, 1.10, 11.5.1)" - - "build-pl (3.9, 1.11, 11.5.1)" - - "build-pl (3.9, 1.12, 11.5.1)" + - "build-pl (3.9, 1.10, 11.3.1)" + - "build-pl (3.9, 1.11, 11.6.1)" + - "build-pl (3.9, 1.12, 11.6.1)" - "build-pl (3.9, 1.9, 11.1.1)" - "build-xla (3.7, 1.12)" diff --git a/dockers/README.md b/dockers/README.md index 3facd99642944..551320ffae9f0 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -45,7 +45,7 @@ sudo systemctl restart docker and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.5.1 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.6.1 ``` ## Run Jupyter server diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index daf1b21e97c60..91210500dcb62 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. ARG UBUNTU_VERSION=20.04 -ARG CUDA_VERSION=11.5.1 +ARG CUDA_VERSION=11.3.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 15025d7ab8bf0..c39e66509188c 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -14,7 +14,7 @@ ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.11 -ARG CUDA_VERSION=11.5.1 +ARG CUDA_VERSION=11.3.1 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} From ec30b01c02c631a0eb37aa6efdec124ff3412892 Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 23 Aug 2022 13:26:16 +0200 Subject: [PATCH 08/12] maybe all passes now --- .github/checkgroup.yml | 12 ++++++------ .github/workflows/ci-pytorch-dockers.yml | 13 +++++++------ .github/workflows/release-docker.yml | 2 +- dockers/README.md | 4 ++-- dockers/base-cuda/Dockerfile | 2 +- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 2d9f7b247d38e..596218ba37bbd 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -81,20 +81,20 @@ subprojects: - ".github/workflows/*docker*.yml" - "setup.py" checks: - - "build-conda (3.8, 1.10)" - - "build-conda (3.8, 1.9)" - - "build-conda (3.9, 1.11)" - - "build-conda (3.9, 1.12)" + - "build-conda (3.8, 1.9, 11.1.1)" + - "build-conda (3.8, 1.10, 11.3.1)" + - "build-conda (3.9, 1.11, 11.3.1)" + - "build-conda (3.9, 1.12, 11.6.1)" - "build-cuda (3.8, 1.9, 11.1.1)" - "build-cuda (3.9, 1.10, 11.3.1)" - - "build-cuda (3.9, 1.11, 11.6.1)" + - "build-cuda (3.9, 1.11, 11.3.1)" - "build-cuda (3.9, 1.12, 11.6.1)" - "build-cuda (3.9, 1.9, 11.1.1)" - "build-hpu (1.5.0, 1.11.0)" - "build-ipu (3.9, 1.9)" - "build-NGC" - "build-pl (3.9, 1.10, 11.3.1)" - - "build-pl (3.9, 1.11, 11.6.1)" + - "build-pl (3.9, 1.11, 11.3.1)" - "build-pl (3.9, 1.12, 11.6.1)" - "build-pl (3.9, 1.9, 11.1.1)" - "build-xla (3.7, 1.12)" diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 1d0b2f71e5f2e..013bb4fd56ce6 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -21,7 +21,7 @@ jobs: # The matrix here is the same as the one in release-docker.yml. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 @@ -81,7 +81,7 @@ jobs: # so include at least all of the combinations in release-dockers.yml. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} # Used in Lightning-AI/tutorials - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} @@ -119,10 +119,10 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9"} - - {python_version: "3.8", pytorch_version: "1.10"} - - {python_version: "3.9", pytorch_version: "1.11"} - - {python_version: "3.9", pytorch_version: "1.12"} + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -136,6 +136,7 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 2d853dc3aeebc..2de330ea5ca75 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -18,7 +18,7 @@ jobs: # We only release one docker image per PyTorch version. - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - name: Checkout diff --git a/dockers/README.md b/dockers/README.md index 551320ffae9f0..4b203437ff8ab 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -11,7 +11,7 @@ git clone https://github.com/Lightning-AI/lightning.git docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . # build with specific arguments -docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.6.1 . +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.12-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.12 --build-arg CUDA_VERSION=11.6.1 . ``` To run your docker use @@ -45,7 +45,7 @@ sudo systemctl restart docker and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.6.1 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1 ``` ## Run Jupyter server diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 91210500dcb62..be613f3b6415f 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -142,7 +142,7 @@ RUN \ CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \ - if [[ "$CUDA_VERSION_MM" == "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ + if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ python -c "import bagua; print(bagua.__version__)" COPY requirements/pytorch/check-avail-extras.py check-avail-extras.py From d789c9238f3f5b937dfadd9f1cdba1bcd418d697 Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 23 Aug 2022 13:53:46 +0200 Subject: [PATCH 09/12] maybe all passes now --- .github/checkgroup.yml | 8 ++++---- .github/workflows/ci-pytorch-dockers.yml | 9 ++++----- dockers/base-conda/Dockerfile | 4 ++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 596218ba37bbd..8da4204228800 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -81,10 +81,10 @@ subprojects: - ".github/workflows/*docker*.yml" - "setup.py" checks: - - "build-conda (3.8, 1.9, 11.1.1)" - - "build-conda (3.8, 1.10, 11.3.1)" - - "build-conda (3.9, 1.11, 11.3.1)" - - "build-conda (3.9, 1.12, 11.6.1)" + - "build-conda (3.8, 1.9)" + - "build-conda (3.8, 1.10)" + - "build-conda (3.9, 1.11)" + - "build-conda (3.9, 1.12)" - "build-cuda (3.8, 1.9, 11.1.1)" - "build-cuda (3.9, 1.10, 11.3.1)" - "build-cuda (3.9, 1.11, 11.3.1)" diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 013bb4fd56ce6..89ea5b74952ad 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -119,10 +119,10 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.8", pytorch_version: "1.9"} + - {python_version: "3.8", pytorch_version: "1.10"} + - {python_version: "3.9", pytorch_version: "1.11"} + - {python_version: "3.9", pytorch_version: "1.12"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -136,7 +136,6 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 55f63957f461f..ea85587208f78 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION=11.3 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 @@ -75,7 +75,7 @@ RUN \ conda update -n base -c defaults conda && \ conda create -y --name $CONDA_ENV \ python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \ - -c pytorch -c pytorch-test -c pytorch-nightly && \ + -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ From 0ba6eba0c9564b709705de661b294a87022b1694 Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 23 Aug 2022 15:06:52 +0200 Subject: [PATCH 10/12] final push --- .github/checkgroup.yml | 8 ++++---- .github/workflows/ci-pytorch-dockers.yml | 9 +++++---- dockers/base-conda/Dockerfile | 14 ++++++++------ 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 8da4204228800..79abce15fc4a8 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -81,10 +81,10 @@ subprojects: - ".github/workflows/*docker*.yml" - "setup.py" checks: - - "build-conda (3.8, 1.9)" - - "build-conda (3.8, 1.10)" - - "build-conda (3.9, 1.11)" - - "build-conda (3.9, 1.12)" + - "build-conda (3.8, 1.9, 11.1.1)" + - "build-conda (3.8, 1.10.1, 11.1.1)" + - "build-conda (3.9, 1.11, 11.3.1)" + - "build-conda (3.9, 1.12, 11.3.1)" - "build-cuda (3.8, 1.9, 11.1.1)" - "build-cuda (3.9, 1.10, 11.3.1)" - "build-cuda (3.9, 1.11, 11.3.1)" diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 89ea5b74952ad..d79a7d18f7d7f 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -119,10 +119,10 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9"} - - {python_version: "3.8", pytorch_version: "1.10"} - - {python_version: "3.9", pytorch_version: "1.11"} - - {python_version: "3.9", pytorch_version: "1.12"} + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.8", pytorch_version: "1.10.1", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -136,6 +136,7 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index ea85587208f78..77984dad120da 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=11.3 +ARG CUDA_VERSION=11.3.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 @@ -42,16 +42,17 @@ RUN \ curl \ unzip \ ca-certificates \ - libopenmpi-dev \ - && \ + libopenmpi-dev +RUN \ # Install conda and python. # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ - rm ~/miniconda.sh && \ + rm ~/miniconda.sh +RUN \ # Cleaning apt-get autoremove -y && \ apt-get clean && \ @@ -73,9 +74,10 @@ COPY environment.yml environment.yml # conda init RUN \ conda update -n base -c defaults conda && \ + CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ conda create -y --name $CONDA_ENV \ - python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \ - -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ + python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \ + -c nvidia -c pytorch && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ From 1aa6663870caf94a503a09631bc8050b3e121f9b Mon Sep 17 00:00:00 2001 From: otaj Date: Tue, 23 Aug 2022 16:51:32 +0200 Subject: [PATCH 11/12] Revert "PUSH TO HUB. REVERT THIS!" This reverts commit 602bfce224cf22e24421448887844937e0aff9f0. --- .github/workflows/ci-pytorch-dockers.yml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index d79a7d18f7d7f..6cb28885e79ef 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -1,14 +1,28 @@ name: Docker on: - pull_request: {} + push: + branches: [master, "release/*"] + pull_request: + branches: [master, "release/*"] + paths: + - "dockers/**" + - "!dockers/README.md" + - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" + - "environment.yml" + - ".github/workflows/*docker*.yml" + - "setup.py" + schedule: + - cron: "0 0 * * *" # at the end of every day concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }} cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} env: - PUSH_TO_HUB: true + PUSH_TO_HUB: ${{ github.event_name == 'schedule' }} jobs: build-pl: From 0884fa1c931dbfd7a6d252aad4551260d2611a7a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Aug 2022 17:17:54 +0200 Subject: [PATCH 12/12] Apply suggestions from code review --- dockers/base-conda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 77984dad120da..03d2fb547ba6d 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -77,7 +77,7 @@ RUN \ CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ conda create -y --name $CONDA_ENV \ python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \ - -c nvidia -c pytorch && \ + -c nvidia -c pytorch -c pytorch-test && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \