diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml
index 0de590f2c54a6..968186fbd275d 100644
--- a/.azure/gpu-benchmark.yml
+++ b/.azure/gpu-benchmark.yml
@@ -28,7 +28,7 @@ jobs:
     cancelTimeoutInMinutes: "2"
     pool: azure-jirka-spot
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
     workspace:
       clean: all
diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index 683212cd55d4b..d3fb42d33d278 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         'PyTorch - stable':
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
     # how long to run the job before automatically cancelling
     timeoutInMinutes: "80"
     # how much time to give 'run always even if cancelled tasks' before stopping them
diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index c2654eddd7ca1..79abce15fc4a8 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -81,21 +81,21 @@ subprojects:
       - ".github/workflows/*docker*.yml"
       - "setup.py"
     checks:
-      - "build-conda (3.8, 1.10)"
-      - "build-conda (3.8, 1.9)"
-      - "build-conda (3.9, 1.11)"
-      - "build-conda (3.9, 1.12)"
+      - "build-conda (3.8, 1.9, 11.1.1)"
+      - "build-conda (3.8, 1.10.1, 11.1.1)"
+      - "build-conda (3.9, 1.11, 11.3.1)"
+      - "build-conda (3.9, 1.12, 11.3.1)"
       - "build-cuda (3.8, 1.9, 11.1.1)"
       - "build-cuda (3.9, 1.10, 11.3.1)"
       - "build-cuda (3.9, 1.11, 11.3.1)"
-      - "build-cuda (3.9, 1.12, 11.3.1)"
+      - "build-cuda (3.9, 1.12, 11.6.1)"
       - "build-cuda (3.9, 1.9, 11.1.1)"
       - "build-hpu (1.5.0, 1.11.0)"
       - "build-ipu (3.9, 1.9)"
       - "build-NGC"
       - "build-pl (3.9, 1.10, 11.3.1)"
       - "build-pl (3.9, 1.11, 11.3.1)"
-      - "build-pl (3.9, 1.12, 11.3.1)"
+      - "build-pl (3.9, 1.12, 11.6.1)"
       - "build-pl (3.9, 1.9, 11.1.1)"
       - "build-xla (3.7, 1.12)"
diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml
index a05dbbb5bc8ef..6cb28885e79ef 100644
--- a/.github/workflows/ci-pytorch-dockers.yml
+++ b/.github/workflows/ci-pytorch-dockers.yml
@@ -36,7 +36,7 @@ jobs:
           - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
           - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
           - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
-          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
     steps:
       - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2
@@ -96,7 +96,7 @@ jobs:
           - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
           - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
           - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
-          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
           # Used in Lightning-AI/tutorials
           - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
     steps:
@@ -133,10 +133,10 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - {python_version: "3.8", pytorch_version: "1.9"}
-          - {python_version: "3.8", pytorch_version: "1.10"}
-          - {python_version: "3.9", pytorch_version: "1.11"}
-          - {python_version: "3.9", pytorch_version: "1.12"}
+          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
+          - {python_version: "3.8", pytorch_version: "1.10.1", cuda_version: "11.1.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
     steps:
       - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
@@ -150,6 +150,7 @@ jobs:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
+            CUDA_VERSION=${{ matrix.cuda_version }}
           file: dockers/base-conda/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
           tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index 6901a24204683..2de330ea5ca75 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -19,7 +19,7 @@ jobs:
         - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
         - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
         - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
-        - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
+        - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
     steps:
       - name: Checkout
         uses: actions/checkout@v2
diff --git a/dockers/README.md b/dockers/README.md
index b1ff9826b6c1f..4b203437ff8ab 100644
--- a/dockers/README.md
+++ b/dockers/README.md
@@ -11,7 +11,7 @@ git clone https://github.com/Lightning-AI/lightning.git
 docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile .
 
 # build with specific arguments
-docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 .
+docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.12-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.12 --build-arg CUDA_VERSION=11.6.1 .
 ```
 
 To run your docker use
@@ -45,7 +45,7 @@ sudo systemctl restart docker
 and later run the docker image with `--gpus all`. For example,
 
 ```
-docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1
+docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1
 ```
 
 ## Run Jupyter server
diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile
index 0a7c8884974c0..03d2fb547ba6d 100644
--- a/dockers/base-conda/Dockerfile
+++ b/dockers/base-conda/Dockerfile
@@ -42,16 +42,17 @@ RUN \
         curl \
         unzip \
         ca-certificates \
-        libopenmpi-dev \
-    && \
+        libopenmpi-dev
+
+RUN \
     # Install conda and python.
     # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
     curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
     chmod +x ~/miniconda.sh && \
     ~/miniconda.sh -b && \
-    rm ~/miniconda.sh && \
+    rm ~/miniconda.sh
+
+RUN \
     # Cleaning
     apt-get autoremove -y && \
     apt-get clean && \
@@ -73,9 +74,10 @@ COPY environment.yml environment.yml
 # conda init
 RUN \
     conda update -n base -c defaults conda && \
+    CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \
     conda create -y --name $CONDA_ENV \
-        python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \
-        -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \
+        python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \
+        -c nvidia -c pytorch -c pytorch-test && \
     conda init bash && \
     # NOTE: this requires that the channel is presented in the yaml before packages \
     printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 01372574e4618..be613f3b6415f 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -140,8 +140,9 @@ RUN \
 RUN \
     # install Bagua
     CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
-    pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \
-    python -c "import bagua_core; bagua_core.install_deps()" && \
+    CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \
+    pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \
+    if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \
     python -c "import bagua; print(bagua.__version__)"
 
 COPY requirements/pytorch/check-avail-extras.py check-avail-extras.py
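Note (not part of the patch): the Dockerfile one-liners above derive CUDA-dependent versions from the `CUDA_VERSION` build arg. The sketch below mirrors that logic in plain Python so it can be read and tested outside the image build; the helper names `cuda_major_minor` and `resolve_bagua_cuda` are illustrative and do not exist in the repository.

```python
def cuda_major_minor(cuda_version: str) -> str:
    # dockers/base-conda/Dockerfile: "11.6.1" -> "11.6", used for cudatoolkit=${CUDA_VERSION_MM}
    return ".".join(cuda_version.split(".")[:2])


def resolve_bagua_cuda(cuda_version: str):
    # dockers/base-cuda/Dockerfile: "11.6.1" -> "116", then pick the newest
    # Bagua wheel built for a CUDA version not newer than the image's CUDA.
    cuda_mm = int("".join(cuda_version.split(".")[:2]))
    supported = [115, 113, 111, 102]
    bagua_cuda = next(ver for ver in supported if cuda_mm >= ver)
    # bagua_core.install_deps() is only run when the wheel's CUDA matches exactly.
    return bagua_cuda, cuda_mm == bagua_cuda


print(cuda_major_minor("11.6.1"))    # 11.6
print(resolve_bagua_cuda("11.6.1"))  # (115, False) -> installs bagua-cuda115, skips install_deps()
print(resolve_bagua_cuda("11.3.1"))  # (113, True)  -> installs bagua-cuda113 and runs install_deps()
```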