Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .azure/gpu-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
cancelTimeoutInMinutes: "2"
pool: azure-jirka-spot
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
workspace:
clean: all
Expand Down
2 changes: 1 addition & 1 deletion .azure/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
'PyTorch - stable':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
# how long to run the job before automatically cancelling
timeoutInMinutes: "80"
# how much time to give 'run always even if cancelled' tasks before stopping them
Expand Down
12 changes: 6 additions & 6 deletions .github/checkgroup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,21 +81,21 @@ subprojects:
- ".github/workflows/*docker*.yml"
- "setup.py"
checks:
- "build-conda (3.8, 1.10)"
- "build-conda (3.8, 1.9)"
- "build-conda (3.9, 1.11)"
- "build-conda (3.9, 1.12)"
- "build-conda (3.8, 1.9, 11.1.1)"
- "build-conda (3.8, 1.10.1, 11.1.1)"
- "build-conda (3.9, 1.11, 11.3.1)"
- "build-conda (3.9, 1.12, 11.3.1)"
- "build-cuda (3.8, 1.9, 11.1.1)"
- "build-cuda (3.9, 1.10, 11.3.1)"
- "build-cuda (3.9, 1.11, 11.3.1)"
- "build-cuda (3.9, 1.12, 11.3.1)"
- "build-cuda (3.9, 1.12, 11.6.1)"
- "build-cuda (3.9, 1.9, 11.1.1)"
- "build-hpu (1.5.0, 1.11.0)"
- "build-ipu (3.9, 1.9)"
- "build-NGC"
- "build-pl (3.9, 1.10, 11.3.1)"
- "build-pl (3.9, 1.11, 11.3.1)"
- "build-pl (3.9, 1.12, 11.3.1)"
- "build-pl (3.9, 1.12, 11.6.1)"
- "build-pl (3.9, 1.9, 11.1.1)"
- "build-xla (3.7, 1.12)"

Expand Down
13 changes: 7 additions & 6 deletions .github/workflows/ci-pytorch-dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
steps:
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
Expand Down Expand Up @@ -96,7 +96,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
# Used in Lightning-AI/tutorials
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
steps:
Expand Down Expand Up @@ -133,10 +133,10 @@ jobs:
fail-fast: false
matrix:
include:
- {python_version: "3.8", pytorch_version: "1.9"}
- {python_version: "3.8", pytorch_version: "1.10"}
- {python_version: "3.9", pytorch_version: "1.11"}
- {python_version: "3.9", pytorch_version: "1.12"}
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
- {python_version: "3.8", pytorch_version: "1.10.1", cuda_version: "11.1.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
steps:
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
Expand All @@ -150,6 +150,7 @@ jobs:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
file: dockers/base-conda/Dockerfile
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release-docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
4 changes: 2 additions & 2 deletions dockers/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ git clone https://github.com/Lightning-AI/lightning.git
docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile .

# build with specific arguments
docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 .
docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.12-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.12 --build-arg CUDA_VERSION=11.6.1 .
```

To run your docker use
Expand Down Expand Up @@ -45,7 +45,7 @@ sudo systemctl restart docker
and later run the docker image with `--gpus all`. For example,

```
docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1
docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1
```

## Run Jupyter server
Expand Down
12 changes: 7 additions & 5 deletions dockers/base-conda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,17 @@ RUN \
curl \
unzip \
ca-certificates \
libopenmpi-dev \
&& \
libopenmpi-dev

RUN \
# Install conda and python.
# NOTE: new Conda does not forward the exit status; see https://github.com/conda/conda/issues/8385
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh && \
rm ~/miniconda.sh

RUN \
# Cleaning
apt-get autoremove -y && \
apt-get clean && \
Expand All @@ -73,9 +74,10 @@ COPY environment.yml environment.yml
# conda init
RUN \
conda update -n base -c defaults conda && \
CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \
conda create -y --name $CONDA_ENV \
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \
-c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \
-c nvidia -c pytorch -c pytorch-test && \
conda init bash && \
# NOTE: this requires that the channel is listed in the yaml before the packages \
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
Expand Down
5 changes: 3 additions & 2 deletions dockers/base-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,9 @@ RUN \
RUN \
# install Bagua
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \
python -c "import bagua_core; bagua_core.install_deps()" && \
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \
pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \
if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \
python -c "import bagua; print(bagua.__version__)"

COPY requirements/pytorch/check-avail-extras.py check-avail-extras.py
Expand Down