Skip to content

Commit

Permalink
Pytorch 2 support (#1513)
Browse files Browse the repository at this point in the history
* adding testing statements

* adding scripts to run model with pytorch2.0

* small fixes and adding licence

* dino model change to support pytorch 2.0

* dino model change to support pytorch 2.0

* small fix

* refactoring

* refactoring

* refactoring

* small fix

* small fix

* adding comments

* fixing comments

* fixing comments

* fixing comments

* removing python version

* removing python version

* removing python version
  • Loading branch information
Tulsishah committed Nov 27, 2023
1 parent 03a98d8 commit 3a66e45
Show file tree
Hide file tree
Showing 16 changed files with 280 additions and 69 deletions.
14 changes: 0 additions & 14 deletions perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright 2023 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Entry point for the PyTorch v1_12 DINO test: defines the VM/test parameters
# and hands them to run_and_manage_test.sh as positional arguments
# (VM name, zone, artifacts bucket path, test script path, pytorch version).

# Exit immediately if any command returns a non-zero status.
set -e

VM_NAME="pytorch-dino-7d"
ZONE_NAME="us-west1-b"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v1_12/dino"
# Path of the test script relative to $HOME inside the test VM.
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v1_12"

# Fail with a clear message if KOKORO_ARTIFACTS_DIR is unset or empty instead
# of attempting to cd into a path rooted at "/".
cd "${KOKORO_ARTIFACTS_DIR:?KOKORO_ARTIFACTS_DIR must be set}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

# Arguments are quoted so each one is passed as exactly one word.
source run_and_manage_test.sh "$VM_NAME" "$ZONE_NAME" "$ARTIFACTS_BUCKET_PATH" "$TEST_SCRIPT_PATH" "$PYTORCH_VERSION"
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2023 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh"

# 1 hour timeout.
timeout_mins: 60
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright 2023 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Entry point for the PyTorch v2 DINO test: defines the VM/test parameters
# and hands them to run_and_manage_test.sh as positional arguments
# (VM name, zone, artifacts bucket path, test script path, pytorch version).

# Exit immediately if any command returns a non-zero status.
set -e

VM_NAME="pytorch2-dino-7d"
ZONE_NAME="us-west1-a"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino"
# Path of the test script relative to $HOME inside the test VM.
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v2"

# Fail with a clear message if KOKORO_ARTIFACTS_DIR is unset or empty instead
# of attempting to cd into a path rooted at "/".
cd "${KOKORO_ARTIFACTS_DIR:?KOKORO_ARTIFACTS_DIR must be set}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

# Arguments are quoted so each one is passed as exactly one word.
source run_and_manage_test.sh "$VM_NAME" "$ZONE_NAME" "$ARTIFACTS_BUCKET_PATH" "$TEST_SCRIPT_PATH" "$PYTORCH_VERSION"
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2023 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh"

# 1 hour timeout.
timeout_mins: 60
50 changes: 32 additions & 18 deletions perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ ZONE_NAME=$2
ARTIFACTS_BUCKET_PATH=$3
# Path of test script relative to $HOME inside test VM.
TEST_SCRIPT_PATH=$4
# PyTorch version of the test being run (e.g. "v1_12" or "v2"), taken from the
# fifth positional argument.
PYTORCH_VERSION=$5
# Default machine type, GPU accelerator spec and reservation used when creating
# the test VM. These values are replaced further down in this script when
# PYTORCH_VERSION is "v2".
MACHINE_TYPE="a2-highgpu-2g"
ACCELERATOR="count=2,type=nvidia-tesla-a100"
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus"

function initialize_ssh_key () {
echo "Delete existing ssh keys "
Expand Down Expand Up @@ -55,26 +60,35 @@ function delete_existing_vm_and_create_new () {
echo "Wait for 30 seconds for old VM to be deleted"
sleep 30s

# NVIDIA A100 40GB GPU machines are currently unavailable due to a global
# shortage, so the PyTorch v2 run is created on NVIDIA L4 machines instead,
# which are available in the us-west1-a zone. For any other version the
# previously assigned MACHINE_TYPE/ACCELERATOR/RESERVATION values are kept.
#
# PYTORCH_VERSION is quoted so that an empty or unset value (e.g. a caller that
# passes no fifth argument) simply compares unequal instead of making `[` fail
# with "unary operator expected".
if [ "$PYTORCH_VERSION" == "v2" ]; then
  MACHINE_TYPE="g2-standard-24"
  ACCELERATOR="count=2,type=nvidia-l4"
  RESERVATION="projects/$GCP_PROJECT/reservations/pytorch2-ai-ml-tests"
fi

echo "Creating VM $VM_NAME in zone $ZONE_NAME"
# The below command creates the VM using the reservation held in $RESERVATION.
sudo gcloud compute instances create $VM_NAME \
--project=$GCP_PROJECT\
--zone=$ZONE_NAME \
--machine-type=a2-highgpu-2g \
--network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \
--metadata=enable-osconfig=TRUE,enable-oslogin=true \
--maintenance-policy=TERMINATE \
--provisioning-model=STANDARD \
--service-account=927584127901-compute@developer.gserviceaccount.com \
--scopes=https://www.googleapis.com/auth/cloud-platform \
--accelerator=count=2,type=nvidia-tesla-a100 \
--create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20230616,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \
--no-shielded-secure-boot \
--shielded-vtpm \
--shielded-integrity-monitoring \
--labels=goog-ops-agent-policy=v2-x86-template-1-0-0,goog-ec-src=vm_add-gcloud \
--reservation-affinity=specific \
--reservation=projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus
--project=$GCP_PROJECT\
--zone=$ZONE_NAME \
--machine-type=$MACHINE_TYPE \
--network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \
--metadata=enable-osconfig=TRUE,enable-oslogin=true \
--maintenance-policy=TERMINATE \
--provisioning-model=STANDARD \
--service-account=927584127901-compute@developer.gserviceaccount.com \
--scopes=https://www.googleapis.com/auth/cloud-platform \
--accelerator=$ACCELERATOR \
--create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20230616,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \
--no-shielded-secure-boot \
--shielded-vtpm \
--shielded-integrity-monitoring \
--labels=goog-ops-agent-policy=v2-x86-template-1-0-0,goog-ec-src=vm_add-gcloud \
--reservation-affinity=specific \
--reservation=$RESERVATION

echo "Wait for 30 seconds for new VM to be initialised"
sleep 30s
Expand Down Expand Up @@ -132,7 +146,7 @@ exit_status=0
# Transitions:
# START to START: If model run is not triggered due to some error.
# START to RUNNING: If model is successfully triggered on GPU. This state is
# changed by setup_host.sh that runs inside docker container of test VM.
# changed by setup_host.sh that runs inside docker container of test VM.
if [ $current_status == "START" ];
then
echo "Update commit Id for the run"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ curl, ca-certificates, lsb-release etc.
This script contains the instruction to install gcsfuse, mount GCS-bucket
using gcsfuse, and finally runs the pytorch dino model.

### File: perfmetrics/scripts/continuous_test/pytorch/dino/build.sh
### File: perfmetrics/scripts/continuous_test/ml_tests/pytorch/{v1_12 or v2}/dino/build.sh
This is the parent script of the above two scripts. First, it sets up the host
machine; after that, it creates the docker image; and finally, it runs the
container with the instructions written in setup_container.sh.
Expand All @@ -40,6 +40,6 @@ log.txt - Contains the model learning parameter value after each epoch.
variable - with current working directory.
3. Create a folder named "github" and clone the gcsfuse repo in that.
4. Run the below script in the current working directory:
**source github/gcsfuse/permetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh**
**source github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/{v1_12 or v2}/dino/build.sh**
5. The above command first sets up the host and then starts running the model
inside the container.

This file was deleted.

32 changes: 32 additions & 0 deletions perfmetrics/scripts/ml_tests/pytorch/run_container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Copyright 2023 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Builds the PyTorch DINO docker image for the requested PyTorch version,
# starts it as a detached GPU container and then follows the container logs.
#
# Arguments:
#   $1 - pytorch version directory name (e.g. v1_12, v2); selects
#        perfmetrics/scripts/ml_tests/pytorch/<version>/dino/Dockerfile.

# Exit immediately if any command returns a non-zero status.
set -e

# Abort with a clear message when the version is missing; otherwise the
# Dockerfile path below would silently become .../pytorch//dino/Dockerfile.
PYTORCH_VERSION="${1:?usage: run_container.sh <pytorch_version, e.g. v1_12 or v2>}"

cd "$HOME/github/gcsfuse"
echo "Building docker image containing all pytorch libraries..."
sudo docker build . \
  -f "perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VERSION}/dino/Dockerfile" \
  --tag pytorch-gcsfuse

# Host directory that is bind-mounted into the container at
# /pytorch_dino/run_artifacts.
mkdir -p container_artifacts

echo "Running the docker image build in the previous step..."
sudo docker run --gpus all --name=pytorch_automation_container --privileged -d \
  -v "$HOME/github/gcsfuse/container_artifacts:/pytorch_dino/run_artifacts:rw,rshared" \
  --shm-size=128g pytorch-gcsfuse:latest

# Set up log rotation for container_artifacts/gcsfuse.log.
source perfmetrics/scripts/ml_tests/setup_log_rotation.sh "$HOME/github/gcsfuse/container_artifacts/gcsfuse.log"

# Wait for the script completion while streaming the container's log output.
sudo docker logs -f pytorch_automation_container
Original file line number Diff line number Diff line change
@@ -1,4 +1,19 @@
#!/bin/bash
# Copyright 2023 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

PYTORCH_VESRION=$1

# Install golang
wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-amd64.tar.gz -q
Expand Down Expand Up @@ -39,7 +54,7 @@ def pil_loader(path: str) -> Image.Image:
return rgb_img
" > bypassed_code.py

folder_file="/opt/conda/lib/python3.7/site-packages/torchvision/datasets/folder.py"
# Path of torchvision's datasets/folder.py inside the container's conda install.
# NOTE(review): the Python version (3.10) is hard-coded in this path; confirm it
# matches every image that copies this script.
folder_file="/opt/conda/lib/python3.10/site-packages/torchvision/datasets/folder.py"
# x: line number in folder.py where pil_loader is defined.
x=$(grep -n "def pil_loader(path: str) -> Image.Image:" $folder_file | cut -f1 -d ':')
# y: line number where accimage_loader is defined, moved up by two lines
# (presumably to the last line of pil_loader's body — confirm against folder.py).
y=$(grep -n "def accimage_loader(path: str) -> Any:" $folder_file | cut -f1 -d ':')
y=$((y - 2))
Expand All @@ -51,7 +66,7 @@ sed -i "$x"'r bypassed_code.py' $folder_file
# nproc_per_node - by downloading the model in single thread environment.
python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'

ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/dino"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${PYTORCH_VESRION}/dino"
echo "Update status file"
echo "RUNNING" > status.txt
gsutil cp status.txt $ARTIFACTS_BUCKET_PATH/
Expand All @@ -66,7 +81,7 @@ gsutil cp start_time.txt $ARTIFACTS_BUCKET_PATH/
# We need to run it in foreground mode to make the container running.
echo "Running the pytorch dino model..."
experiment=dino_experiment
python3 -m torch.distributed.launch \
torchrun \
--nproc_per_node=2 dino/main_dino.py \
--arch vit_small \
--num_workers 20 \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ WORKDIR "/pytorch_dino/"

RUN git clone "https://github.com/facebookresearch/dino"

COPY perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh ./
COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./

RUN mkdir -p "run_artifacts"
RUN mkdir -p "gcsfuse_data"

ENTRYPOINT ["/bin/bash", "-c", "./setup_container.sh"]
ENV PYTORCH_VERSION="v1_12"

# Exec-form ENTRYPOINT must be a valid JSON array, so every element needs its
# closing double quote. bash expands ${PYTORCH_VERSION} from the ENV above when
# the container starts.
ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright 2023 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sets up the host (Docker and Nvidia driver) and then builds and runs the
# PyTorch v1_12 DINO container.

# Exit immediately if any command returns a non-zero status.
set -e

cd "$HOME/github/gcsfuse/perfmetrics/scripts"

echo "Setting up the machine with Docker and Nvidia Driver"
# Driver version for A100 GPUs is 450.172.01
DRIVER_VERSION="450.172.01"
# Arguments are quoted so each is passed to the sourced script as one word.
source ml_tests/setup_host.sh "$DRIVER_VERSION"

# Selects the version-specific Dockerfile used by run_container.sh.
PYTORCH_VERSION="v1_12"
source ml_tests/pytorch/run_container.sh "$PYTORCH_VERSION"
46 changes: 46 additions & 0 deletions perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright 2023 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Base image: PyTorch 2.0 GPU image from the deeplearning-platform-release
# registry (the tag suggests Python 3.10).
# NOTE(review): the earlier comment described this as an image with gcsfuse and
# its .deb package installed; nothing in this Dockerfile installs gcsfuse — confirm.
FROM gcr.io/deeplearning-platform-release/pytorch-gpu.2-0.py310

# Let non-root users pass the allow_other / allow_root FUSE mount options.
RUN echo "user_allow_other" > /etc/fuse.conf

# Extra Python package installed on top of the base image.
RUN pip3 install timm

# All following steps run relative to /pytorch_dino/.
WORKDIR /pytorch_dino/

RUN git clone https://github.com/facebookresearch/dino
# TODO(TulsiShah): The current docker image does not support the dino model with compile mode.
# The code below can be re-enabled once the docker image supports it.

# WORKDIR "/pytorch_dino/dino"
# RUN echo '[remote "origin"]' >> .git/config
# RUN echo ' fetch = +refs/pull/262/head:refs/remotes/origin/pr/262' >> .git/config
#
# RUN git fetch origin
# RUN git diff origin/main origin/pr/262 > diff.patch
# RUN git apply diff.patch
#
# WORKDIR "/pytorch_dino/"

# Script executed by the ENTRYPOINT below.
COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./

# Create both working directories in a single layer.
RUN mkdir -p run_artifacts gcsfuse_data

# Version string handed to run_model.sh as its first argument.
ENV PYTORCH_VERSION="v2"

ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]

0 comments on commit 3a66e45

Please sign in to comment.