Pytorch 2 support (#1513)

* adding testing statements * adding scripts to run model with pytorch2.0 * small fixes and adding licence * dino model change to support pytorch 2.0 * dino model change to support pytorch 2.0 * small fix * refactoring * refactoring * refactoring * small fix * small fix * adding comments * fixing comments * fixing comments * fixing comments * removing python version * removing python version * removing python version
GoogleCloudPlatform · Nov 27, 2023 · 3a66e45 · 3a66e45
1 parent 03a98d8
commit 3a66e45
Show file tree

Hide file tree

Showing 16 changed files with 280 additions and 69 deletions.
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stop execution as soon as any command exits with a non-zero status.
+set -e
+
+VM_NAME="pytorch-dino-7d"
+ZONE_NAME="us-west1-b"
+ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v1_12/dino"
+TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh"
+PYTORCH_VERSION="v1_12"
+
+cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"
+
+source run_and_manage_test.sh "$VM_NAME" "$ZONE_NAME" "$ARTIFACTS_BUCKET_PATH" "$TEST_SCRIPT_PATH" "$PYTORCH_VERSION"  # quoted: an empty value must not shift the positional args
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg
@@ -0,0 +1,18 @@
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh"
+
+# 60 minutes (1 hour) timeout.
+timeout_mins: 60
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stop execution as soon as any command exits with a non-zero status.
+set -e
+
+VM_NAME="pytorch2-dino-7d"
+ZONE_NAME="us-west1-a"
+ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino"
+TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
+PYTORCH_VERSION="v2"
+
+cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"
+
+source run_and_manage_test.sh "$VM_NAME" "$ZONE_NAME" "$ARTIFACTS_BUCKET_PATH" "$TEST_SCRIPT_PATH" "$PYTORCH_VERSION"  # quoted: an empty value must not shift the positional args
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg
@@ -0,0 +1,18 @@
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh"
+
+# 60 minutes (1 hour) timeout.
+timeout_mins: 60
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh b/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh
@@ -27,6 +27,11 @@ ZONE_NAME=$2
 ARTIFACTS_BUCKET_PATH=$3
 # Path of test script relative to $HOME inside test VM.
 TEST_SCRIPT_PATH=$4
+# PyTorch version (e.g. v1_12, v2); "v2" switches the VM to the L4 machine type, accelerator and reservation.
+PYTORCH_VERSION=$5
+MACHINE_TYPE="a2-highgpu-2g"
+ACCELERATOR="count=2,type=nvidia-tesla-a100"
+RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus"
 
 function initialize_ssh_key () {
     echo "Delete existing ssh keys "
@@ -55,26 +60,35 @@ function delete_existing_vm_and_create_new () {
   echo "Wait for 30 seconds for old VM to be deleted"
   sleep 30s
 
+  # NVIDIA A100 40GB GPU type machine is currently unavailable due to global shortage.
+  # For PyTorch 2, create NVIDIA L4 machines, which are available in the us-west1-a zone.
+  if [ "$PYTORCH_VERSION" == "v2" ];  # quoted so an unset/empty version compares as false instead of erroring
+  then
+    MACHINE_TYPE="g2-standard-24"
+    ACCELERATOR="count=2,type=nvidia-l4"
+    RESERVATION="projects/$GCP_PROJECT/reservations/pytorch2-ai-ml-tests"
+  fi
+
   echo "Creating VM $VM_NAME in zone $ZONE_NAME"
   # The below command creates VM using the reservation 'ai-ml-tests'
   sudo gcloud compute instances create $VM_NAME \
-           --project=$GCP_PROJECT\
-           --zone=$ZONE_NAME \
-           --machine-type=a2-highgpu-2g \
-           --network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \
-           --metadata=enable-osconfig=TRUE,enable-oslogin=true \
-           --maintenance-policy=TERMINATE \
-           --provisioning-model=STANDARD \
-           --service-account=927584127901-compute@developer.gserviceaccount.com \
-           --scopes=https://www.googleapis.com/auth/cloud-platform \
-           --accelerator=count=2,type=nvidia-tesla-a100 \
-           --create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20230616,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \
-           --no-shielded-secure-boot \
-           --shielded-vtpm \
-           --shielded-integrity-monitoring \
-           --labels=goog-ops-agent-policy=v2-x86-template-1-0-0,goog-ec-src=vm_add-gcloud \
-           --reservation-affinity=specific \
-           --reservation=projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus
+          --project=$GCP_PROJECT\
+          --zone=$ZONE_NAME \
+          --machine-type=$MACHINE_TYPE \
+          --network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \
+          --metadata=enable-osconfig=TRUE,enable-oslogin=true \
+          --maintenance-policy=TERMINATE \
+          --provisioning-model=STANDARD \
+          --service-account=927584127901-compute@developer.gserviceaccount.com \
+          --scopes=https://www.googleapis.com/auth/cloud-platform \
+          --accelerator=$ACCELERATOR \
+          --create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20230616,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \
+          --no-shielded-secure-boot \
+          --shielded-vtpm \
+          --shielded-integrity-monitoring \
+          --labels=goog-ops-agent-policy=v2-x86-template-1-0-0,goog-ec-src=vm_add-gcloud \
+          --reservation-affinity=specific \
+          --reservation=$RESERVATION
 
   echo "Wait for 30 seconds for new VM to be initialised"
   sleep 30s
@@ -132,7 +146,7 @@ exit_status=0
 # Transitions:
 # START to START: If model run is not triggerred due to some error.
 # START to RUNNING: If model is successfully triggerred on GPU. This state is 
-#                   changed by setup_host.sh that runs inside docker container of test VM.
+#                   changed by the model script (run_model.sh for pytorch) that runs inside docker container of test VM.
 if [ $current_status == "START" ];
 then
   echo "Update commit Id for the run"

diff --git a/...pts/ml_tests/pytorch/dino/README-usage.md → .../scripts/ml_tests/pytorch/README-usage.md b/...pts/ml_tests/pytorch/dino/README-usage.md → .../scripts/ml_tests/pytorch/README-usage.md
@@ -16,7 +16,7 @@ curl, ca-certificates, lsb-release etc.
 This script contains the instruction to install gcsfuse, mount GCS-bucket
 using gcsfuse, and finally runs the pytorch dino model.
 
-### File: perfmetrics/scripts/continuous_test/pytorch/dino/build.sh
+### File: perfmetrics/scripts/continuous_test/ml_tests/pytorch/{v1_12 or v2}/dino/build.sh
 This is the parent script of the above two scripts. Firstly, it sets-up the host
 machine after that it creates the docker-image and finally it runs the container
 with the inststructions written in the setup_container.sh.
@@ -40,6 +40,6 @@ log.txt - Contains the model learning parameter value after each epoch.
 variable - with current working directory.
 3. Create a folder named "github" and clone the gcsfuse repo in that.
 4. Run the below script in the current working directory:
-   **source github/gcsfuse/permetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh**
+   **source github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/{v1_12 or v2}/dino/build.sh**
 5. The above command first setups the host and then start running the model
 inside container.
diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh b/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh
diff --git a/perfmetrics/scripts/ml_tests/pytorch/run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/run_container.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# PyTorch version directory to build from (e.g. v1_12, v2). Required first argument; fail early with a clear message if it is missing.
+PYTORCH_VERSION="${1:?pytorch version argument is required (e.g. v1_12 or v2)}"
+cd "$HOME/github/gcsfuse"
+echo "Building docker image containing all pytorch libraries..."
+sudo docker build . -f "perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VERSION}/dino/Dockerfile" --tag pytorch-gcsfuse
+
+mkdir -p container_artifacts
+
+echo "Running the docker image build in the previous step..."
+sudo docker run --gpus all --name=pytorch_automation_container --privileged -d -v $HOME/github/gcsfuse/container_artifacts:/pytorch_dino/run_artifacts:rw,rshared \
+--shm-size=128g pytorch-gcsfuse:latest
+
+# Setup the log_rotation.
+source perfmetrics/scripts/ml_tests/setup_log_rotation.sh $HOME/github/gcsfuse/container_artifacts/gcsfuse.log
+
+# Wait for the script completion as well as logs output.
+sudo docker logs -f pytorch_automation_container
diff --git a/.../ml_tests/pytorch/dino/setup_container.sh → ...ics/scripts/ml_tests/pytorch/run_model.sh b/.../ml_tests/pytorch/dino/setup_container.sh → ...ics/scripts/ml_tests/pytorch/run_model.sh
@@ -1,4 +1,19 @@
 #!/bin/bash
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+PYTORCH_VESRION=$1
 
 # Install golang
 wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-amd64.tar.gz -q
@@ -39,7 +54,7 @@ def pil_loader(path: str) -> Image.Image:
     return rgb_img
 " > bypassed_code.py
 
-folder_file="/opt/conda/lib/python3.7/site-packages/torchvision/datasets/folder.py"
+folder_file="$(python -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')/torchvision/datasets/folder.py"  # resolve site-packages from the image's interpreter; this script is shared by the v1_12 and v2 images
 x=$(grep -n "def pil_loader(path: str) -> Image.Image:" $folder_file | cut -f1 -d ':')
 y=$(grep -n "def accimage_loader(path: str) -> Any:" $folder_file | cut -f1 -d ':')
 y=$((y - 2))
@@ -51,7 +66,7 @@ sed -i "$x"'r bypassed_code.py' $folder_file
 # nproc_per_node - by downloading the model in single thread environment.
 python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'
 
-ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/dino"
+ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${PYTORCH_VESRION}/dino"
 echo "Update status file"
 echo "RUNNING" > status.txt
 gsutil cp status.txt $ARTIFACTS_BUCKET_PATH/
@@ -66,7 +81,7 @@ gsutil cp start_time.txt $ARTIFACTS_BUCKET_PATH/
   # We need to run it in foreground mode to make the container running.
   echo "Running the pytorch dino model..."
   experiment=dino_experiment
-  python3 -m torch.distributed.launch \
+  torchrun \
     --nproc_per_node=2 dino/main_dino.py \
     --arch vit_small \
     --num_workers 20 \

diff --git a/.../scripts/ml_tests/pytorch/dino/Dockerfile → ...ts/ml_tests/pytorch/v1_12/dino/Dockerfile b/.../scripts/ml_tests/pytorch/dino/Dockerfile → ...ts/ml_tests/pytorch/v1_12/dino/Dockerfile
@@ -24,9 +24,11 @@ WORKDIR "/pytorch_dino/"
 
 RUN git clone "https://github.com/facebookresearch/dino"
 
-COPY perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh ./
+COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./
 
 RUN mkdir -p "run_artifacts"
 RUN mkdir -p "gcsfuse_data"
 
-ENTRYPOINT ["/bin/bash", "-c", "./setup_container.sh"]
+ENV PYTORCH_VERSION="v1_12"
+# Exec form does no variable substitution itself; bash -c expands ${PYTORCH_VERSION} from the ENV above at container start.
+ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]
diff --git a/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stop execution as soon as any command exits with a non-zero status.
+set -e
+
+cd "$HOME/github/gcsfuse/perfmetrics/scripts"
+
+echo "Setting up the machine with Docker and Nvidia Driver"
+# Driver version for A100 GPUs is 450.172.01
+DRIVER_VERSION="450.172.01"
+source ml_tests/setup_host.sh $DRIVER_VERSION
+
+PYTORCH_VERSION="v1_12"
+source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION
diff --git a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile
@@ -0,0 +1,46 @@
+# Copyright 2023 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# PyTorch 2.0 GPU base image with Python 3.10 (per the image name).
+FROM gcr.io/deeplearning-platform-release/pytorch-gpu.2-0.py310
+
+# Allow non-root users to specify the allow_other or allow_root mount options
+RUN echo "user_allow_other" > /etc/fuse.conf
+
+RUN pip3 install --no-cache-dir timm
+
+WORKDIR "/pytorch_dino/"
+
+RUN git clone "https://github.com/facebookresearch/dino"
+# (TulsiShah) TODO: The current docker image does not support the dino model with compile mode.
+#       We can unblock the below code whenever the docker image supports the same to run.
+
+# WORKDIR "/pytorch_dino/dino"
+# RUN echo '[remote "origin"]' >> .git/config
+# RUN echo '    fetch = +refs/pull/262/head:refs/remotes/origin/pr/262' >> .git/config
+#
+# RUN git fetch origin
+# RUN git diff origin/main origin/pr/262 > diff.patch
+# RUN git apply diff.patch
+#
+# WORKDIR "/pytorch_dino/"
+
+COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./
+
+RUN mkdir -p "run_artifacts"
+RUN mkdir -p "gcsfuse_data"
+
+ENV PYTORCH_VERSION="v2"
+
+ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]