Updating NVIDIA A100 GPU machine for pytorch2.0 (#1610)

* adding nvidia a100 gpu machine * adding nvidia a100 gpu machine * testing changes * testing changes * undo testing changes * changing bucket location
GoogleCloudPlatform · Jan 29, 2024 · 6664f49 · 6664f49
1 parent 4bc1607
commit 6664f49
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 19 deletions.
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh
@@ -16,8 +16,8 @@
 # This will stop execution when any command will have non-zero status.
 set -e
 
-VM_NAME="pytorch2-dino-7d"
-ZONE_NAME="us-west1-a"
+VM_NAME="pytorch2-dino-7d-a100-gpu"
+ZONE_NAME="asia-northeast1-a"
 ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino"
 TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
 PYTORCH_VERSION="v2"

diff --git a/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh b/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh
@@ -29,8 +29,6 @@ ARTIFACTS_BUCKET_PATH=$3
 TEST_SCRIPT_PATH=$4
 # pytorch version
 PYTORCH_VERSION=$5
-MACHINE_TYPE="a2-highgpu-2g"
-ACCELERATOR="count=2,type=nvidia-tesla-a100"
 RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus"
 
 function initialize_ssh_key () {
@@ -64,24 +62,22 @@ function delete_existing_vm_and_create_new () {
   # Create NVIDIA L4 machines which are available on us-west1-1 zone.
   if [ $PYTORCH_VERSION == "v2" ];
   then
-    MACHINE_TYPE="g2-standard-24"
-    ACCELERATOR="count=2,type=nvidia-l4"
-    RESERVATION="projects/$GCP_PROJECT/reservations/pytorch2-ai-ml-tests"
+    RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-pytorch2-2gpu"
   fi
 
   echo "Creating VM $VM_NAME in zone $ZONE_NAME"
   # The below command creates VM using the reservation 'ai-ml-tests'
   sudo gcloud compute instances create $VM_NAME \
           --project=$GCP_PROJECT\
           --zone=$ZONE_NAME \
-          --machine-type=$MACHINE_TYPE \
+          --machine-type=a2-highgpu-2g\
           --network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \
           --metadata=enable-osconfig=TRUE,enable-oslogin=true \
           --maintenance-policy=TERMINATE \
           --provisioning-model=STANDARD \
           --service-account=927584127901-compute@developer.gserviceaccount.com \
           --scopes=https://www.googleapis.com/auth/cloud-platform \
-          --accelerator=$ACCELERATOR \
+          --accelerator=count=2,type=nvidia-tesla-a100 \
           --create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20231213,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \
           --no-shielded-secure-boot \
           --shielded-vtpm \

diff --git a/perfmetrics/scripts/ml_tests/pytorch/run_model.sh b/perfmetrics/scripts/ml_tests/pytorch/run_model.sh
@@ -15,9 +15,10 @@
 
 PYTORCH_VESRION=$1
 NUM_EPOCHS=80
+TEST_BUCKET="gcsfuse-ml-data"
 
 # Install golang
-wget -O go_tar.tar.gz https://go.dev/dl/go1.21.5.linux-amd64.tar.gz -q
+wget -O go_tar.tar.gz https://go.dev/dl/go1.21.6.linux-amd64.tar.gz -q
 rm -rf /usr/local/go && tar -C /usr/local -xzf go_tar.tar.gz
 export PATH=$PATH:/usr/local/go/bin
 
@@ -30,6 +31,12 @@ cd -
 # Create a directory for gcsfuse logs
 mkdir  run_artifacts/gcsfuse_logs
 
+# We have created a bucket in the asia-northeast1 region to align with the location of our PyTorch 2.0 VM, which is also in asia-northeast1.
+if [ ${PYTORCH_VESRION} == "v2" ];
+then
+  TEST_BUCKET="gcsfuse-ml-data-asia-northeast1"
+fi
+
 config_filename=/tmp/gcsfuse_config.yaml
 cat > $config_filename << EOF
 logging:
@@ -46,13 +53,14 @@ EOF
 echo "Created config-file at "$config_filename
 
 echo "Mounting GCSFuse..."
-nohup /pytorch_dino/gcsfuse/gcsfuse --foreground \
+nohup /pytorch_dino/gcsfuse/gcsfuse --foreground --type-cache-ttl=1728000s \
+        --stat-cache-ttl=1728000s \
         --stat-cache-capacity=1320000 \
         --stackdriver-export-interval=60s \
         --implicit-dirs \
         --max-conns-per-host=100 \
         --config-file $config_filename \
-       gcsfuse-ml-data gcsfuse_data > "run_artifacts/gcsfuse.out" 2> "run_artifacts/gcsfuse.err" &
+      $TEST_BUCKET gcsfuse_data > "run_artifacts/gcsfuse.out" 2> "run_artifacts/gcsfuse.err" &
 
 # Update the pytorch library code to bypass the kernel-cache
 echo "Updating the pytorch library code to bypass the kernel-cache..."
@@ -81,13 +89,8 @@ python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'
 # (TulsiShah) TODO: Pytorch 2.0 compile mode has issues (https://github.com/pytorch/pytorch/issues/94599),
 # which is fixed in pytorch version 2.1.0 (https://github.com/pytorch/pytorch/pull/100071).
 # We'll remove this workaround once we update our Docker image to use Pytorch 2.1.0 or greater version.
-# Reducing the epochs as pytorch2 long haul tests are running on NVIDIA L4 machines, which lack the powerful GPU of
-# the NVIDIA A100. So it is taking longer time to complete the training. We will set it back to 80 when the NVIDIA A100 GPU machine
-# will be available.
 if [ ${PYTORCH_VESRION} == "v2" ];
 then
-  NUM_EPOCHS=36
-
   allowed_functions_file="/opt/conda/lib/python3.10/site-packages/torch/_dynamo/allowed_functions.py"
   # Update the pytorch library code to bypass the kernel-cache
   echo "Updating the pytorch library code to Disallow_in_graph distributed API.."

diff --git a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh
@@ -19,8 +19,7 @@ set -e
 cd "$HOME/github/gcsfuse/perfmetrics/scripts"
 
 echo "Setting up the machine with Docker and Nvidia Driver"
-# Driver version for L4 GPUs is 525.60.13
-DRIVER_VERSION="525.60.13"
+DRIVER_VERSION="520.61.05"
 source ml_tests/setup_host.sh $DRIVER_VERSION
 
 PYTORCH_VERSION="v2"