Skip to content

Commit

Permalink
Updating NVIDIA A100 GPU machine for pytorch2.0 (#1610)
Browse files Browse the repository at this point in the history
* adding nvidia a100 gpu machine

* adding nvidia a100 gpu machine

* testing changes

* testing changes

* undo testing changes

* changing bucket location
  • Loading branch information
Tulsishah authored and ashmeenkaur committed Jan 29, 2024
1 parent 4bc1607 commit 6664f49
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
# Stop execution as soon as any command exits with a non-zero status.
set -e

VM_NAME="pytorch2-dino-7d"
ZONE_NAME="us-west1-a"
VM_NAME="pytorch2-dino-7d-a100-gpu"
ZONE_NAME="asia-northeast1-a"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino"
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v2"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ ARTIFACTS_BUCKET_PATH=$3
TEST_SCRIPT_PATH=$4
# pytorch version
PYTORCH_VERSION=$5
MACHINE_TYPE="a2-highgpu-2g"
ACCELERATOR="count=2,type=nvidia-tesla-a100"
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus"

function initialize_ssh_key () {
Expand Down Expand Up @@ -64,24 +62,22 @@ function delete_existing_vm_and_create_new () {
# Create NVIDIA L4 machines which are available on us-west1-1 zone.
if [ $PYTORCH_VERSION == "v2" ];
then
MACHINE_TYPE="g2-standard-24"
ACCELERATOR="count=2,type=nvidia-l4"
RESERVATION="projects/$GCP_PROJECT/reservations/pytorch2-ai-ml-tests"
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-pytorch2-2gpu"
fi

echo "Creating VM $VM_NAME in zone $ZONE_NAME"
# The below command creates VM using the reservation 'ai-ml-tests'
sudo gcloud compute instances create $VM_NAME \
--project=$GCP_PROJECT\
--zone=$ZONE_NAME \
--machine-type=$MACHINE_TYPE \
--machine-type=a2-highgpu-2g\
--network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \
--metadata=enable-osconfig=TRUE,enable-oslogin=true \
--maintenance-policy=TERMINATE \
--provisioning-model=STANDARD \
--service-account=927584127901-compute@developer.gserviceaccount.com \
--scopes=https://www.googleapis.com/auth/cloud-platform \
--accelerator=$ACCELERATOR \
--accelerator=count=2,type=nvidia-tesla-a100 \
--create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20231213,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \
--no-shielded-secure-boot \
--shielded-vtpm \
Expand Down
19 changes: 11 additions & 8 deletions perfmetrics/scripts/ml_tests/pytorch/run_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@

PYTORCH_VESRION=$1
NUM_EPOCHS=80
TEST_BUCKET="gcsfuse-ml-data"

# Install golang
wget -O go_tar.tar.gz https://go.dev/dl/go1.21.5.linux-amd64.tar.gz -q
wget -O go_tar.tar.gz https://go.dev/dl/go1.21.6.linux-amd64.tar.gz -q
rm -rf /usr/local/go && tar -C /usr/local -xzf go_tar.tar.gz
export PATH=$PATH:/usr/local/go/bin

Expand All @@ -30,6 +31,12 @@ cd -
# Create a directory for gcsfuse logs
mkdir run_artifacts/gcsfuse_logs

# We have created a bucket in the asia-northeast1 region to align with the location of our PyTorch 2.0 VM, which is also in asia-northeast1.
if [ ${PYTORCH_VESRION} == "v2" ];
then
TEST_BUCKET="gcsfuse-ml-data-asia-northeast1"
fi

config_filename=/tmp/gcsfuse_config.yaml
cat > $config_filename << EOF
logging:
Expand All @@ -46,13 +53,14 @@ EOF
echo "Created config-file at "$config_filename

echo "Mounting GCSFuse..."
nohup /pytorch_dino/gcsfuse/gcsfuse --foreground \
nohup /pytorch_dino/gcsfuse/gcsfuse --foreground --type-cache-ttl=1728000s \
--stat-cache-ttl=1728000s \
--stat-cache-capacity=1320000 \
--stackdriver-export-interval=60s \
--implicit-dirs \
--max-conns-per-host=100 \
--config-file $config_filename \
gcsfuse-ml-data gcsfuse_data > "run_artifacts/gcsfuse.out" 2> "run_artifacts/gcsfuse.err" &
$TEST_BUCKET gcsfuse_data > "run_artifacts/gcsfuse.out" 2> "run_artifacts/gcsfuse.err" &

# Update the pytorch library code to bypass the kernel-cache
echo "Updating the pytorch library code to bypass the kernel-cache..."
Expand Down Expand Up @@ -81,13 +89,8 @@ python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'
# (TulsiShah) TODO: Pytorch 2.0 compile mode has issues (https://github.com/pytorch/pytorch/issues/94599),
# which is fixed in pytorch version 2.1.0 (https://github.com/pytorch/pytorch/pull/100071).
# We'll remove this workaround once we update our Docker image to use Pytorch 2.1.0 or greater version.
# Reducing the epochs as pytorch2 long haul tests are running on NVIDIA L4 machines, which lack the powerful GPU of
# the NVIDIA A100. So it is taking longer time to complete the training. We will set it back to 80 when the NVIDIA A100 GPU machine
# will be available.
if [ ${PYTORCH_VESRION} == "v2" ];
then
NUM_EPOCHS=36

allowed_functions_file="/opt/conda/lib/python3.10/site-packages/torch/_dynamo/allowed_functions.py"
# Update the pytorch library code to disallow the distributed API in graph (disallow_in_graph)
echo "Updating the pytorch library code to Disallow_in_graph distributed API.."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ set -e
cd "$HOME/github/gcsfuse/perfmetrics/scripts"

echo "Setting up the machine with Docker and Nvidia Driver"
# Driver version for L4 GPUs is 525.60.13
DRIVER_VERSION="525.60.13"
DRIVER_VERSION="520.61.05"
source ml_tests/setup_host.sh $DRIVER_VERSION

PYTORCH_VERSION="v2"
Expand Down

0 comments on commit 6664f49

Please sign in to comment.