Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updating NVIDIA A100 GPU machine for pytorch2.0 #1610

Merged
merged 6 commits into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
# Stop execution as soon as any command exits with a non-zero status.
set -e

VM_NAME="pytorch2-dino-7d"
ZONE_NAME="us-west1-a"
VM_NAME="pytorch2-dino-7d-a100-gpu"
ZONE_NAME="asia-northeast1-a"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino"
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v2"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ ARTIFACTS_BUCKET_PATH=$3
TEST_SCRIPT_PATH=$4
# pytorch version
PYTORCH_VERSION=$5
MACHINE_TYPE="a2-highgpu-2g"
ACCELERATOR="count=2,type=nvidia-tesla-a100"
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus"

function initialize_ssh_key () {
Expand Down Expand Up @@ -64,24 +62,22 @@ function delete_existing_vm_and_create_new () {
# PyTorch v2 uses the same NVIDIA A100 machine type (a2-highgpu-2g) but draws from its own reservation.
if [ $PYTORCH_VERSION == "v2" ];
then
MACHINE_TYPE="g2-standard-24"
ACCELERATOR="count=2,type=nvidia-l4"
RESERVATION="projects/$GCP_PROJECT/reservations/pytorch2-ai-ml-tests"
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-pytorch2-2gpu"
fi

echo "Creating VM $VM_NAME in zone $ZONE_NAME"
# The below command creates VM using the reservation 'ai-ml-tests'
sudo gcloud compute instances create $VM_NAME \
--project=$GCP_PROJECT\
--zone=$ZONE_NAME \
--machine-type=$MACHINE_TYPE \
--machine-type=a2-highgpu-2g\
--network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \
--metadata=enable-osconfig=TRUE,enable-oslogin=true \
--maintenance-policy=TERMINATE \
--provisioning-model=STANDARD \
--service-account=927584127901-compute@developer.gserviceaccount.com \
--scopes=https://www.googleapis.com/auth/cloud-platform \
--accelerator=$ACCELERATOR \
--accelerator=count=2,type=nvidia-tesla-a100 \
--create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20231213,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \
--no-shielded-secure-boot \
--shielded-vtpm \
Expand Down
14 changes: 8 additions & 6 deletions perfmetrics/scripts/ml_tests/pytorch/run_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

PYTORCH_VESRION=$1
NUM_EPOCHS=80
TEST_BUCKET="gcsfuse-ml-data"

# Install golang
wget -O go_tar.tar.gz https://go.dev/dl/go1.21.5.linux-amd64.tar.gz -q
Expand All @@ -30,6 +31,12 @@ cd -
# Create a directory for gcsfuse logs
mkdir run_artifacts/gcsfuse_logs

# The PyTorch 2.0 VM runs in asia-northeast1, so use the test bucket created in that same region.
if [ ${PYTORCH_VESRION} == "v2" ];
then
TEST_BUCKET="gcsfuse-ml-data-asia-northeast1"
fi

echo "Mounting GCSFuse..."
echo "logging:
file-path: run_artifacts/gcsfuse.log
Expand All @@ -47,7 +54,7 @@ nohup /pytorch_dino/gcsfuse/gcsfuse --foreground --type-cache-ttl=1728000s \
--implicit-dirs \
--max-conns-per-host=100 \
--config-file /tmp/gcsfuse_config.yaml \
gcsfuse-ml-data gcsfuse_data > "run_artifacts/gcsfuse.out" 2> "run_artifacts/gcsfuse.err" &
$TEST_BUCKET gcsfuse_data > "run_artifacts/gcsfuse.out" 2> "run_artifacts/gcsfuse.err" &

# Update the pytorch library code to bypass the kernel-cache
echo "Updating the pytorch library code to bypass the kernel-cache..."
Expand Down Expand Up @@ -76,13 +83,8 @@ python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'
# TODO(TulsiShah): PyTorch 2.0 compile mode has issues (https://github.com/pytorch/pytorch/issues/94599)
# that are fixed in PyTorch 2.1.0 (https://github.com/pytorch/pytorch/pull/100071).
# Remove this workaround once our Docker image is updated to PyTorch 2.1.0 or later.
# Reducing the epochs as pytorch2 long haul tests are running on NVIDIA L4 machines, which lack the powerful GPU of
# the NVIDIA A100. So it is taking longer time to complete the training. We will set it back to 80 when the NVIDIA A100 GPU machine
# will be available.
if [ ${PYTORCH_VESRION} == "v2" ];
then
NUM_EPOCHS=36

allowed_functions_file="/opt/conda/lib/python3.10/site-packages/torch/_dynamo/allowed_functions.py"
# Update the pytorch library code (allowed_functions.py) to disallow the distributed API in graph
echo "Updating the pytorch library code to Disallow_in_graph distributed API.."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ set -e
cd "$HOME/github/gcsfuse/perfmetrics/scripts"

echo "Setting up the machine with Docker and Nvidia Driver"
# Driver version for L4 GPUs is 525.60.13
DRIVER_VERSION="525.60.13"
DRIVER_VERSION="520.61.05"
source ml_tests/setup_host.sh $DRIVER_VERSION

PYTORCH_VERSION="v2"
Expand Down