Add GPU tests for all distros on a single GPU model (#1378)

GoogleCloudPlatform · Oct 21, 2023 · f1bbc18 · f1bbc18
1 parent 2da4b4a
commit f1bbc18
Show file tree

Hide file tree

Showing 11 changed files with 262 additions and 82 deletions.
diff --git a/integration_test/metadata/integration_metadata.go b/integration_test/metadata/integration_metadata.go
@@ -95,6 +95,11 @@ type ExpectedMetricsContainer struct {
 	ExpectedMetrics []*ExpectedMetric `yaml:"expected_metrics" validate:"onetrue=Representative,unique=Type,dive"`
 }
 
+type GpuPlatform struct { // GpuPlatform is one entry of the `gpu_platforms` metadata list: a GPU model plus the platforms listed for it.
+	Model     string   `yaml:"model" validate:"required"`     // GPU model name from the `model` key (e.g. "l4"); required.
+	Platforms []string `yaml:"platforms" validate:"required"` // Platform names from the `platforms` key (e.g. "ubuntu-2004-lts"); required.
+}
+
 type IntegrationMetadata struct {
 	PublicUrl                    string                        `yaml:"public_url"`
 	AppUrl                       string                        `yaml:"app_url" validate:"required,url"`
@@ -109,7 +114,7 @@ type IntegrationMetadata struct {
 	SupportedAppVersion          []string                      `yaml:"supported_app_version" validate:"required,unique,min=1"`
 	SupportedOperatingSystems    string                        `yaml:"supported_operating_systems" validate:"required,oneof=linux windows linux_and_windows"`
 	PlatformsToSkip              []string                      `yaml:"platforms_to_skip"`
-	GpuModels                    []string                      `yaml:"gpu_models"`
+	GpuPlatforms                 []GpuPlatform                 `yaml:"gpu_platforms" validate:"dive"`
 	RestartAfterInstall          bool                          `yaml:"restart_after_install"`
 	Troubleshoot                 string                        `yaml:"troubleshoot" validate:"excludesall=‘’“”"`
 

diff --git a/integration_test/third_party_apps_data/applications/dcgm/centos_rhel/install b/integration_test/third_party_apps_data/applications/dcgm/centos_rhel/install
@@ -0,0 +1,38 @@
+set -e
+source /etc/os-release
+KERNEL_VERSION=`uname -r`
+sudo yum install -y kernel-devel-${KERNEL_VERSION} pciutils gcc make wget yum-utils 
+
+# Install the driver the same way as the nvml app 
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
+DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+case $DEVICE_CODE in
+    10de:102d)
+        # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
+        DRIVER_VERSION=470.82.01
+        ;;
+    *)
+        # Installing latest version of NVIDIA CUDA and driver
+        DRIVER_VERSION=535.104.05
+        ;;
+esac
+
+echo "Installing NVIDIA driver $DRIVER_VERSION"
+curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+
+# check NVIDIA driver installation succeeded
+nvidia-smi
+
+# Install DCGM
+VERSION_ID=${VERSION_ID%%.*}
+sudo yum-config-manager \
+    --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
+sudo yum clean expire-cache
+sudo yum install -y datacenter-gpu-manager
+sudo systemctl --now enable nvidia-dcgm
+
+# check DCGM service running and load profiling module
+dcgmi discovery --list
diff --git a/integration_test/third_party_apps_data/applications/dcgm/debian_ubuntu/install b/integration_test/third_party_apps_data/applications/dcgm/debian_ubuntu/install
@@ -1,37 +1,47 @@
 set -e
+source /etc/os-release
 
 sudo apt update
-kernel_version=`uname -r`
-sudo apt install -y linux-headers-${kernel_version} software-properties-common pciutils gcc make dkms
+KERNEL_VERSION=`uname -r`
+sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget
 
-BASE_URL=https://us.download.nvidia.com/tesla
+# Install CUDA and driver the same way as the nvml app 
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
 DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
+# Need to add the keyring for installing CUDA and DCGM
+wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb
+sudo dpkg -i cuda-keyring_1.0-1_all.deb
 case $DEVICE_CODE in
     10de:102d)
-        # Install a specific version for NVIDIA Tesla K80
+        # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
         DRIVER_VERSION=470.82.01
+        CUDA_VERSION=11.4.4
+        echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
+        curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+        sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+        wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
+        sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
         ;;
     *)
-        DRIVER_VERSION=525.60.13
+        echo "Installing latest version of NVIDIA CUDA and driver"
+        if [[ $ID == debian ]]; then
+            sudo add-apt-repository contrib
+        fi
+        sudo apt update
+        sudo apt -y install cuda 
         ;;
 esac
-echo "Installing NVIDIA driver version $DRIVER_VERSION"
-curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
-
-sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
 
 # check NVIDIA driver installation succeeded
 nvidia-smi
 
-sudo apt-get -y install wget
-
-wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
-sudo dpkg -i cuda-keyring_1.0-1_all.deb
-
+# Install DCGM
 sudo apt-get update
 sudo apt-get install -y datacenter-gpu-manager
-sudo service nvidia-dcgm start
+sudo systemctl --now enable nvidia-dcgm
 
 # check DCGM service running and load profiling module
 dcgmi discovery --list
-dcgmi profile --resume
diff --git a/integration_test/third_party_apps_data/applications/dcgm/metadata.yaml b/integration_test/third_party_apps_data/applications/dcgm/metadata.yaml
@@ -23,28 +23,25 @@ configure_integration: |-
   You must install DCGM and run the DCGM daemon service. 
 supported_operating_systems: linux
 supported_app_version: ["3.1"]
-platforms_to_skip:
-  - centos-7
-  - centos-8
-  - debian-10
-  - debian-11
-  - debian-11-arm64
-  - debian-12
-  - rocky-linux-8
-  - rocky-linux-9
-  - rocky-linux-9-arm64
-  - sles-12
-  - sles-15
-  - sles-15-arm64
-  - ubuntu-2004-lts-arm64
-  - ubuntu-2204-lts
-  - ubuntu-2204-lts-arm64
-  - ubuntu-2304-amd64
-gpu_models: # p4, k80, p100 don't support DCGM profiling metrics
-  - a100
-  - v100
-  - t4
-  - l4
+gpu_platforms: # p4, k80, p100 don't emit DCGM profiling metrics
+  - model: a100
+    platforms: 
+      - ubuntu-2004-lts
+  - model: v100
+    platforms: 
+      - ubuntu-2004-lts
+  - model: t4
+    platforms: 
+      - ubuntu-2004-lts
+  - model: l4
+    platforms: 
+      - centos-7
+      - debian-11
+      - rocky-linux-8
+      - rocky-linux-9
+      - sles-15
+      - ubuntu-2004-lts
+      - ubuntu-2204-lts
 expected_metrics:
   - type: workload.googleapis.com/dcgm.gpu.profiling.sm_utilization
     value_type: DOUBLE

diff --git a/integration_test/third_party_apps_data/applications/dcgm/sles/install b/integration_test/third_party_apps_data/applications/dcgm/sles/install
@@ -0,0 +1,39 @@
+set -e
+
+sudo zypper --non-interactive install -y kernel-source pciutils gcc make wget
+
+# Install CUDA and driver the same way as the nvml app 
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
+DEVICE_CODE=$(/sbin/lspci -n | grep -Po '10de:[\w\d]{4}')
+DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//')
+# Need to add the repo for installing CUDA and DCGM
+sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo
+sudo zypper --gpg-auto-import-keys --non-interactive refresh
+case $DEVICE_CODE in
+    10de:102d)
+        # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
+        DRIVER_VERSION=470.82.01
+        CUDA_VERSION=11.4.4
+        echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
+        curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+        sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+        wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
+        sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
+        ;;
+    *)
+        echo "Installing latest version of NVIDIA CUDA and driver"
+        sudo zypper --non-interactive install -y cuda
+        ;;
+esac
+
+# check NVIDIA driver installation succeeded
+nvidia-smi
+
+# Install DCGM
+sudo zypper --non-interactive install datacenter-gpu-manager
+sudo systemctl --now enable nvidia-dcgm
+
+# check DCGM service running and load profiling module
+dcgmi discovery --list
diff --git a/integration_test/third_party_apps_data/applications/nvml/centos_rhel/install b/integration_test/third_party_apps_data/applications/nvml/centos_rhel/install
@@ -0,0 +1,31 @@
+set -e
+KERNEL_VERSION=`uname -r`
+sudo yum install -y kernel-devel-${KERNEL_VERSION} pciutils gcc make wget yum-utils 
+
+# Install CUDA and driver together, since the `exercise` script needs to run a
+# CUDA sample app to generate GPU process metrics
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
+DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+case $DEVICE_CODE in
+    10de:102d)
+        # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
+        DRIVER_VERSION=470.82.01
+        CUDA_VERSION=11.4.4
+        ;;
+    *)
+        # Installing latest version of NVIDIA CUDA and driver
+        DRIVER_VERSION=535.104.05
+        CUDA_VERSION=12.2.2
+        ;;
+esac
+
+echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
+curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
+sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
+
+# check NVIDIA driver installation succeeded
+nvidia-smi
diff --git a/integration_test/third_party_apps_data/applications/nvml/debian_ubuntu/install b/integration_test/third_party_apps_data/applications/nvml/debian_ubuntu/install
@@ -1,12 +1,17 @@
-# Installs the application
 set -e
+source /etc/os-release
 
 sudo apt update
-kernel_version=`uname -r`
-sudo apt install -y linux-headers-${kernel_version} software-properties-common pciutils gcc make dkms wget
+KERNEL_VERSION=`uname -r`
+sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget
 
-# Install CUDA and driver together, since the `exercise` script needs to run a CUDA sample app to generating GPU process metrics
+# Install CUDA and driver together, since the `exercise` script needs to run a
+# CUDA sample app to generate GPU process metrics
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
 DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
 case $DEVICE_CODE in
     10de:102d)
         # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
@@ -20,8 +25,11 @@ case $DEVICE_CODE in
         ;;
     *)
         echo "Installing latest version of NVIDIA CUDA and driver"
-        wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
+        wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb
         sudo dpkg -i cuda-keyring_1.0-1_all.deb
+        if [[ $ID == debian ]]; then
+            sudo add-apt-repository contrib
+        fi
         sudo apt update
         sudo apt -y install cuda 
         ;;

diff --git a/integration_test/third_party_apps_data/applications/nvml/enable b/integration_test/third_party_apps_data/applications/nvml/enable
@@ -2,3 +2,6 @@
 
 # A back up of the existing file is required as part of the test framework check.
 sudo cp /etc/google-cloud-ops-agent/config.yaml /etc/google-cloud-ops-agent/config.yaml.bak
+
+# Sleep to wait for agent to fully start before running "exercise" script
+sleep 60
diff --git a/integration_test/third_party_apps_data/applications/nvml/metadata.yaml b/integration_test/third_party_apps_data/applications/nvml/metadata.yaml
@@ -23,31 +23,35 @@ configure_integration: |-
   You must install the NVIDIA driver on a host with NVIDIA GPUs. 
 supported_operating_systems: linux
 supported_app_version: ["515.65.01"]
-platforms_to_skip:
-  - centos-7
-  - centos-8
-  - debian-10
-  - debian-11
-  - debian-11-arm64
-  - debian-12
-  - rocky-linux-8
-  - rocky-linux-9
-  - rocky-linux-9-arm64
-  - sles-12
-  - sles-15
-  - sles-15-arm64
-  - ubuntu-2004-lts-arm64
-  - ubuntu-2204-lts
-  - ubuntu-2204-lts-arm64
-  - ubuntu-2304-amd64
-gpu_models:
-  - a100
-  - v100
-  - p4
-  - t4
-  - p100
-  - k80
-  - l4
+gpu_platforms:
+  - model: a100
+    platforms: 
+      - ubuntu-2004-lts
+  - model: v100
+    platforms: 
+      - ubuntu-2004-lts
+  - model: p4
+    platforms: 
+      - ubuntu-2004-lts
+  - model: t4
+    platforms: 
+      - ubuntu-2004-lts
+  - model: p100
+    platforms: 
+      - ubuntu-2004-lts
+  - model: k80
+    platforms: 
+      - ubuntu-2004-lts
+  - model: l4
+    platforms: 
+      - centos-7
+      - debian-10
+      - debian-11
+      - rocky-linux-8
+      - rocky-linux-9
+      - sles-15
+      - ubuntu-2004-lts
+      - ubuntu-2204-lts
 expected_metrics:
   - type: agent.googleapis.com/gpu/utilization
     value_type: DOUBLE

diff --git a/integration_test/third_party_apps_data/applications/nvml/sles/install b/integration_test/third_party_apps_data/applications/nvml/sles/install
@@ -0,0 +1,32 @@
+set -e
+
+sudo zypper --non-interactive install -y kernel-source pciutils gcc make wget
+
+# Install CUDA and driver together, since the `exercise` script needs to run a
+# CUDA sample app to generate GPU process metrics
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
+DEVICE_CODE=$(/sbin/lspci -n | grep -Po '10de:[\w\d]{4}')
+DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//')
+case $DEVICE_CODE in
+    10de:102d)
+        # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
+        DRIVER_VERSION=470.82.01
+        CUDA_VERSION=11.4.4
+        echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
+        curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+        sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+        wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
+        sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
+        ;;
+    *)
+        echo "Installing latest version of NVIDIA CUDA and driver"
+        sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo
+        sudo zypper --gpg-auto-import-keys --non-interactive refresh
+        sudo zypper --non-interactive install -y cuda
+        ;;
+esac
+
+# check NVIDIA driver installation succeeded
+nvidia-smi