diff --git a/integration_test/metadata/integration_metadata.go b/integration_test/metadata/integration_metadata.go
index 616cf4f7eb..f71d7ad5c4 100644
--- a/integration_test/metadata/integration_metadata.go
+++ b/integration_test/metadata/integration_metadata.go
@@ -95,6 +95,11 @@ type ExpectedMetricsContainer struct {
     ExpectedMetrics []*ExpectedMetric `yaml:"expected_metrics" validate:"onetrue=Representative,unique=Type,dive"`
 }
 
+type GpuPlatform struct {
+    Model     string   `yaml:"model" validate:"required"`
+    Platforms []string `yaml:"platforms" validate:"required"`
+}
+
 type IntegrationMetadata struct {
     PublicUrl                 string `yaml:"public_url"`
     AppUrl                    string `yaml:"app_url" validate:"required,url"`
@@ -109,7 +114,7 @@ type IntegrationMetadata struct {
     SupportedAppVersion       []string `yaml:"supported_app_version" validate:"required,unique,min=1"`
     SupportedOperatingSystems string `yaml:"supported_operating_systems" validate:"required,oneof=linux windows linux_and_windows"`
     PlatformsToSkip           []string `yaml:"platforms_to_skip"`
-    GpuModels                 []string `yaml:"gpu_models"`
+    GpuPlatforms              []GpuPlatform `yaml:"gpu_platforms" validate:"dive"`
     RestartAfterInstall       bool `yaml:"restart_after_install"`
     Troubleshoot              string `yaml:"troubleshoot" validate:"excludesall=‘’“”"`
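For illustration only (this snippet is not part of the change): the new `GpuPlatform` struct pairs a GPU model with the platforms its tests may run on. A minimal sketch of how a `gpu_platforms` block from a metadata.yaml maps onto it, assuming `gopkg.in/yaml.v3` as the YAML library — the `yaml` struct tags above are compatible with it, but the harness's actual unmarshaling code is not shown in this diff:

```go
// Illustrative only; mirrors the GpuPlatform struct added above.
package main

import (
	"fmt"

	"gopkg.in/yaml.v3" // assumed YAML library; the harness may use a different one
)

type GpuPlatform struct {
	Model     string   `yaml:"model"`
	Platforms []string `yaml:"platforms"`
}

type appMetadata struct {
	GpuPlatforms []GpuPlatform `yaml:"gpu_platforms"`
}

func main() {
	doc := []byte(`
gpu_platforms:
  - model: l4
    platforms:
      - ubuntu-2004-lts
      - ubuntu-2204-lts
`)
	var md appMetadata
	if err := yaml.Unmarshal(doc, &md); err != nil {
		panic(err)
	}
	for _, gp := range md.GpuPlatforms {
		// Prints: model l4 runs on [ubuntu-2004-lts ubuntu-2204-lts]
		fmt.Printf("model %s runs on %v\n", gp.Model, gp.Platforms)
	}
}
```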
diff --git a/integration_test/third_party_apps_data/applications/dcgm/centos_rhel/install b/integration_test/third_party_apps_data/applications/dcgm/centos_rhel/install
new file mode 100755
index 0000000000..60d2a22a2b
--- /dev/null
+++ b/integration_test/third_party_apps_data/applications/dcgm/centos_rhel/install
@@ -0,0 +1,38 @@
+set -e
+source /etc/os-release
+KERNEL_VERSION=`uname -r`
+sudo yum install -y kernel-devel-${KERNEL_VERSION} pciutils gcc make wget yum-utils
+
+# Install the driver the same way as the nvml app
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
+DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+case $DEVICE_CODE in
+  10de:102d)
+    # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
+    DRIVER_VERSION=470.82.01
+    ;;
+  *)
+    # Install the latest driver version
+    DRIVER_VERSION=535.104.05
+    ;;
+esac
+
+echo "Installing NVIDIA driver $DRIVER_VERSION"
+curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+
+# check NVIDIA driver installation succeeded
+nvidia-smi
+
+# Install DCGM
+VERSION_ID=${VERSION_ID%%.*}
+sudo yum-config-manager \
+    --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
+sudo yum clean expire-cache
+sudo yum install -y datacenter-gpu-manager
+sudo systemctl --now enable nvidia-dcgm
+
+# check DCGM service running and load profiling module
+dcgmi discovery --list
diff --git a/integration_test/third_party_apps_data/applications/dcgm/debian_ubuntu/install b/integration_test/third_party_apps_data/applications/dcgm/debian_ubuntu/install
index 82babc83dd..e19105b30c 100755
--- a/integration_test/third_party_apps_data/applications/dcgm/debian_ubuntu/install
+++ b/integration_test/third_party_apps_data/applications/dcgm/debian_ubuntu/install
@@ -1,37 +1,47 @@
 set -e
+source /etc/os-release
 sudo apt update
-kernel_version=`uname -r`
-sudo apt install -y linux-headers-${kernel_version} software-properties-common pciutils gcc make dkms
+KERNEL_VERSION=`uname -r`
+sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget
 
-BASE_URL=https://us.download.nvidia.com/tesla
+# Install CUDA and driver the same way as the nvml app
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
 DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
+# Need to add the keyring for installing CUDA and DCGM
+wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb
+sudo dpkg -i cuda-keyring_1.0-1_all.deb
 case $DEVICE_CODE in
   10de:102d)
-    # Install a specific version for NVIDIA Tesla K80
+    # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
     DRIVER_VERSION=470.82.01
+    CUDA_VERSION=11.4.4
+    echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
+    curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+    sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+    wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
+    sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
     ;;
   *)
-    DRIVER_VERSION=525.60.13
+    echo "Installing latest version of NVIDIA CUDA and driver"
+    if [[ $ID == debian ]]; then
+      sudo add-apt-repository contrib
+    fi
+    sudo apt update
+    sudo apt -y install cuda
     ;;
 esac
 
-echo "Installing NVIDIA driver version $DRIVER_VERSION"
-curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
-
-sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
 # check NVIDIA driver installation succeeded
 nvidia-smi
 
-sudo apt-get -y install wget
-
-wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
-sudo dpkg -i cuda-keyring_1.0-1_all.deb
-
+# Install DCGM
 sudo apt-get update
 sudo apt-get install -y datacenter-gpu-manager
-sudo service nvidia-dcgm start
+sudo systemctl --now enable nvidia-dcgm
 
 # check DCGM service running and load profiling module
 dcgmi discovery --list
-dcgmi profile --resume
diff --git a/integration_test/third_party_apps_data/applications/dcgm/metadata.yaml b/integration_test/third_party_apps_data/applications/dcgm/metadata.yaml
index 1da7ee3505..ac88ac1f5d 100644
--- a/integration_test/third_party_apps_data/applications/dcgm/metadata.yaml
+++ b/integration_test/third_party_apps_data/applications/dcgm/metadata.yaml
@@ -23,28 +23,25 @@ configure_integration: |-
   You must install DCGM and run the DCGM daemon service.
 supported_operating_systems: linux
 supported_app_version: ["3.1"]
-platforms_to_skip:
-  - centos-7
-  - centos-8
-  - debian-10
-  - debian-11
-  - debian-11-arm64
-  - debian-12
-  - rocky-linux-8
-  - rocky-linux-9
-  - rocky-linux-9-arm64
-  - sles-12
-  - sles-15
-  - sles-15-arm64
-  - ubuntu-2004-lts-arm64
-  - ubuntu-2204-lts
-  - ubuntu-2204-lts-arm64
-  - ubuntu-2304-amd64
-gpu_models: # p4, k80, p100 don't support DCGM profiling metrics
-  - a100
-  - v100
-  - t4
-  - l4
+gpu_platforms: # p4, k80, p100 don't emit DCGM profiling metrics
+  - model: a100
+    platforms:
+      - ubuntu-2004-lts
+  - model: v100
+    platforms:
+      - ubuntu-2004-lts
+  - model: t4
+    platforms:
+      - ubuntu-2004-lts
+  - model: l4
+    platforms:
+      - centos-7
+      - debian-11
+      - rocky-linux-8
+      - rocky-linux-9
+      - sles-15
+      - ubuntu-2004-lts
+      - ubuntu-2204-lts
 expected_metrics:
 - type: workload.googleapis.com/dcgm.gpu.profiling.sm_utilization
   value_type: DOUBLE
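For illustration only (not part of the change): the dcgm block above is the shape of input that the `required` and `dive` tags on `GpuPlatform` are meant to police, i.e. every entry needs a model and a non-empty platform list. A standalone sketch of that check, assuming `github.com/go-playground/validator/v10` because the repo's `validate` tag syntax matches that library; the harness's real validation entry point is not shown in this diff:

```go
// Illustrative only; the tags mirror the ones added in integration_metadata.go.
package main

import (
	"fmt"

	"github.com/go-playground/validator/v10" // assumed validator library
)

type GpuPlatform struct {
	Model     string   `validate:"required"`
	Platforms []string `validate:"required"`
}

type appMetadata struct {
	GpuPlatforms []GpuPlatform `validate:"dive"`
}

func main() {
	v := validator.New()

	ok := appMetadata{GpuPlatforms: []GpuPlatform{
		{Model: "l4", Platforms: []string{"ubuntu-2004-lts", "ubuntu-2204-lts"}},
	}}
	fmt.Println(v.Struct(ok)) // <nil>: model and platforms are both present

	bad := appMetadata{GpuPlatforms: []GpuPlatform{
		{Model: "a100"}, // no platforms listed
	}}
	fmt.Println(v.Struct(bad)) // reports a "required" failure on Platforms
}
```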
diff --git a/integration_test/third_party_apps_data/applications/dcgm/sles/install b/integration_test/third_party_apps_data/applications/dcgm/sles/install
new file mode 100755
index 0000000000..ce34367eba
--- /dev/null
+++ b/integration_test/third_party_apps_data/applications/dcgm/sles/install
@@ -0,0 +1,39 @@
+set -e
+
+sudo zypper --non-interactive install -y kernel-source pciutils gcc make wget
+
+# Install CUDA and driver the same way as the nvml app
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
+DEVICE_CODE=$(/sbin/lspci -n | grep -Po '10de:[\w\d]{4}')
+DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//')
+# Need to add the repo for installing CUDA and DCGM
+sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo
+sudo zypper --gpg-auto-import-keys --non-interactive refresh
+case $DEVICE_CODE in
+  10de:102d)
+    # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
+    DRIVER_VERSION=470.82.01
+    CUDA_VERSION=11.4.4
+    echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
+    curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+    sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+    wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
+    sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
+    ;;
+  *)
+    echo "Installing latest version of NVIDIA CUDA and driver"
+    sudo zypper --non-interactive install -y cuda
+    ;;
+esac
+
+# check NVIDIA driver installation succeeded
+nvidia-smi
+
+# Install DCGM
+sudo zypper --non-interactive install datacenter-gpu-manager
+sudo systemctl --now enable nvidia-dcgm
+
+# check DCGM service running and load profiling module
+dcgmi discovery --list
diff --git a/integration_test/third_party_apps_data/applications/nvml/centos_rhel/install b/integration_test/third_party_apps_data/applications/nvml/centos_rhel/install
new file mode 100755
index 0000000000..e801134228
--- /dev/null
+++ b/integration_test/third_party_apps_data/applications/nvml/centos_rhel/install
@@ -0,0 +1,31 @@
+set -e
+KERNEL_VERSION=`uname -r`
+sudo yum install -y kernel-devel-${KERNEL_VERSION} pciutils gcc make wget yum-utils
+
+# Install CUDA and driver together, since the `exercise` script needs to run a
+# CUDA sample app to generate GPU process metrics
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
+DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+case $DEVICE_CODE in
+  10de:102d)
+    # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
+    DRIVER_VERSION=470.82.01
+    CUDA_VERSION=11.4.4
+    ;;
+  *)
+    # Installing latest version of NVIDIA CUDA and driver
+    DRIVER_VERSION=535.104.05
+    CUDA_VERSION=12.2.2
+    ;;
+esac
+
+echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
+curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
+sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
+
+# check NVIDIA driver installation succeeded
+nvidia-smi
diff --git a/integration_test/third_party_apps_data/applications/nvml/debian_ubuntu/install b/integration_test/third_party_apps_data/applications/nvml/debian_ubuntu/install
index c14d7aa626..fb4fa45898 100755
--- a/integration_test/third_party_apps_data/applications/nvml/debian_ubuntu/install
+++ b/integration_test/third_party_apps_data/applications/nvml/debian_ubuntu/install
@@ -1,12 +1,17 @@
-# Installs the application
 set -e
+source /etc/os-release
 sudo apt update
-kernel_version=`uname -r`
-sudo apt install -y linux-headers-${kernel_version} software-properties-common pciutils gcc make dkms wget
+KERNEL_VERSION=`uname -r`
+sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget
 
-# Install CUDA and driver together, since the `exercise` script needs to run a CUDA sample app to generating GPU process metrics
+# Install CUDA and driver together, since the `exercise` script needs to run a
+# CUDA sample app to generate GPU process metrics
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
 DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
 case $DEVICE_CODE in
   10de:102d)
     # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
@@ -20,8 +25,11 @@ case $DEVICE_CODE in
     ;;
   *)
     echo "Installing latest version of NVIDIA CUDA and driver"
-    wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
+    wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb
     sudo dpkg -i cuda-keyring_1.0-1_all.deb
+    if [[ $ID == debian ]]; then
+      sudo add-apt-repository contrib
+    fi
     sudo apt update
     sudo apt -y install cuda
     ;;
diff --git a/integration_test/third_party_apps_data/applications/nvml/enable b/integration_test/third_party_apps_data/applications/nvml/enable
index 46f429e55b..9eb0a605a7 100755
--- a/integration_test/third_party_apps_data/applications/nvml/enable
+++ b/integration_test/third_party_apps_data/applications/nvml/enable
@@ -2,3 +2,6 @@
 # A back up of the existing file is required as part of the test framework check.
 sudo cp /etc/google-cloud-ops-agent/config.yaml /etc/google-cloud-ops-agent/config.yaml.bak
+
+# Sleep to wait for agent to fully start before running "exercise" script
+sleep 60
\ No newline at end of file
diff --git a/integration_test/third_party_apps_data/applications/nvml/metadata.yaml b/integration_test/third_party_apps_data/applications/nvml/metadata.yaml
index c3b3b2e5ab..d0d838a2ea 100644
--- a/integration_test/third_party_apps_data/applications/nvml/metadata.yaml
+++ b/integration_test/third_party_apps_data/applications/nvml/metadata.yaml
@@ -23,31 +23,35 @@ configure_integration: |-
   You must install the NVIDIA driver on a host with NVIDIA GPUs.
 supported_operating_systems: linux
 supported_app_version: ["515.65.01"]
-platforms_to_skip:
-  - centos-7
-  - centos-8
-  - debian-10
-  - debian-11
-  - debian-11-arm64
-  - debian-12
-  - rocky-linux-8
-  - rocky-linux-9
-  - rocky-linux-9-arm64
-  - sles-12
-  - sles-15
-  - sles-15-arm64
-  - ubuntu-2004-lts-arm64
-  - ubuntu-2204-lts
-  - ubuntu-2204-lts-arm64
-  - ubuntu-2304-amd64
-gpu_models:
-  - a100
-  - v100
-  - p4
-  - t4
-  - p100
-  - k80
-  - l4
+gpu_platforms:
+  - model: a100
+    platforms:
+      - ubuntu-2004-lts
+  - model: v100
+    platforms:
+      - ubuntu-2004-lts
+  - model: p4
+    platforms:
+      - ubuntu-2004-lts
+  - model: t4
+    platforms:
+      - ubuntu-2004-lts
+  - model: p100
+    platforms:
+      - ubuntu-2004-lts
+  - model: k80
+    platforms:
+      - ubuntu-2004-lts
+  - model: l4
+    platforms:
+      - centos-7
+      - debian-10
+      - debian-11
+      - rocky-linux-8
+      - rocky-linux-9
+      - sles-15
+      - ubuntu-2004-lts
+      - ubuntu-2204-lts
 expected_metrics:
 - type: agent.googleapis.com/gpu/utilization
   value_type: DOUBLE
diff --git a/integration_test/third_party_apps_data/applications/nvml/sles/install b/integration_test/third_party_apps_data/applications/nvml/sles/install
new file mode 100755
index 0000000000..857f56d99f
--- /dev/null
+++ b/integration_test/third_party_apps_data/applications/nvml/sles/install
@@ -0,0 +1,32 @@
+set -e
+
+sudo zypper --non-interactive install -y kernel-source pciutils gcc make wget
+
+# Install CUDA and driver together, since the `exercise` script needs to run a
+# CUDA sample app to generate GPU process metrics
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation; fall back to the runfile method if the package
+# manager's package is not working or not compatible with the GPU model
+DEVICE_CODE=$(/sbin/lspci -n | grep -Po '10de:[\w\d]{4}')
+DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//')
+case $DEVICE_CODE in
+  10de:102d)
+    # Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
+    DRIVER_VERSION=470.82.01
+    CUDA_VERSION=11.4.4
+    echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
+    curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+    sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
+    wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
+    sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
+    ;;
+  *)
+    echo "Installing latest version of NVIDIA CUDA and driver"
+    sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo
+    sudo zypper --gpg-auto-import-keys --non-interactive refresh
+    sudo zypper --non-interactive install -y cuda
+    ;;
+esac
+
+# check NVIDIA driver installation succeeded
+nvidia-smi
diff --git a/integration_test/third_party_apps_test.go b/integration_test/third_party_apps_test.go
index b634a657e2..e064f540fe 100644
--- a/integration_test/third_party_apps_test.go
+++ b/integration_test/third_party_apps_test.go
@@ -793,6 +793,7 @@ func determineImpactedApps(modifiedFiles []string, allApps map[string]metadata.I
 
 type accelerator struct {
     model         string
+    fullName      string
     machineType   string
     availableZone string
 }
@@ -822,37 +823,44 @@ var defaultApps = map[string]bool{
 var gpuModels = map[string]accelerator{
     // This is the A100 40G model; A100 80G is similar so skipping
     "a100": {
-        model:         "nvidia-tesla-a100",
+        model:         "a100",
+        fullName:      "nvidia-tesla-a100",
         machineType:   "a2-highgpu-1g",
         availableZone: "us-central1-a",
     },
     "v100": {
-        model:         "nvidia-tesla-v100",
+        model:         "v100",
+        fullName:      "nvidia-tesla-v100",
         machineType:   "n1-standard-2",
         availableZone: "us-central1-a",
     },
     "t4": {
-        model:         "nvidia-tesla-t4",
+        model:         "t4",
+        fullName:      "nvidia-tesla-t4",
         machineType:   "n1-standard-2",
         availableZone: "us-central1-a",
     },
     "p4": {
-        model:         "nvidia-tesla-p4",
+        model:         "p4",
+        fullName:      "nvidia-tesla-p4",
         machineType:   "n1-standard-2",
         availableZone: "us-central1-a",
     },
     "p100": {
-        model:         "nvidia-tesla-p100",
+        model:         "p100",
+        fullName:      "nvidia-tesla-p100",
         machineType:   "n1-standard-2",
         availableZone: "us-central1-c",
     },
     "k80": {
-        model:         "nvidia-tesla-k80",
+        model:         "k80",
+        fullName:      "nvidia-tesla-k80",
         machineType:   "n1-standard-2",
         availableZone: "us-central1-a",
     },
     "l4": {
-        model:         "nvidia-l4",
+        model:         "l4",
+        fullName:      "nvidia-l4",
         machineType:   "g2-standard-4",
         availableZone: "us-central1-a",
     },
@@ -902,6 +910,11 @@ func determineTestsToSkip(tests []test, impactedApps map[string]bool) {
         if metadata.SliceContains(test.metadata.PlatformsToSkip, test.platform) {
             tests[i].skipReason = "Skipping test due to 'platforms_to_skip' entry in metadata.yaml"
         }
+        for _, gpuPlatform := range test.metadata.GpuPlatforms {
+            if test.gpu != nil && test.gpu.model == gpuPlatform.Model && !metadata.SliceContains(gpuPlatform.Platforms, test.platform) {
+                tests[i].skipReason = "Skipping test due to 'gpu_platforms.platforms' entry in metadata.yaml"
+            }
+        }
         if reason := incompatibleOperatingSystem(test); reason != "" {
             tests[i].skipReason = reason
         }
@@ -927,10 +940,10 @@ func TestThirdPartyApps(t *testing.T) {
     for _, platform := range platforms {
         for app, metadata := range allApps {
-            if len(metadata.GpuModels) > 0 {
-                for _, gpuModel := range metadata.GpuModels {
-                    if gpu, ok := gpuModels[gpuModel]; !ok {
-                        t.Fatalf("invalid gpu model name %s", gpuModel)
+            if len(metadata.GpuPlatforms) > 0 {
+                for _, gpuPlatform := range metadata.GpuPlatforms {
+                    if gpu, ok := gpuModels[gpuPlatform.Model]; !ok {
+                        t.Fatalf("invalid gpu model name %s", gpuPlatform.Model)
                     } else {
                         tests = append(tests, test{platform: platform, gpu: &gpu, app: app, metadata: metadata, skipReason: ""})
                     }
@@ -950,7 +963,7 @@
         testName := tc.platform + "/" + tc.app
         if tc.gpu != nil {
-            testName = testName + "/" + tc.gpu.model
+            testName = testName + "/" + tc.gpu.fullName
         }
 
         t.Run(testName, func(t *testing.T) {
@@ -975,7 +988,7 @@
             if tc.gpu != nil {
                 options.ExtraCreateArguments = append(
                     options.ExtraCreateArguments,
-                    fmt.Sprintf("--accelerator=count=1,type=%s", tc.gpu.model),
+                    fmt.Sprintf("--accelerator=count=1,type=%s", tc.gpu.fullName),
                     "--maintenance-policy=TERMINATE")
                 options.ExtraCreateArguments = append(options.ExtraCreateArguments, "--boot-disk-size=100GB")
                 options.MachineType = tc.gpu.machineType
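For illustration only (not part of the change): taken together, the metadata and harness changes turn `gpu_platforms` into a per-model allowlist. A GPU test is still generated for every configured platform, and `determineTestsToSkip` then skips any combination whose GPU model has a `gpu_platforms` entry that does not list the test's platform. A standalone sketch of that rule with dcgm-like data; the type and function names here are illustrative, not the harness's:

```go
package main

import "fmt"

// gpuPlatform mirrors the shape of a gpu_platforms entry in metadata.yaml.
type gpuPlatform struct {
	Model     string
	Platforms []string
}

// skipGpuTest reports whether a test for the given GPU model and platform
// should be skipped under the allowlist semantics added in this change.
func skipGpuTest(model, platform string, gps []gpuPlatform) bool {
	for _, gp := range gps {
		if gp.Model == model && !contains(gp.Platforms, platform) {
			return true
		}
	}
	return false
}

func contains(s []string, v string) bool {
	for _, x := range s {
		if x == v {
			return true
		}
	}
	return false
}

func main() {
	dcgm := []gpuPlatform{
		{Model: "a100", Platforms: []string{"ubuntu-2004-lts"}},
		{Model: "l4", Platforms: []string{"ubuntu-2004-lts", "ubuntu-2204-lts"}},
	}
	fmt.Println(skipGpuTest("a100", "ubuntu-2204-lts", dcgm)) // true: a100 is only listed for ubuntu-2004-lts
	fmt.Println(skipGpuTest("l4", "ubuntu-2204-lts", dcgm))   // false: l4 allows ubuntu-2204-lts
}
```

The split of `model` and `fullName` in the `accelerator` struct supports this: the short name ("l4") is what metadata.yaml's `gpu_platforms` entries refer to, while the full accelerator type ("nvidia-l4") is what goes into the VM creation `--accelerator` argument and the test name.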