Skip to content

Commit

Permalink
Add GPU tests for all distros on a single GPU model (#1378)
Browse files Browse the repository at this point in the history
  • Loading branch information
LujieDuan committed Oct 21, 2023
1 parent 2da4b4a commit f1bbc18
Show file tree
Hide file tree
Showing 11 changed files with 262 additions and 82 deletions.
7 changes: 6 additions & 1 deletion integration_test/metadata/integration_metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ type ExpectedMetricsContainer struct {
ExpectedMetrics []*ExpectedMetric `yaml:"expected_metrics" validate:"onetrue=Representative,unique=Type,dive"`
}

// GpuPlatform names a GPU model together with a list of platforms for that
// model. It is the element type of IntegrationMetadata.GpuPlatforms, which is
// read from the `gpu_platforms` YAML key. Both fields are marked required for
// validation.
type GpuPlatform struct {
// Model is the GPU model name, read from the `model` YAML key.
Model string `yaml:"model" validate:"required"`
// Platforms is the list read from the `platforms` YAML key.
// NOTE(review): presumably the OS platforms on which the integration is
// tested with this GPU model — confirm against the test runner.
Platforms []string `yaml:"platforms" validate:"required"`
}

type IntegrationMetadata struct {
PublicUrl string `yaml:"public_url"`
AppUrl string `yaml:"app_url" validate:"required,url"`
Expand All @@ -109,7 +114,7 @@ type IntegrationMetadata struct {
SupportedAppVersion []string `yaml:"supported_app_version" validate:"required,unique,min=1"`
SupportedOperatingSystems string `yaml:"supported_operating_systems" validate:"required,oneof=linux windows linux_and_windows"`
PlatformsToSkip []string `yaml:"platforms_to_skip"`
GpuModels []string `yaml:"gpu_models"`
GpuPlatforms []GpuPlatform `yaml:"gpu_platforms" validate:"dive"`
RestartAfterInstall bool `yaml:"restart_after_install"`
Troubleshoot string `yaml:"troubleshoot" validate:"excludesall=‘’“”"`

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Installs the NVIDIA driver (from NVIDIA's runfile installer) and DCGM (from
# NVIDIA's yum repository), then checks that both respond.
set -e
source /etc/os-release
KERNEL_VERSION=`uname -r`
sudo yum install -y kernel-devel-${KERNEL_VERSION} pciutils gcc make wget yum-utils

# Install the driver the same way as the nvml app
# Prefer to install from the package manager since it is normally faster and has
# fewer errors on installation; fall back to the runfile method if the package
# manager's package is not working or not compatible with the GPU model
# This script uses the runfile method for the driver.
DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
case $DEVICE_CODE in
10de:102d)
# Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
DRIVER_VERSION=470.82.01
;;
*)
# Driver version used for every other GPU model
DRIVER_VERSION=535.104.05
;;
esac

# Only the driver is installed here; CUDA_VERSION is never set in this script,
# so the message must not reference it.
echo "Installing NVIDIA driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent

# check NVIDIA driver installation succeeded
nvidia-smi

# Install DCGM
# Keep only the major version (text before the first dot) for the repo path
VERSION_ID=${VERSION_ID%%.*}
sudo yum-config-manager \
--add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
sudo yum clean expire-cache
sudo yum install -y datacenter-gpu-manager
sudo systemctl --now enable nvidia-dcgm

# check DCGM service running and load profiling module
dcgmi discovery --list
Original file line number Diff line number Diff line change
@@ -1,37 +1,47 @@
set -e
source /etc/os-release

sudo apt update
kernel_version=`uname -r`
sudo apt install -y linux-headers-${kernel_version} software-properties-common pciutils gcc make dkms
KERNEL_VERSION=`uname -r`
sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget

BASE_URL=https://us.download.nvidia.com/tesla
# Install CUDA and driver the same way as the nvml app
# Prefer to install from the package manager since it is normally faster and has
# fewer errors on installation; fall back to the runfile method if the package
# manager's package is not working or not compatible with the GPU model
DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
# Need to add the keyring for installing CUDA and DCGM
wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb
case $DEVICE_CODE in
10de:102d)
# Install a specific version for NVIDIA Tesla K80
# Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
DRIVER_VERSION=470.82.01
CUDA_VERSION=11.4.4
echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
wget --no-verbose https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run
sudo sh cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run --toolkit --silent
;;
*)
DRIVER_VERSION=525.60.13
echo "Installing latest version of NVIDIA CUDA and driver"
if [[ $ID == debian ]]; then
sudo add-apt-repository contrib
fi
sudo apt update
sudo apt -y install cuda
;;
esac
echo "Installing NVIDIA driver version $DRIVER_VERSION"
curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run

sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent

# check NVIDIA driver installation succeeded
nvidia-smi

sudo apt-get -y install wget

wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb

# Install DCGM
sudo apt-get update
sudo apt-get install -y datacenter-gpu-manager
sudo service nvidia-dcgm start
sudo systemctl --now enable nvidia-dcgm

# check DCGM service running and load profiling module
dcgmi discovery --list
dcgmi profile --resume
Original file line number Diff line number Diff line change
Expand Up @@ -23,28 +23,25 @@ configure_integration: |-
You must install DCGM and run the DCGM daemon service.
supported_operating_systems: linux
supported_app_version: ["3.1"]
platforms_to_skip:
- centos-7
- centos-8
- debian-10
- debian-11
- debian-11-arm64
- debian-12
- rocky-linux-8
- rocky-linux-9
- rocky-linux-9-arm64
- sles-12
- sles-15
- sles-15-arm64
- ubuntu-2004-lts-arm64
- ubuntu-2204-lts
- ubuntu-2204-lts-arm64
- ubuntu-2304-amd64
gpu_models: # p4, k80, p100 don't support DCGM profiling metrics
- a100
- v100
- t4
- l4
gpu_platforms: # p4, k80, p100 don't emit DCGM profiling metrics
- model: a100
platforms:
- ubuntu-2004-lts
- model: v100
platforms:
- ubuntu-2004-lts
- model: t4
platforms:
- ubuntu-2004-lts
- model: l4
platforms:
- centos-7
- debian-11
- rocky-linux-8
- rocky-linux-9
- sles-15
- ubuntu-2004-lts
- ubuntu-2204-lts
expected_metrics:
- type: workload.googleapis.com/dcgm.gpu.profiling.sm_utilization
value_type: DOUBLE
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Installs the NVIDIA driver, CUDA and DCGM with zypper (runfiles for the
# Tesla K80), then checks that the driver and the DCGM service respond.
set -e

sudo zypper --non-interactive install -y kernel-source pciutils gcc make wget

# CUDA and the driver are installed the same way as in the nvml app: the
# package manager is preferred because it is normally faster and fails less
# often; the runfile method is the fallback when the packaged version does not
# work or is not compatible with the GPU model.
DEVICE_CODE=$(/sbin/lspci -n | grep -Po '10de:[\w\d]{4}')
DISTRIBUTION=$(. /etc/os-release; echo "$ID$VERSION_ID" | sed -e 's/\.[0-9]//')

# The NVIDIA repo is needed for both the CUDA and the DCGM packages.
CUDA_REPO_URL="http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo"
sudo zypper --non-interactive ar "${CUDA_REPO_URL}"
sudo zypper --gpg-auto-import-keys --non-interactive refresh

if [ "$DEVICE_CODE" = "10de:102d" ]; then
  # NVIDIA Tesla K80: pin the versions, R470 is the last supported driver branch
  DRIVER_VERSION=470.82.01
  CUDA_VERSION=11.4.4
  DRIVER_RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
  CUDA_RUNFILE="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
  echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
  curl -fSsl -O "https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/${DRIVER_RUNFILE}"
  sudo bash "./${DRIVER_RUNFILE}" --silent
  wget --no-verbose "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_RUNFILE}"
  sudo sh "${CUDA_RUNFILE}" --toolkit --silent
else
  echo "Installing latest version of NVIDIA CUDA and driver"
  sudo zypper --non-interactive install -y cuda
fi

# Verify that the NVIDIA driver installation succeeded
nvidia-smi

# Install DCGM and start its service
sudo zypper --non-interactive install datacenter-gpu-manager
sudo systemctl --now enable nvidia-dcgm

# Verify that the DCGM service is running and load the profiling module
dcgmi discovery --list
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Installs the NVIDIA driver and the CUDA toolkit from NVIDIA's runfile
# installers, then checks that the driver responds.
set -e

KERNEL_VERSION=$(uname -r)
sudo yum install -y "kernel-devel-${KERNEL_VERSION}" pciutils gcc make wget yum-utils

# CUDA is installed together with the driver because the `exercise` script
# runs a CUDA sample app to generate GPU process metrics.
# Installing from the package manager is normally preferred (faster, fewer
# installation errors), with the runfile method as the fallback when the
# packaged version does not work or is not compatible with the GPU model;
# this script uses the runfile method.
DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
if [ "$DEVICE_CODE" = "10de:102d" ]; then
  # NVIDIA Tesla K80: pin the versions, R470 is the last supported driver branch
  DRIVER_VERSION=470.82.01
  CUDA_VERSION=11.4.4
else
  # Versions used for every other GPU model
  DRIVER_VERSION=535.104.05
  CUDA_VERSION=12.2.2
fi

DRIVER_RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
CUDA_RUNFILE="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"

echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
curl -fSsl -O "https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/${DRIVER_RUNFILE}"
sudo bash "./${DRIVER_RUNFILE}" --silent
wget --no-verbose "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_RUNFILE}"
sudo sh "${CUDA_RUNFILE}" --toolkit --silent

# Verify that the NVIDIA driver installation succeeded
nvidia-smi
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
# Installs the application
set -e
source /etc/os-release

sudo apt update
kernel_version=`uname -r`
sudo apt install -y linux-headers-${kernel_version} software-properties-common pciutils gcc make dkms wget
KERNEL_VERSION=`uname -r`
sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget

# Install CUDA and driver together, since the `exercise` script needs to run a CUDA sample app to generate GPU process metrics
# Install CUDA and driver together, since the `exercise` script needs to run a
# CUDA sample app to generate GPU process metrics
# Prefer to install from the package manager since it is normally faster and has
# fewer errors on installation; fall back to the runfile method if the package
# manager's package is not working or not compatible with the GPU model
DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
case $DEVICE_CODE in
10de:102d)
# Install a specific version for NVIDIA Tesla K80, R470 is the last supported version
Expand All @@ -20,8 +25,11 @@ case $DEVICE_CODE in
;;
*)
echo "Installing latest version of NVIDIA CUDA and driver"
wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb
if [[ $ID == debian ]]; then
sudo add-apt-repository contrib
fi
sudo apt update
sudo apt -y install cuda
;;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@

# A back up of the existing file is required as part of the test framework check.
sudo cp /etc/google-cloud-ops-agent/config.yaml /etc/google-cloud-ops-agent/config.yaml.bak

# Sleep to wait for agent to fully start before running "exercise" script
sleep 60
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,35 @@ configure_integration: |-
You must install the NVIDIA driver on a host with NVIDIA GPUs.
supported_operating_systems: linux
supported_app_version: ["515.65.01"]
platforms_to_skip:
- centos-7
- centos-8
- debian-10
- debian-11
- debian-11-arm64
- debian-12
- rocky-linux-8
- rocky-linux-9
- rocky-linux-9-arm64
- sles-12
- sles-15
- sles-15-arm64
- ubuntu-2004-lts-arm64
- ubuntu-2204-lts
- ubuntu-2204-lts-arm64
- ubuntu-2304-amd64
gpu_models:
- a100
- v100
- p4
- t4
- p100
- k80
- l4
gpu_platforms:
- model: a100
platforms:
- ubuntu-2004-lts
- model: v100
platforms:
- ubuntu-2004-lts
- model: p4
platforms:
- ubuntu-2004-lts
- model: t4
platforms:
- ubuntu-2004-lts
- model: p100
platforms:
- ubuntu-2004-lts
- model: k80
platforms:
- ubuntu-2004-lts
- model: l4
platforms:
- centos-7
- debian-10
- debian-11
- rocky-linux-8
- rocky-linux-9
- sles-15
- ubuntu-2004-lts
- ubuntu-2204-lts
expected_metrics:
- type: agent.googleapis.com/gpu/utilization
value_type: DOUBLE
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Installs the NVIDIA driver and CUDA with zypper (runfiles for the Tesla
# K80), then checks that the driver responds.
set -e

sudo zypper --non-interactive install -y kernel-source pciutils gcc make wget

# CUDA is installed together with the driver because the `exercise` script
# runs a CUDA sample app to generate GPU process metrics.
# The package manager is preferred because it is normally faster and fails
# less often; the runfile method is the fallback when the packaged version
# does not work or is not compatible with the GPU model.
DEVICE_CODE=$(/sbin/lspci -n | grep -Po '10de:[\w\d]{4}')
DISTRIBUTION=$(. /etc/os-release; echo "$ID$VERSION_ID" | sed -e 's/\.[0-9]//')

if [ "$DEVICE_CODE" = "10de:102d" ]; then
  # NVIDIA Tesla K80: pin the versions, R470 is the last supported driver branch
  DRIVER_VERSION=470.82.01
  CUDA_VERSION=11.4.4
  DRIVER_RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
  CUDA_RUNFILE="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
  echo "Installing NVIDIA CUDA $CUDA_VERSION with driver $DRIVER_VERSION"
  curl -fSsl -O "https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/${DRIVER_RUNFILE}"
  sudo bash "./${DRIVER_RUNFILE}" --silent
  wget --no-verbose "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_RUNFILE}"
  sudo sh "${CUDA_RUNFILE}" --toolkit --silent
else
  echo "Installing latest version of NVIDIA CUDA and driver"
  CUDA_REPO_URL="http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo"
  sudo zypper --non-interactive ar "${CUDA_REPO_URL}"
  sudo zypper --gpg-auto-import-keys --non-interactive refresh
  sudo zypper --non-interactive install -y cuda
fi

# Verify that the NVIDIA driver installation succeeded
nvidia-smi

0 comments on commit f1bbc18

Please sign in to comment.