From de0bb8cf832d9cf4e2b4420a089d00f3569a05cd Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Fri, 19 Aug 2022 11:35:04 -0700 Subject: [PATCH 01/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index bdfada907cac9..d0085cf7a7c77 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -22,7 +22,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: habana-gaudi-hpus container: - image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" + image: "vault.habana.ai/gaudi-docker/1.6.0/ubuntu20.04/habanalabs/pytorch-installer-1.12.0:latest" options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" workspace: clean: all From 368505efba25d01e44843d59f99601e1e62bac46 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 14:24:38 -0700 Subject: [PATCH 02/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index d0085cf7a7c77..d37abbd9d87bc 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -65,6 +65,7 @@ jobs: - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml + cat hpu8_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' From ed12de1f539b92c03682b3894fd948985ece7506 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 15:08:09 -0700 Subject: [PATCH 03/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index d37abbd9d87bc..be645b0406e71 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -65,7 +65,7 @@ jobs: - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml - cat hpu8_test-results.xml + cat /usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' From 4b43b0f6105b8f00fb23dce557c8c06011be0a5d Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 15:19:18 -0700 Subject: [PATCH 04/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index be645b0406e71..784673879d8e3 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -57,17 +57,16 @@ jobs: hl-smi -L lsmod | grep habanalabs displayName: 'Check the driver status' - - - bash: | - python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'Single card HPU test' - + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml - cat /usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' + + - bash: | + python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml + workingDirectory: tests/tests_pytorch + displayName: 'Single card HPU test' - bash: | python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ From 063010eff0813eca5ab1e84d6001ed14d1c6fd77 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 21 Aug 2022 22:20:47 +0000 Subject: [PATCH 05/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/hpu-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 784673879d8e3..9fa7a0b3ae2d1 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -57,12 +57,12 @@ jobs: hl-smi -L lsmod | grep habanalabs displayName: 'Check the driver status' - + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' - + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch From 4e154470a6683f78f99f8c9e6bcd135aedb4131c Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 19:27:12 -0700 Subject: [PATCH 06/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 9fa7a0b3ae2d1..0a82f679031fb 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -59,7 +59,7 @@ jobs: displayName: 'Check the driver status' - bash: | - python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml + python -m pytest -v accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' From 0705601cbc2a5e856a732fe0063da27396881e8e Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 19:49:45 -0700 Subject: [PATCH 07/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 0a82f679031fb..7a574dfa7d0e5 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -55,6 +55,7 @@ jobs: - bash: | hl-smi -L + ls /dev/hl* lsmod | grep habanalabs displayName: 'Check the driver status' From 3aaa345d9d1a472116600c63d2ac3d499dc9bfdf Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 20:36:33 -0700 Subject: [PATCH 08/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 7a574dfa7d0e5..2b3af30156302 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -33,6 +33,18 @@ jobs: sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" displayName: 'Install Sudo in container (thanks Microsoft!)' + - bash: | + curl -X GET https://vault.habana.ai/artifactory/api/gpg/key/public | sudo apt-key add - + echo 'deb https://vault.habana.ai/artifactory/debian focal main' | sudo tee /etc/apt/sources.list.d/artifactory.list + sudo dpkg --configure -a + sudo apt-get update + apt install - y linux-headers-5.4.0-90-generic + sudo apt-get --reinstall install -y habanalabs-firmware + sudo apt-get --reinstall install -y habanalabs-dkms + sudo modprobe -r habanalabs_en + sudo modprobe -r habanalabs + displayName: 'update driver' + - bash: | sudo apt-get install -y hwinfo hwinfo --short From 97c3ee12550161cd27ed601ce847e72a5800d4a1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Aug 2022 03:38:10 +0000 Subject: [PATCH 09/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/hpu-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 2b3af30156302..25761f37b220c 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -42,9 +42,9 @@ jobs: sudo apt-get --reinstall install -y habanalabs-firmware sudo apt-get --reinstall install -y habanalabs-dkms sudo modprobe -r habanalabs_en - sudo modprobe -r habanalabs + sudo modprobe -r habanalabs displayName: 'update driver' - + - bash: | sudo apt-get install -y hwinfo hwinfo --short From f1dbdf605dd6b7a89eb5f3bce23c1b7b88f017c5 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 20:39:16 -0700 Subject: [PATCH 10/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 25761f37b220c..24c261e5664a7 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -38,7 +38,7 @@ jobs: echo 'deb https://vault.habana.ai/artifactory/debian focal main' | sudo tee /etc/apt/sources.list.d/artifactory.list sudo dpkg --configure -a sudo apt-get update - apt install - y linux-headers-5.4.0-90-generic + sudo apt-get --reinstall install -y linux-headers-5.4.0-90-generic sudo apt-get --reinstall install -y habanalabs-firmware sudo apt-get --reinstall install -y habanalabs-dkms sudo modprobe -r habanalabs_en From 0a143479fe90c4266c7cec21b5fefc63eef40cda Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 20:42:43 -0700 Subject: [PATCH 11/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 24c261e5664a7..bbf5d8ae6c313 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -38,7 +38,7 @@ jobs: echo 'deb https://vault.habana.ai/artifactory/debian focal main' | sudo tee /etc/apt/sources.list.d/artifactory.list sudo dpkg --configure -a sudo apt-get update - sudo apt-get --reinstall install -y linux-headers-5.4.0-90-generic + sudo apt-get --reinstall install -y linux-headers-5.13.0-44-generic sudo apt-get --reinstall install -y habanalabs-firmware sudo apt-get --reinstall install -y habanalabs-dkms sudo modprobe -r habanalabs_en From 7d4fbb31c80885a498b32960ae006f9ccc8f31dd Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 20:47:56 -0700 Subject: [PATCH 12/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index bbf5d8ae6c313..4c09f8504f6a9 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -41,8 +41,6 @@ jobs: sudo apt-get --reinstall install -y linux-headers-5.13.0-44-generic sudo apt-get --reinstall install -y habanalabs-firmware sudo apt-get --reinstall install -y habanalabs-dkms - sudo modprobe -r habanalabs_en - sudo modprobe -r habanalabs displayName: 'update driver' - bash: | From 6891a4ab84685f34b8a3a2d01f687a0bc51a4335 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 21:00:27 -0700 Subject: [PATCH 13/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 4c09f8504f6a9..bbf5d8ae6c313 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -41,6 +41,8 @@ jobs: sudo apt-get --reinstall install -y linux-headers-5.13.0-44-generic sudo apt-get --reinstall install -y habanalabs-firmware sudo apt-get --reinstall install -y habanalabs-dkms + sudo modprobe -r habanalabs_en + sudo modprobe -r habanalabs displayName: 'update driver' - bash: | From e4c78e7665b9bdf1f1215e13b49fdca32c91f74b Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 22:17:33 -0700 Subject: [PATCH 14/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index bbf5d8ae6c313..87b9cd0c63660 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -32,19 +32,7 @@ jobs: /tmp/docker exec -t -u 0 cd-container \ sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" displayName: 'Install Sudo in container (thanks Microsoft!)' - - - bash: | - curl -X GET https://vault.habana.ai/artifactory/api/gpg/key/public | sudo apt-key add - - echo 'deb https://vault.habana.ai/artifactory/debian focal main' | sudo tee /etc/apt/sources.list.d/artifactory.list - sudo dpkg --configure -a - sudo apt-get update - sudo apt-get --reinstall install -y linux-headers-5.13.0-44-generic - sudo apt-get --reinstall install -y habanalabs-firmware - sudo apt-get --reinstall install -y habanalabs-dkms - sudo modprobe -r habanalabs_en - sudo modprobe -r habanalabs - displayName: 'update driver' - + - bash: | sudo apt-get install -y hwinfo hwinfo --short From 44313d7744465004ffab610026f08d284453d581 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Aug 2022 05:19:05 +0000 Subject: [PATCH 15/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 87b9cd0c63660..7a574dfa7d0e5 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -32,7 +32,7 @@ jobs: /tmp/docker exec -t -u 0 cd-container \ sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" displayName: 'Install Sudo in container (thanks Microsoft!)' - + - bash: | sudo apt-get install -y hwinfo hwinfo --short From 6541afe3044ad8438ae06a2b00c149a507ce62fa Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 22:20:59 -0700 Subject: [PATCH 16/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 7a574dfa7d0e5..0a82f679031fb 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -55,7 +55,6 @@ jobs: - bash: | hl-smi -L - ls /dev/hl* lsmod | grep habanalabs displayName: 'Check the driver status' From 134b0161763d92d89f45dc18432ccfeea81811e0 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Sun, 21 Aug 2022 22:38:27 -0700 Subject: [PATCH 17/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 0a82f679031fb..835d8f8cb2457 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -58,11 +58,6 @@ jobs: lsmod | grep habanalabs displayName: 'Check the driver status' - - bash: | - python -m pytest -v accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'Multi card(8) HPU test' - - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch From d8349c83cc7ce85820a22ee1369cfcb130027463 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Wed, 24 Aug 2022 19:38:15 -0700 Subject: [PATCH 18/18] Update hpu-tests.yml --- .azure/hpu-tests.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 835d8f8cb2457..3c8c90ec9ec7e 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -62,6 +62,11 @@ jobs: python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Single card HPU test' + + - bash: | + python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml + workingDirectory: tests/tests_pytorch + displayName: 'Multi card(8) HPU test' - bash: | python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \