From e30d1348d76b48ac077691667b0ae6afd78a449a Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 11:49:08 -0700 Subject: [PATCH 01/34] Check the driver status on gaudi server --- .azure/debug-hpu-tests.yml | 61 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 .azure/debug-hpu-tests.yml diff --git a/.azure/debug-hpu-tests.yml b/.azure/debug-hpu-tests.yml new file mode 100644 index 0000000000000..caf95e3f4730e --- /dev/null +++ b/.azure/debug-hpu-tests.yml @@ -0,0 +1,61 @@ +# Pipeline to run the HPU tests in DL1 Instance + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" + +pr: + - "master" + - "release/*" + +jobs: + - job: testing + # how long to run the job before automatically cancelling + timeoutInMinutes: "10" + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + pool: habana-gaudi-hpus + container: + image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" + options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" + workspace: + clean: all + + steps: + + - script: | + /tmp/docker exec -t -u 0 cd-container \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + displayName: 'Install Sudo in container (thanks Microsoft!)' + - bash: | + sudo apt-get install -y hwinfo + hwinfo --short + python --version + sudo pip install pip -U + displayName: 'Instance HW info' + - bash: | + set -e + pip --version + sudo pip uninstall -y lightning pytorch-lightning + pip install fire + python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext + pip install ".[extra,test]" + pip list + env: + PACKAGE_NAME: pytorch + FREEZE_REQUIREMENTS: 1 + displayName: 'Install dependencies' + - bash: | + lspci -d 1da3: + LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12); if [ -z ${LKD} ]; then dmesg -T | tail -1; else printf "Cannot Unload LKD we need to kill the process $LKD\n" + displayName: 'Check the driver status' + - bash: | + python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml + workingDirectory: tests/tests_pytorch + displayName: 'Single card HPU test' From a943f3ba125bec7c95bc69fe74c48c6ae37157e7 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 11:56:24 -0700 Subject: [PATCH 02/34] Debug the driver status on gaudi server --- .azure/hpu-tests.yml | 81 ++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 55 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index a6b00b065cc90..50adecaf05813 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -14,81 +14,52 @@ pr: - "master" - "release/*" -variables: - - name: continue - value: '1' - jobs: - job: testing # how long to run the job before automatically cancelling timeoutInMinutes: "10" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: intel-hpus + pool: habana-gaudi-hpus + container: + image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" + options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" workspace: clean: all steps: - - bash: | - CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo $CHANGED_FILES > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "##vso[task.setvariable variable=continue]0" - else - echo "Continue" - echo "##vso[task.setvariable variable=continue]1" - fi - displayName: Skipper + - script: | + /tmp/docker exec -t -u 0 cd-container \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + displayName: 'Install Sudo in container (thanks Microsoft!)' + - bash: | - apt-get install -y hwinfo + sudo apt-get install -y hwinfo hwinfo --short + python --version + sudo pip install pip -U displayName: 'Instance HW info' - condition: eq(variables['continue'], '1') - + - bash: | - pip install -e .[extra] -r requirements/pytorch/test.txt + set -e + pip --version + sudo pip uninstall -y lightning pytorch-lightning + pip install fire + python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext + pip install ".[extra,test]" + pip list env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' - condition: eq(variables['continue'], '1') - + + - bash: | + lspci -d 1da3: + LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12); if [ -z ${LKD} ]; then dmesg -T | tail -1; else printf "Cannot Unload LKD we need to kill the process $LKD\n" + displayName: 'Check the driver status' + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Single card HPU test' - condition: eq(variables['continue'], '1') - - - bash: | - python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'Multi card(8) HPU test' - condition: eq(variables['continue'], '1') - - - bash: | - python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ - 'plugins/precision/hpu/ops_bf16.txt' --hmp-fp32 \ - 'plugins/precision/hpu/ops_fp32.txt' --forked \ - --junitxml=hpu1_precision_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'HPU precision test' - condition: eq(variables['continue'], '1') - - - bash: | - export PYTHONPATH="${PYTHONPATH}:$(pwd)" - python "pl_hpu/mnist_sample.py" - workingDirectory: examples - displayName: 'Testing: HPU examples' - condition: eq(variables['continue'], '1') - - - task: PublishTestResults@2 - inputs: - testResultsFiles: 'hpu*_test-results.xml' - testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: and(succeededOrFailed(), eq(variables['continue'], '1')) - displayName: 'Publish test results' From 047b0538b45ab6bfd0a0318753ddc7a018738a51 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Jul 2022 18:58:40 +0000 Subject: [PATCH 03/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/hpu-tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 50adecaf05813..6b1d01797fa92 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -33,14 +33,14 @@ jobs: /tmp/docker exec -t -u 0 cd-container \ sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" displayName: 'Install Sudo in container (thanks Microsoft!)' - + - bash: | sudo apt-get install -y hwinfo hwinfo --short python --version sudo pip install pip -U displayName: 'Instance HW info' - + - bash: | set -e pip --version @@ -53,12 +53,12 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' - + - bash: | lspci -d 1da3: LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12); if [ -z ${LKD} ]; then dmesg -T | tail -1; else printf "Cannot Unload LKD we need to kill the process $LKD\n" displayName: 'Check the driver status' - + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch From d3d5d0f6940415dadef979b3f4bb705037a6fb5f Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 12:03:29 -0700 Subject: [PATCH 04/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 6b1d01797fa92..65d55b383ae65 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -56,7 +56,16 @@ jobs: - bash: | lspci -d 1da3: - LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12); if [ -z ${LKD} ]; then dmesg -T | tail -1; else printf "Cannot Unload LKD we need to kill the process $LKD\n" + hl-fw-loader -d 19:00.0 + hl-fw-loader -d 1a:00.0 + hl-fw-loader -d 33:00.0 + hl-fw-loader -d 34:00.0 + hl-fw-loader -d b3:00.0 + hl-fw-loader -d b4:00.0 + hl-fw-loader -d cc:00.0 + hl-fw-loader -d cd:00.0 + LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12) + echo "${LKD} is the return value of lkd command" displayName: 'Check the driver status' - bash: | From ac774c816878aa77bee01efbc3459714723bbdd7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Jul 2022 19:06:35 +0000 Subject: [PATCH 05/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/hpu-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 65d55b383ae65..2793e92598d80 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -60,9 +60,9 @@ jobs: hl-fw-loader -d 1a:00.0 hl-fw-loader -d 33:00.0 hl-fw-loader -d 34:00.0 - hl-fw-loader -d b3:00.0 + hl-fw-loader -d b3:00.0 hl-fw-loader -d b4:00.0 - hl-fw-loader -d cc:00.0 + hl-fw-loader -d cc:00.0 hl-fw-loader -d cd:00.0 LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12) echo "${LKD} is the return value of lkd command" From 1314e2d707f4ccf2b5ffbec92285ce4824bbb4e6 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 13:03:29 -0700 Subject: [PATCH 06/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 2793e92598d80..fe362e188dab8 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -56,6 +56,7 @@ jobs: - bash: | lspci -d 1da3: + ls -al /usr/sbin/ hl-fw-loader -d 19:00.0 hl-fw-loader -d 1a:00.0 hl-fw-loader -d 33:00.0 From a1f1c40b8d82cfb2338c399e29b11d4771070ba9 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 13:06:12 -0700 Subject: [PATCH 07/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index fe362e188dab8..7ddfe4367edd4 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -67,6 +67,7 @@ jobs: hl-fw-loader -d cd:00.0 LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12) echo "${LKD} is the return value of lkd command" + sudo /usr/sbin/lsof | grep "dev/hl" | tr -s " " | cut -d" " -f2 | sort -u displayName: 'Check the driver status' - bash: | From f96a65b4a07af6eeba87c5a67cd0ce8ba60ed8ba Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 13:56:27 -0700 Subject: [PATCH 08/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 7ddfe4367edd4..b6b45a5070ec8 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -67,7 +67,8 @@ jobs: hl-fw-loader -d cd:00.0 LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12) echo "${LKD} is the return value of lkd command" - sudo /usr/sbin/lsof | grep "dev/hl" | tr -s " " | cut -d" " -f2 | sort -u + sudo lsof | grep "dev/hl" | tr -s " " | cut -d" " -f2 | sort -u + /usr/sbin/insmod habanalabs displayName: 'Check the driver status' - bash: | From c1933383866858c51c5a09badf0fde1aa6854924 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 14:06:04 -0700 Subject: [PATCH 09/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index b6b45a5070ec8..d16c7122f67b2 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -65,9 +65,11 @@ jobs: hl-fw-loader -d b4:00.0 hl-fw-loader -d cc:00.0 hl-fw-loader -d cd:00.0 + hl-smi -L LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12) echo "${LKD} is the return value of lkd command" - sudo lsof | grep "dev/hl" | tr -s " " | cut -d" " -f2 | sort -u + process=sudo lsof | grep "dev/hl" | tr -s " " | cut -d" " -f2 | sort -u + echo "${process} is the list of process" /usr/sbin/insmod habanalabs displayName: 'Check the driver status' From 8e614f8e274af03544a0e02c739c4520138252c4 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 14:21:09 -0700 Subject: [PATCH 10/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index d16c7122f67b2..aca893d907834 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -57,6 +57,7 @@ jobs: - bash: | lspci -d 1da3: ls -al /usr/sbin/ + which hl-fw-loader hl-fw-loader -d 19:00.0 hl-fw-loader -d 1a:00.0 hl-fw-loader -d 33:00.0 @@ -66,6 +67,7 @@ jobs: hl-fw-loader -d cc:00.0 hl-fw-loader -d cd:00.0 hl-smi -L + modinfo habanalabs LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12) echo "${LKD} is the return value of lkd command" process=sudo lsof | grep "dev/hl" | tr -s " " | cut -d" " -f2 | sort -u From fc9d3b7760a255a8183920ae0886f5213c80d2e0 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 19:26:19 -0700 Subject: [PATCH 11/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index aca893d907834..635e4d01f234f 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" + options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host workspace: clean: all From 6d61dbd8f6c5090794039e445267a2a195cb3b5a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Jul 2022 02:27:51 +0000 Subject: [PATCH 12/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 635e4d01f234f..97f790acfbfaa 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host + options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host workspace: clean: all From 2350e4306ee93858e3892f3d281adf806feaee90 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 19:35:44 -0700 Subject: [PATCH 13/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 97f790acfbfaa..c0fa6fb7611bd 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host + options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host" workspace: clean: all From e65687a15658a0395084633aee1489064017a378 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 19:44:34 -0700 Subject: [PATCH 14/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index c0fa6fb7611bd..ad5cafed773c1 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host" + options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" workspace: clean: all From c150e02d6d2193be3a72f56a6596264ea705affb Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 20:30:33 -0700 Subject: [PATCH 15/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index ad5cafed773c1..b17799d56ef99 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" + options: "--shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro " workspace: clean: all From 13372a401f1dc2430c392b2160bcfe15106c0ace Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 20:52:46 -0700 Subject: [PATCH 16/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index b17799d56ef99..6004258a54ac9 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -72,7 +72,6 @@ jobs: echo "${LKD} is the return value of lkd command" process=sudo lsof | grep "dev/hl" | tr -s " " | cut -d" " -f2 | sort -u echo "${process} is the list of process" - /usr/sbin/insmod habanalabs displayName: 'Check the driver status' - bash: | From 22dd548edc0bed6063c3abf4c9e8574a30f2ef8d Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 20:57:36 -0700 Subject: [PATCH 17/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 6004258a54ac9..45fac47c0a22d 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -78,3 +78,23 @@ jobs: python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Single card HPU test' + + - bash: | + python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml + workingDirectory: tests/tests_pytorch + displayName: 'Multi card(8) HPU test' + + - bash: | + python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ + 'plugins/precision/hpu/ops_bf16.txt' --hmp-fp32 \ + 'plugins/precision/hpu/ops_fp32.txt' --forked \ + --junitxml=hpu1_precision_test-results.xml + workingDirectory: tests/tests_pytorch + displayName: 'HPU precision test' + + - task: PublishTestResults@2 + inputs: + testResultsFiles: 'hpu*_test-results.xml' + testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + condition: and(succeededOrFailed(), eq(variables['continue'], '1')) + displayName: 'Publish test results' From f9857f13ba9925e9f39abcc7b216e2ff8dcfa4f8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Jul 2022 03:59:10 +0000 Subject: [PATCH 18/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/hpu-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 45fac47c0a22d..6e07017e79434 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -78,12 +78,12 @@ jobs: python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Single card HPU test' - + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' - + - bash: | python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ 'plugins/precision/hpu/ops_bf16.txt' --hmp-fp32 \ @@ -91,7 +91,7 @@ jobs: --junitxml=hpu1_precision_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'HPU precision test' - + - task: PublishTestResults@2 inputs: testResultsFiles: 'hpu*_test-results.xml' From 24751bfea0a6326566ca6127599b52a685986e66 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 22:21:06 -0700 Subject: [PATCH 19/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 6e07017e79434..bb248f6393490 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -55,23 +55,7 @@ jobs: displayName: 'Install dependencies' - bash: | - lspci -d 1da3: - ls -al /usr/sbin/ - which hl-fw-loader - hl-fw-loader -d 19:00.0 - hl-fw-loader -d 1a:00.0 - hl-fw-loader -d 33:00.0 - hl-fw-loader -d 34:00.0 - hl-fw-loader -d b3:00.0 - hl-fw-loader -d b4:00.0 - hl-fw-loader -d cc:00.0 - hl-fw-loader -d cd:00.0 hl-smi -L - modinfo habanalabs - LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12) - echo "${LKD} is the return value of lkd command" - process=sudo lsof | grep "dev/hl" | tr -s " " | cut -d" " -f2 | sort -u - echo "${process} is the list of process" displayName: 'Check the driver status' - bash: | From ab8fc7533dec9536979df5ab357fb94899c6e313 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 22:26:50 -0700 Subject: [PATCH 20/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index bb248f6393490..dcd9483f60a1b 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -67,6 +67,7 @@ jobs: python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' + continueOnError: true - bash: | python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ From 08a181677252b75e61dfcd96b3481131cf1a79cb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Jul 2022 05:28:21 +0000 Subject: [PATCH 21/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index dcd9483f60a1b..6a86a0b894340 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -67,7 +67,7 @@ jobs: python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' - continueOnError: true + continueOnError: true - bash: | python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ From 2b9c02b775c7db68e6aae81890e2cffaa754c041 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Mon, 18 Jul 2022 22:38:24 -0700 Subject: [PATCH 22/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 6a86a0b894340..2f9f910700876 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -63,11 +63,6 @@ jobs: workingDirectory: tests/tests_pytorch displayName: 'Single card HPU test' - - bash: | - python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'Multi card(8) HPU test' - continueOnError: true - bash: | python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ From 921e5192675af1da5baa040b652b85f68ddf872d Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:19:52 -0700 Subject: [PATCH 23/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 2f9f910700876..5661a8b306e9e 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,8 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro " + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro" + #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 workspace: clean: all @@ -56,6 +57,7 @@ jobs: - bash: | hl-smi -L + sudo apt install -y habanalabs-firmware displayName: 'Check the driver status' - bash: | From 9b6b44e696f8cbec071f4848583ef700994db895 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:20:21 -0700 Subject: [PATCH 24/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 5661a8b306e9e..b1dfd1e2e508a 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -57,7 +57,7 @@ jobs: - bash: | hl-smi -L - sudo apt install -y habanalabs-firmware + ls -al /usr/sbin/ displayName: 'Check the driver status' - bash: | From 949c82c5a49b5cf72e5f7176c174bd3b0c34e017 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:25:54 -0700 Subject: [PATCH 25/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index b1dfd1e2e508a..6bf44f884c32f 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -25,6 +25,7 @@ jobs: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro" #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 + #https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare workspace: clean: all @@ -57,7 +58,7 @@ jobs: - bash: | hl-smi -L - ls -al /usr/sbin/ + lsmod | grep habanalabs displayName: 'Check the driver status' - bash: | From 03985d3fd3bb987d9c7fc03f72b1bebde835813e Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:32:56 -0700 Subject: [PATCH 26/34] Update debug-hpu-tests.yml --- .azure/debug-hpu-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.azure/debug-hpu-tests.yml b/.azure/debug-hpu-tests.yml index caf95e3f4730e..887c2f4802aaa 100644 --- a/.azure/debug-hpu-tests.yml +++ b/.azure/debug-hpu-tests.yml @@ -23,7 +23,10 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro" + #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 + #options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" + # From https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare workspace: clean: all From d0282d65f7e8dc87ea8b684d069fcaa09fadb145 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:33:39 -0700 Subject: [PATCH 27/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 6bf44f884c32f..7053fecf2bec6 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -20,7 +20,7 @@ jobs: timeoutInMinutes: "10" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: habana-gaudi-hpus + pool: intel-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro" From faa72dd3170c4ed159ee3dba2d32012fbd57ac5d Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:35:41 -0700 Subject: [PATCH 28/34] Delete debug-hpu-tests.yml --- .azure/debug-hpu-tests.yml | 64 -------------------------------------- 1 file changed, 64 deletions(-) delete mode 100644 .azure/debug-hpu-tests.yml diff --git a/.azure/debug-hpu-tests.yml b/.azure/debug-hpu-tests.yml deleted file mode 100644 index 887c2f4802aaa..0000000000000 --- a/.azure/debug-hpu-tests.yml +++ /dev/null @@ -1,64 +0,0 @@ -# Pipeline to run the HPU tests in DL1 Instance - -trigger: - tags: - include: - - '*' - branches: - include: - - "master" - - "release/*" - - "refs/tags/*" - -pr: - - "master" - - "release/*" - -jobs: - - job: testing - # how long to run the job before automatically cancelling - timeoutInMinutes: "10" - # how much time to give 'run always even if cancelled tasks' before stopping them - cancelTimeoutInMinutes: "2" - pool: habana-gaudi-hpus - container: - image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro" - #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 - #options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" - # From https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare - workspace: - clean: all - - steps: - - - script: | - /tmp/docker exec -t -u 0 cd-container \ - sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" - displayName: 'Install Sudo in container (thanks Microsoft!)' - - bash: | - sudo apt-get install -y hwinfo - hwinfo --short - python --version - sudo pip install pip -U - displayName: 'Instance HW info' - - bash: | - set -e - pip --version - sudo pip uninstall -y lightning pytorch-lightning - pip install fire - python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext - pip install ".[extra,test]" - pip list - env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 - displayName: 'Install dependencies' - - bash: | - lspci -d 1da3: - LKD=$(/usr/sbin/lsmod | grep habanalabs | cut -d " " -f12); if [ -z ${LKD} ]; then dmesg -T | tail -1; else printf "Cannot Unload LKD we need to kill the process $LKD\n" - displayName: 'Check the driver status' - - bash: | - python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'Single card HPU test' From b4d4338dc867d93170e06a3258db7e6b4ad3208f Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:40:11 -0700 Subject: [PATCH 29/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 1aeab9113c9c3..b0bd04341822e 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -20,7 +20,7 @@ jobs: timeoutInMinutes: "10" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: intel-hpus + pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro" From 20ef4a16422053e15c11cabc0f6a414d97e11d2f Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:41:50 -0700 Subject: [PATCH 30/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index b0bd04341822e..012cfc78fcfbc 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container --privileged -v /dev:/dev -v /usr/bin/docker:/tmp/docker:ro" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 #https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare workspace: From d3558f543c445793472a3470051f2acfd3fc558b Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:44:41 -0700 Subject: [PATCH 31/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 012cfc78fcfbc..d62cd6bf50cbc 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host" #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 #https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare workspace: From 46817d8c11d04590fbf046e1766d3b5beb44f613 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:46:00 -0700 Subject: [PATCH 32/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index d62cd6bf50cbc..acd25d4597f6f 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host" #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 #https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare workspace: From 9e2a407c569974043a51f7a173d3ae4e9c1cae5f Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:48:04 -0700 Subject: [PATCH 33/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index acd25d4597f6f..7179040fc13d5 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 #https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare workspace: From 3990492e43708c886268ef3483f30719bbedc777 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:54:23 -0700 Subject: [PATCH 34/34] Update hpu-tests.yml --- .azure/hpu-tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 7179040fc13d5..a0122fbe1703c 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -24,8 +24,7 @@ jobs: container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" - #docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:1.5.0-610 - #https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare + #From https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare workspace: clean: all