diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 2b33f53f7..0dc3ef9e3 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -17,12 +17,13 @@ usage() {
     echo " --dtype Batch size, defaults to bfloat16."
     echo " --enable-te If set, will run with env var ENABLE_TE=1."
     echo " --enable-dropout If set, will set DROPOUT_PROB to 0.1."
-    echo " --disable-fused-attn Whether disable TE fused attention."
     echo " --model-type One of 126M, 5B, LLaMA70BProxy. Defaults to 126M"
     echo " --evaluate Whether to test evaluation rather than training."
     echo " -s, --steps Number of steps to run, defaults to 500."
     echo " --multiprocess Enable the multiprocess GPU mode."
     echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified."
+    echo " --save-hlo {0, 1} 1 to keep the dumped HLO, 0 to remove the HLO dump folder; defaults to 0."
+    echo " --enable-fmha {0, 1} 1 to enable FMHA testing, 0 to run the test without FMHA; defaults to 1."
     echo " --data-parallel Data parallelism to use. Defaults to 1."
     echo " --fsdp Fully-sharded data parallelism to use. Defaults to 1."
     echo " --tensor-parallel Tensor parallelism to use. Defaults to 1."
@@ -32,7 +33,8 @@ usage() {
     exit $1
 }
 
-args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,disable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,model-type:,enable-fmha:,evaluate,steps:,help,multiprocess,output:,save-hlo:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+
 if [[ $? -ne 0 ]]; then
     exit $1
 fi
@@ -55,6 +57,8 @@
 NVTE_FUSED_ATTN=1
 DROPOUT=0
 EVALUATE=0
 ADDITIONAL_ARGS=""
+ENABLE_FMHA=${ENABLE_FMHA:-1}
+SAVE_HLO=${SAVE_HLO:-0}
 eval set -- "$args"
 while [ : ]; do
@@ -75,14 +79,15 @@ while [ : ]; do
         ENABLE_TE=1
         shift 1
         ;;
+    --enable-fmha)
+        ENABLE_FMHA="$2"
+        NVTE_FUSED_ATTN="$2"
+        shift 2
+        ;;
     --enable-dropout)
         DROPOUT='0.1'
         shift 1
         ;;
-    --disable-fused-attn)
-        NVTE_FUSED_ATTN=0
-        shift 1
-        ;;
     --model-type)
         MODEL_TYPE=$2
         shift 2
         ;;
@@ -103,6 +108,10 @@ while [ : ]; do
         OUTPUT=$2
         shift 2
         ;;
+    --save-hlo)
+        SAVE_HLO="$2"
+        shift 2
+        ;;
     --data-parallel)
         DP="$2"
         shift 2
@@ -136,6 +145,21 @@ while [ : ]; do
     esac
 done
 
+# Set the HLO dump folder after the output folder is set.
+HLO_DIR=${OUTPUT}/hlo
+export BASE_XLA_FLAGS="${BASE_XLA_FLAGS:---xla_dump_hlo_as_text --xla_dump_to=${HLO_DIR}}"
+export XLA_FLAGS="${BASE_XLA_FLAGS} ${XLA_FLAGS:-}"
+echo "HLO will be dumped to ${HLO_DIR}."
+
+## Set the XLA flags for FMHA.
+if [[ "$ENABLE_FMHA" -eq "1" ]]; then
+    echo "Setting XLA FMHA flags"
+    export BASE_XLA_FLAGS_FMHA="${BASE_XLA_FLAGS_FMHA:---xla_gpu_fused_attention_use_cudnn_rng=true --xla_gpu_enable_cudnn_fmha=true}"
+    export XLA_FLAGS="${BASE_XLA_FLAGS_FMHA} ${XLA_FLAGS:-}"
+fi
+
+echo "XLA FLAGS: $XLA_FLAGS"
+
 #
 # Set derived variables
 GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
@@ -149,8 +173,10 @@ print_var NGPUS
 print_var OUTPUT
 print_var MULTIPROCESS
 print_var ENABLE_TE
+print_var ENABLE_FMHA
 print_var NVTE_FUSED_ATTN
 print_var EVALUATE
+print_var SAVE_HLO
 print_var DROPOUT
 print_var DP
 print_var FSDP
@@ -422,5 +448,25 @@ else
     $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
 fi
 
+echo "Checking for FMHA instructions in HLO!"
+
+if [[ "$ENABLE_FMHA" -eq "1" ]]; then
+    ## Check whether FMHA instructions are present in the dumped HLO files.
+    fmha_regex="fmha(-bmm)?(-scale)?(-bias)?(-mask)?(-softmax)?(-dropout)?(-bmm)?(-backward)?"
+    result=$(grep -irlnE "$fmha_regex" "${HLO_DIR}/"*.txt)
+
+    if [ -z "$result" ]; then
+        echo "E: No FMHA instructions were found in the HLO files!"
+        exit 1
+    else
+        echo -e "Found FMHA instructions in the following HLO files: \n $result"
+    fi
+fi
+
+if [[ $SAVE_HLO -eq 0 ]]; then
+    rm -rf "${HLO_DIR}"
+    echo "Removed the dumped HLO directory."
+fi
+
 set +x
 echo "Output at ${OUTPUT}"
diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 264777e15..72ce4b29c 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -252,7 +252,7 @@ jobs:
           - TEST_NAME: 5B_fused_attn_0
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 2
-            ADDITIONAL_ARGS: "--model-type 5B --disable-fused-attn"
+            ADDITIONAL_ARGS: "--model-type 5B --enable-fmha 0"
           - TEST_NAME: LLaMA_eval_TE
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 4
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index c07749b12..fe0ebbaba 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -30,12 +30,18 @@ on:
 
 jobs:
 
-  single-process-multi-device:
+  pax-single-process-multi-device:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
-        - [1, 1, 2, 4]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP
+            PARALLEL_CONFIG: [1, 8, 1, 1]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: "--save-hlo 1"
+          - TEST_NAME: 1DP2FSDP4TP1PP
+            PARALLEL_CONFIG: [1, 1, 2, 4]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: ""
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -67,7 +73,7 @@ jobs:
        shell: bash -x -e {0}
        run: |
          IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-         TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process
+         TEST_CASE_NAME=${{ matrix.TEST_NAME }}_single_process
          MAX_GPUS_PER_NODE=8
          NODES=1
          GPUS_PER_NODE=8
@@ -112,13 +118,14 @@ jobs:
             test-pax.sh \
                 --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \
                 --dtype bfloat16 \
-                --batch-per-gpu 4 \
+                --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
                 --steps 500 \
                 --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
                 --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
                 --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
                 --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
-                --nodes ${{ steps.meta.outputs.NODES }}
+                --nodes ${{ steps.meta.outputs.NODES }} \
+                ${{ matrix.ADDITIONAL_ARGS }}
             EOF
         )
@@ -193,7 +200,7 @@ jobs:
           - TEST_NAME: 2DP1FSDP1TP4PP
             PARALLEL_CONFIG: [4, 2, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--save-hlo 1"
           - TEST_NAME: 4DP1FSDP2TP1PP
             PARALLEL_CONFIG: [1, 4, 1, 2]
             BATCH_SIZE: 4
@@ -201,7 +208,7 @@ jobs:
           - TEST_NAME: 16DP1FSDP1TP1PP
             PARALLEL_CONFIG: [1, 16, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--save-hlo 1"
           - TEST_NAME: 2DP1FSDP2TP4PP
             PARALLEL_CONFIG: [4, 2, 1, 2]
             BATCH_SIZE: 4
@@ -209,7 +216,7 @@ jobs:
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 4
             EVALUATE: true
-            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
+            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -354,7 +361,7 @@ jobs:
           path: |
             output/*
 
-  single-process-evaluation:
+  pax-single-process-evaluation:
     strategy:
       matrix:
        PARALLEL_CONFIG:
@@ -503,7 +510,7 @@ jobs:
 
   metrics:
     name: test-upstream-pax-metrics
-    needs: [single-process-multi-device, pax-multi-node, single-process-evaluation]
+    needs: [pax-single-process-multi-device, pax-multi-node, pax-single-process-evaluation]
     runs-on: ubuntu-22.04
 
     steps:
@@ -549,7 +556,7 @@ jobs:
   summary:
     name: test-upstream-pax-summary
     runs-on: ubuntu-22.04
-    needs: [single-process-multi-device, pax-multi-node, single-process-evaluation]
+    needs: [pax-single-process-multi-device, pax-multi-node, pax-single-process-evaluation]
     if: "!cancelled()"
     steps:
      - name: Generate TensorBoard query URL