diff --git a/.github/container/test-pax.sh b/.github/container/test-pax.sh
index 2b33f53f7..0dc3ef9e3 100755
--- a/.github/container/test-pax.sh
+++ b/.github/container/test-pax.sh
@@ -17,12 +17,13 @@ usage() {
     echo " --dtype Batch size, defaults to bfloat16."
     echo " --enable-te If set, will run with env var ENABLE_TE=1."
     echo " --enable-dropout If set, will set DROPOUT_PROB to 0.1."
-    echo " --disable-fused-attn Whether disable TE fused attention."
     echo " --model-type One of 126M, 5B, LLaMA70BProxy. Defaults to 126M"
     echo " --evaluate Whether to test evaluation rather than training."
     echo " -s, --steps Number of steps to run, defaults to 500."
     echo " --multiprocess Enable the multiprocess GPU mode."
     echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified."
+    echo " --save-hlo {0, 1} 1 to keep the dumped HLO, 0 to remove the HLO dump folder; defaults to 0."
+    echo " --enable-fmha {0, 1} 1 to enable FMHA testing, 0 to run the test without FMHA; defaults to 1."
     echo " --data-parallel Data parallelism to use. Defaults to 1."
     echo " --fsdp Fully-sharded data parallelism to use. Defaults to 1."
     echo " --tensor-parallel Tensor parallelism to use. Defaults to 1."
@@ -32,7 +33,8 @@ usage() {
     exit $1
 }
 
-args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,disable-fused-attn,model-type:,evaluate,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-dropout,model-type:,enable-fmha:,evaluate,steps:,help,multiprocess,output:,save-hlo:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+
 if [[ $? -ne 0 ]]; then
     exit $1
 fi
@@ -55,6 +57,8 @@
 NVTE_FUSED_ATTN=1
 DROPOUT=0
 EVALUATE=0
 ADDITIONAL_ARGS=""
+ENABLE_FMHA=${ENABLE_FMHA:-1}
+SAVE_HLO=${SAVE_HLO:-0}
 eval set -- "$args"
 while [ : ]; do
@@ -75,14 +79,15 @@ while [ : ]; do
         ENABLE_TE=1
         shift 1
         ;;
+    --enable-fmha)
+        ENABLE_FMHA="$2"
+        NVTE_FUSED_ATTN="$2"
+        shift 2
+        ;;
     --enable-dropout)
         DROPOUT='0.1'
         shift 1
         ;;
-    --disable-fused-attn)
-        NVTE_FUSED_ATTN=0
-        shift 1
-        ;;
     --model-type)
         MODEL_TYPE=$2
         shift 2
         ;;
@@ -103,6 +108,10 @@ while [ : ]; do
         OUTPUT=$2
         shift 2
         ;;
+    --save-hlo)
+        SAVE_HLO="$2"
+        shift 2
+        ;;
     --data-parallel)
         DP="$2"
         shift 2
@@ -136,6 +145,21 @@ while [ : ]; do
     esac
 done
 
+# Set the HLO dump folder after the output folder is set.
+HLO_DIR=${OUTPUT}/hlo
+export BASE_XLA_FLAGS="${BASE_XLA_FLAGS:---xla_dump_hlo_as_text --xla_dump_to=${HLO_DIR}}"
+export XLA_FLAGS="${BASE_XLA_FLAGS} ${XLA_FLAGS:-}"
+echo "HLO will be dumped to ${HLO_DIR}."
+
+## Set the XLA flags for FMHA.
+if [[ "$ENABLE_FMHA" -eq "1" ]]; then
+    echo "Setting XLA FMHA flags"
+    export BASE_XLA_FLAGS_FMHA="${BASE_XLA_FLAGS_FMHA:---xla_gpu_fused_attention_use_cudnn_rng=true --xla_gpu_enable_cudnn_fmha=true}"
+    export XLA_FLAGS="${BASE_XLA_FLAGS_FMHA} ${XLA_FLAGS:-}"
+fi
+
+echo "XLA FLAGS: $XLA_FLAGS"
+
 #
 # Set derived variables
 GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
@@ -149,8 +173,10 @@ print_var NGPUS
 print_var OUTPUT
 print_var MULTIPROCESS
 print_var ENABLE_TE
+print_var ENABLE_FMHA
 print_var NVTE_FUSED_ATTN
 print_var EVALUATE
+print_var SAVE_HLO
 print_var DROPOUT
 print_var DP
 print_var FSDP
@@ -422,5 +448,25 @@ else
     $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)
 fi
 
+echo "Checking for FMHA instructions in HLO!"
+
+if [[ "$ENABLE_FMHA" -eq "1" ]]; then
+    ## Check whether FMHA instructions are present in the dumped HLO files.
+    fmha_regex="fmha(-bmm)?(-scale)?(-bias)?(-mask)?(-softmax)?(-dropout)?(-bmm)?(-backward)?"
+    result=$(grep -irlnE "$fmha_regex" "${HLO_DIR}/"*.txt)
+
+    if [ -z "$result" ]; then
+        echo "E: No FMHA instructions were found in the HLO files!"
+        exit 1
+    else
+        echo -e "Found FMHA instructions in the following HLO files: \n $result"
+    fi
+fi
+
+if [[ $SAVE_HLO -eq 0 ]]; then
+    rm -rf "${HLO_DIR}"
+    echo "Removed the dumped HLO directory."
+fi
+
 set +x
 echo "Output at ${OUTPUT}"
diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 264777e15..72ce4b29c 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -252,7 +252,7 @@ jobs:
           - TEST_NAME: 5B_fused_attn_0
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 2
-            ADDITIONAL_ARGS: "--model-type 5B --disable-fused-attn"
+            ADDITIONAL_ARGS: "--model-type 5B --enable-fmha 0"
           - TEST_NAME: LLaMA_eval_TE
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 4
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index c07749b12..fe0ebbaba 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -30,12 +30,18 @@ on:
 
 jobs:
 
-  single-process-multi-device:
+  pax-single-process-multi-device:
     strategy:
       matrix:
-        PARALLEL_CONFIG:
-        - [1, 8, 1, 1]
-        - [1, 1, 2, 4]
+        include:
+          - TEST_NAME: 8DP1FSDP1TP1PP
+            PARALLEL_CONFIG: [1, 8, 1, 1]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: "--save-hlo 1"
+          - TEST_NAME: 1DP2FSDP4TP1PP
+            PARALLEL_CONFIG: [1, 1, 2, 4]
+            BATCH_SIZE: 4
+            ADDITIONAL_ARGS: ""
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -67,7 +73,7 @@ jobs:
        shell: bash -x -e {0}
        run: |
          IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')"
-         TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process
+         TEST_CASE_NAME=${{ matrix.TEST_NAME }}_single_process
          MAX_GPUS_PER_NODE=8
          NODES=1
          GPUS_PER_NODE=8
@@ -112,13 +118,14 @@ jobs:
             test-pax.sh \
                 --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \
                 --dtype bfloat16 \
-                --batch-per-gpu 4 \
+                --batch-per-gpu ${{ matrix.BATCH_SIZE }} \
                 --steps 500 \
                 --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
                 --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \
                 --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \
                 --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \
-                --nodes ${{ steps.meta.outputs.NODES }}
+                --nodes ${{ steps.meta.outputs.NODES }} \
+                ${{ matrix.ADDITIONAL_ARGS }}
             EOF
         )
@@ -193,7 +200,7 @@ jobs:
           - TEST_NAME: 2DP1FSDP1TP4PP
             PARALLEL_CONFIG: [4, 2, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--save-hlo 1"
           - TEST_NAME: 4DP1FSDP2TP1PP
             PARALLEL_CONFIG: [1, 4, 1, 2]
             BATCH_SIZE: 4
@@ -201,7 +208,7 @@ jobs:
           - TEST_NAME: 16DP1FSDP1TP1PP
             PARALLEL_CONFIG: [1, 16, 1, 1]
             BATCH_SIZE: 4
-            ADDITIONAL_ARGS: ""
+            ADDITIONAL_ARGS: "--save-hlo 1"
           - TEST_NAME: 2DP1FSDP2TP4PP
             PARALLEL_CONFIG: [4, 2, 1, 2]
             BATCH_SIZE: 4
@@ -209,7 +216,7 @@ jobs:
             PARALLEL_CONFIG: [1, 1, 8, 1]
             BATCH_SIZE: 4
             EVALUATE: true
-            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
+            ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
       fail-fast: false
 
     runs-on: ubuntu-22.04
@@ -354,7 +361,7 @@ jobs:
           path: |
             output/*
 
-  single-process-evaluation:
+  pax-single-process-evaluation:
     strategy:
       matrix:
        PARALLEL_CONFIG:
@@ -503,7 +510,7 @@ jobs:
 
   metrics:
     name: test-upstream-pax-metrics
-    needs: [single-process-multi-device, pax-multi-node, single-process-evaluation]
+    needs: [pax-single-process-multi-device, pax-multi-node, pax-single-process-evaluation]
     runs-on: ubuntu-22.04
 
     steps:
@@ -549,7 +556,7 @@ jobs:
   summary:
     name: test-upstream-pax-summary
     runs-on: ubuntu-22.04
-    needs: [single-process-multi-device, pax-multi-node, single-process-evaluation]
+    needs: [pax-single-process-multi-device, pax-multi-node, pax-single-process-evaluation]
     if: "!cancelled()"
     steps:
      - name: Generate TensorBoard query URL